Diffstat (limited to 'test')
-rw-r--r--  test/Analysis/BasicAA/memset_pattern.ll  2
-rw-r--r--  test/Analysis/GlobalsModRef/argmemonly-escape.ll  47
-rw-r--r--  test/Analysis/GlobalsModRef/inaccessiblememonly.ll  21
-rw-r--r--  test/Analysis/GlobalsModRef/modreftest.ll  20
-rw-r--r--  test/Analysis/ValueTracking/known-power-of-two.ll  20
-rw-r--r--  test/Bitcode/compatibility.ll  12
-rw-r--r--  test/CodeGen/AArch64/arm64-vector-ext.ll  54
-rw-r--r--  test/CodeGen/AArch64/cpus.ll  1
-rw-r--r--  test/CodeGen/AArch64/remat.ll  1
-rw-r--r--  test/CodeGen/AArch64/tbz-tbnz.ll  103
-rw-r--r--  test/CodeGen/AMDGPU/flat-scratch-reg.ll  17
-rw-r--r--  test/CodeGen/AMDGPU/large-alloca-compute.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/large-alloca-graphics.ll  2
-rw-r--r--  test/CodeGen/AMDGPU/load.ll  140
-rw-r--r--  test/CodeGen/AMDGPU/salu-to-valu.ll  269
-rw-r--r--  test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll  24
-rw-r--r--  test/CodeGen/ARM/build-attributes.ll  36
-rw-r--r--  test/CodeGen/ARM/debugtrap.ll  34
-rw-r--r--  test/CodeGen/WebAssembly/offset.ll  198
-rw-r--r--  test/CodeGen/WinEH/wineh-cloning.ll  45
-rw-r--r--  test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll  8
-rw-r--r--  test/CodeGen/X86/2011-11-30-or.ll  14
-rw-r--r--  test/CodeGen/X86/avx-cast.ll  84
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics.ll  123
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics.ll  60
-rw-r--r--  test/CodeGen/X86/avx512bwvl-intrinsics.ll  123
-rw-r--r--  test/CodeGen/X86/avx512cd-intrinsics.ll  36
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics.ll  586
-rw-r--r--  test/CodeGen/X86/cmpxchg-clobber-flags.ll  23
-rw-r--r--  test/CodeGen/X86/copy-eflags.ll  54
-rw-r--r--  test/CodeGen/X86/divrem8_ext.ll  19
-rw-r--r--  test/CodeGen/X86/fold-load-unops.ll  94
-rw-r--r--  test/CodeGen/X86/fpcmp-soft-fp.ll  254
-rw-r--r--  test/CodeGen/X86/inline-sse.ll  4
-rw-r--r--  test/CodeGen/X86/insertelement-zero.ll  539
-rw-r--r--  test/CodeGen/X86/insertps-combine.ll  111
-rw-r--r--  test/CodeGen/X86/materialize-one.ll  100
-rw-r--r--  test/CodeGen/X86/materialize.ll  184
-rw-r--r--  test/CodeGen/X86/peephole-na-phys-copy-folding.ll  7
-rw-r--r--  test/CodeGen/X86/pku.ll  25
-rw-r--r--  test/CodeGen/X86/powi.ll  4
-rw-r--r--  test/CodeGen/X86/pr11415.ll  8
-rw-r--r--  test/CodeGen/X86/pr21792.ll  82
-rw-r--r--  test/CodeGen/X86/pr24139.ll  296
-rw-r--r--  test/CodeGen/X86/sse3-avx-addsub.ll  59
-rw-r--r--  test/CodeGen/X86/statepoint-far-call.ll  44
-rw-r--r--  test/CodeGen/X86/system-intrinsics-64-xsave.ll  82
-rw-r--r--  test/CodeGen/X86/system-intrinsics-64-xsavec.ll  42
-rw-r--r--  test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll  42
-rw-r--r--  test/CodeGen/X86/system-intrinsics-64-xsaves.ll  82
-rw-r--r--  test/CodeGen/X86/system-intrinsics-xsave.ll  46
-rw-r--r--  test/CodeGen/X86/system-intrinsics-xsavec.ll  24
-rw-r--r--  test/CodeGen/X86/system-intrinsics-xsaveopt.ll  24
-rw-r--r--  test/CodeGen/X86/system-intrinsics-xsaves.ll  46
-rw-r--r--  test/CodeGen/X86/vec_insert-7.ll  34
-rw-r--r--  test/CodeGen/X86/vec_partial.ll  64
-rw-r--r--  test/CodeGen/X86/vec_reassociate.ll  238
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v4.ll  139
-rw-r--r--  test/CodeGen/X86/win64_frame.ll  5
-rw-r--r--  test/CodeGen/X86/wineh-coreclr.ll  625
-rw-r--r--  test/CodeGen/X86/x86-32-intrcc.ll  158
-rw-r--r--  test/CodeGen/X86/x86-64-flags-intrinsics.ll  37
-rw-r--r--  test/CodeGen/X86/x86-64-intrcc.ll  170
-rw-r--r--  test/CodeGen/X86/x86-flags-intrinsics.ll  31
-rw-r--r--  test/CodeGen/X86/x86-win64-shrink-wrapping.ll  4
-rw-r--r--  test/DebugInfo/COFF/asm.ll  17
-rw-r--r--  test/DebugInfo/debugmacinfo.test  54
-rw-r--r--  test/JitListener/multiple.ll  74
-rw-r--r--  test/JitListener/simple.ll  10
-rw-r--r--  test/MC/ARM/gas-compl-copr-reg.s  14
-rw-r--r--  test/Transforms/EarlyCSE/AArch64/ldstN.ll  36
-rw-r--r--  test/Transforms/InferFunctionAttrs/annotate.ll  11
-rw-r--r--  test/Transforms/InstCombine/double-float-shrink-1.ll  253
-rw-r--r--  test/Transforms/InstCombine/fast-math.ll  33
-rw-r--r--  test/Transforms/InstCombine/insert-extract-shuffle.ll  53
-rw-r--r--  test/Transforms/InstCombine/token.ll  17
-rw-r--r--  test/Transforms/InstSimplify/call.ll  2
-rw-r--r--  test/Transforms/LICM/funclet.ll  107
-rw-r--r--  test/Transforms/LICM/sinking.ll  4
-rw-r--r--  test/Transforms/MemCpyOpt/fca2memcpy.ll  72
-rw-r--r--  test/Transforms/PlaceSafepoints/leaf-function.ll  35
-rw-r--r--  test/Transforms/PlaceSafepoints/statepoint-coreclr.ll  62
-rw-r--r--  test/Transforms/Reassociate/factorize-again.ll  34
-rw-r--r--  test/Transforms/Reassociate/secondary.ll  2
-rw-r--r--  test/Transforms/SimplifyCFG/empty-catchpad.ll  115
-rw-r--r--  test/Transforms/SimplifyCFG/wineh-unreachable.ll  84
-rw-r--r--  test/Verifier/invalid-eh.ll  60
-rw-r--r--  test/tools/llvm-pdbdump/class-layout.test  114
-rw-r--r--  test/tools/llvm-pdbdump/enum-layout.test  40
-rw-r--r--  test/tools/llvm-pdbdump/load-address.test  20
-rw-r--r--  test/tools/llvm-symbolizer/pdb/lit.local.cfg  2
91 files changed, 5300 insertions, 1909 deletions
diff --git a/test/Analysis/BasicAA/memset_pattern.ll b/test/Analysis/BasicAA/memset_pattern.ll
index 25bdb2e202fb..821cbdf4bb06 100644
--- a/test/Analysis/BasicAA/memset_pattern.ll
+++ b/test/Analysis/BasicAA/memset_pattern.ll
@@ -18,4 +18,4 @@ entry:
ret i32 %l
}
-declare void @memset_pattern16(i8*, i8*, i64)
+declare void @memset_pattern16(i8*, i8* readonly, i64) argmemonly
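The two added attributes are what alias analysis keys on: argmemonly promises the callee touches only memory reachable from its pointer arguments, and readonly on the pattern argument promises that buffer is never written through the call. A minimal sketch of what this enables, with illustrative names that are not part of the patch:

declare void @memset_pattern16(i8*, i8* readonly, i64) argmemonly

define i32 @caller(i32* noalias %unrelated, i8* %dst, i8* %pat) {
  store i32 7, i32* %unrelated
  call void @memset_pattern16(i8* %dst, i8* %pat, i64 16)
  ; With argmemonly on the callee and noalias on %unrelated, alias
  ; analysis can conclude the call does not modify %unrelated, so this
  ; load may be forwarded from the store above.
  %v = load i32, i32* %unrelated
  ret i32 %v
}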
diff --git a/test/Analysis/GlobalsModRef/argmemonly-escape.ll b/test/Analysis/GlobalsModRef/argmemonly-escape.ll
deleted file mode 100644
index 64c625810af9..000000000000
--- a/test/Analysis/GlobalsModRef/argmemonly-escape.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: opt < %s -O1 -S -enable-non-lto-gmr=true | FileCheck %s
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.10.0"
-
-@a = internal global [3 x i32] zeroinitializer, align 4
-
-; The important thing we're checking for here is the reload of (some element of)
-; @a after the memset.
-
-; CHECK-LABEL: @main
-; CHECK: load i32, i32* getelementptr {{.*}} @a
-; CHECK-NEXT: call void @memsetp0i8i64{{.*}} @a
-; CHECK-NEXT: load i32, i32* getelementptr {{.*}} @a
-; CHECK-NEXT: call void @memsetp0i8i64A{{.*}} @a
-; CHECK-NEXT: load i32, i32* getelementptr {{.*}} @a
-; CHECK: icmp eq
-; CHECK: br i1
-
-define i32 @main() {
-entry:
- %0 = bitcast [3 x i32]* @a to i8*
- %1 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
- call void @memsetp0i8i64(i8* %0, i8 0, i64 4, i32 4, i1 false)
- %2 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
- call void @memsetp0i8i64A(i8* %0, i8 0, i64 4, i32 4, i1 false)
- %3 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
- %4 = add i32 %2, %3
- %cmp1 = icmp eq i32 %1, %4
- br i1 %cmp1, label %if.then, label %if.end
-
-if.then: ; preds = %entry
- call void @abort() #3
- unreachable
-
-if.end: ; preds = %entry
- ret i32 0
-}
-
-; Function Attrs: nounwind argmemonly
-declare void @memsetp0i8i64(i8* nocapture, i8, i64, i32, i1) nounwind argmemonly
-
-; Function Attrs: nounwind inaccessiblemem_or_argmemonly
-declare void @memsetp0i8i64A(i8* nocapture, i8, i64, i32, i1) nounwind inaccessiblemem_or_argmemonly
-
-; Function Attrs: noreturn nounwind
-declare void @abort() noreturn nounwind
diff --git a/test/Analysis/GlobalsModRef/inaccessiblememonly.ll b/test/Analysis/GlobalsModRef/inaccessiblememonly.ll
new file mode 100644
index 000000000000..d7a3cfc78a33
--- /dev/null
+++ b/test/Analysis/GlobalsModRef/inaccessiblememonly.ll
@@ -0,0 +1,21 @@
+; RUN: opt -O3 -S < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @donteliminate() {
+; CHECK-LABEL: donteliminate
+; CHECK-NEXT: tail call noalias i8* @allocmemory()
+; CHECK-NEXT: tail call noalias i8* @allocmemory()
+; CHECK-NEXT: tail call noalias i8* @allocmemory()
+; CHECK-NEXT: ret void
+ %1 = tail call noalias i8* @allocmemory()
+ %2 = tail call noalias i8* @allocmemory()
+ %3 = tail call noalias i8* @allocmemory()
+ ret void
+}
+
+; Function Attrs: inaccessiblememonly
+declare noalias i8* @allocmemory() #0
+
+attributes #0 = { inaccessiblememonly }
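inaccessiblememonly tells the optimizer the callee reads and writes only memory that is not visible to the current module, such as an allocator's internal bookkeeping. The calls therefore still have side effects GlobalsModRef cannot disprove, which is why all three survive -O3 even though their results are unused. Contrast with a readnone function, shown here as a rough sketch (names are illustrative):

declare noalias i8* @allocmemory() inaccessiblememonly
declare i8* @purefunc() readnone nounwind

define void @contrast() {
  ; May update hidden allocator state: must not be deleted.
  %a = tail call noalias i8* @allocmemory()
  ; Reads and writes nothing, result unused: trivially dead.
  %b = call i8* @purefunc()
  ret void
}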
diff --git a/test/Analysis/GlobalsModRef/modreftest.ll b/test/Analysis/GlobalsModRef/modreftest.ll
index 2018b149fc06..07497705e65a 100644
--- a/test/Analysis/GlobalsModRef/modreftest.ll
+++ b/test/Analysis/GlobalsModRef/modreftest.ll
@@ -16,23 +16,3 @@ define i32 @test(i32* %P) {
define void @doesnotmodX() {
ret void
}
-
-declare void @InaccessibleMemOnlyFunc( ) #0
-declare void @InaccessibleMemOrArgMemOnlyFunc( ) #1
-
-define i32 @test2(i32* %P) {
-; CHECK: @test2
-; CHECK-NEXT: store i32 12, i32* @X
-; CHECK-NEXT: call void @InaccessibleMemOnlyFunc()
-; CHECK-NEXT: call void @InaccessibleMemOrArgMemOnlyFunc()
-; CHECK-NOT: load i32
-; CHECK-NEXT: ret i32 12
- store i32 12, i32* @X
- call void @InaccessibleMemOnlyFunc( )
- call void @InaccessibleMemOrArgMemOnlyFunc( )
- %V = load i32, i32* @X ; <i32> [#uses=1]
- ret i32 %V
-}
-
-attributes #0 = { inaccessiblememonly }
-attributes #1 = { inaccessiblemem_or_argmemonly }
diff --git a/test/Analysis/ValueTracking/known-power-of-two.ll b/test/Analysis/ValueTracking/known-power-of-two.ll
new file mode 100644
index 000000000000..ed98a8f53616
--- /dev/null
+++ b/test/Analysis/ValueTracking/known-power-of-two.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; https://llvm.org/bugs/show_bug.cgi?id=25900
+; An arithmetic shift right of a power of two is not a power
+; of two if the original value is the sign bit. Therefore,
+; we can't transform the sdiv into a udiv.
+
+define i32 @pr25900(i32 %d) {
+ %and = and i32 %d, -2147483648
+; The next 3 lines prevent another fold from masking the bug.
+ %ext = zext i32 %and to i64
+ %or = or i64 %ext, 4294967296
+ %trunc = trunc i64 %or to i32
+ %ashr = ashr exact i32 %trunc, 31
+ %div = sdiv i32 4, %ashr
+ ret i32 %div
+
+; CHECK: sdiv
+}
+
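Tracing the one interesting input makes the comment concrete: %and is either 0 or 0x80000000, the zext/or/trunc round-trip leaves the low 32 bits intact, and ashr exact i32 -2147483648, 31 yields -1, which is not a power of two. The two divisions then disagree, which is exactly why the CHECK insists the sdiv remains (constants shown here as an assumed illustration, not part of the test):

define i32 @signed() {
  %q = sdiv i32 4, -1      ; -4: what the program computes
  ret i32 %q
}

define i32 @unsigned() {
  %q = udiv i32 4, -1      ; 4 / 0xffffffff = 0: what the bad fold would compute
  ret i32 %q
}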
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index 31e501de0a11..9363f503be5c 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -859,17 +859,23 @@ catchpad:
; CHECK-NEXT: br label %body
body:
- invoke void @f.ccc() to label %continue unwind label %terminate
+ invoke void @f.ccc() to label %continue unwind label %terminate.inner
catchret from %catch to label %return
; CHECK: catchret from %catch to label %return
return:
ret i32 0
+terminate.inner:
+ cleanuppad within %catch []
+ unreachable
+ ; CHECK: cleanuppad within %catch []
+ ; CHECK-NEXT: unreachable
+
terminate:
- cleanuppad within %cs []
+ cleanuppad within none []
unreachable
- ; CHECK: cleanuppad within %cs []
+ ; CHECK: cleanuppad within none []
; CHECK-NEXT: unreachable
continue:
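The underlying rule is that a cleanuppad's parent operand must be none (a top-level funclet) or the token of an enclosing pad; %cs was neither, hence the fix. A standalone sketch of both forms, assuming a Windows C++ personality (illustrative, not taken from compatibility.ll):

declare void @may_throw()
declare i32 @__CxxFrameHandler3(...)

define void @sketch() personality i32 (...)* @__CxxFrameHandler3 {
entry:
  invoke void @may_throw() to label %exit unwind label %dispatch
dispatch:
  %cs = catchswitch within none [label %catch] unwind to caller
catch:
  %pad = catchpad within %cs [i8* null, i32 64, i8* null]
  invoke void @may_throw() [ "funclet"(token %pad) ]
          to label %done unwind label %inner
inner:
  ; Parent is the catchpad token: this cleanup nests inside the catch.
  %cp = cleanuppad within %pad []
  unreachable
done:
  catchret from %pad to label %exit
exit:
  ret void
}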
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 994a9956cf7f..921cf6a6f0d1 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -1,27 +1,27 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
-
-;CHECK: @func30
-;CHECK: movi.4h v1, #0x1
-;CHECK: and.8b v0, v0, v1
-;CHECK: ushll.4s v0, v0, #0
-;CHECK: str q0, [x0]
-;CHECK: ret
-
-%T0_30 = type <4 x i1>
-%T1_30 = type <4 x i32>
-define void @func30(%T0_30 %v0, %T1_30* %p1) {
- %r = zext %T0_30 %v0 to %T1_30
- store %T1_30 %r, %T1_30* %p1
- ret void
-}
-
-; Extend from v1i1 was crashing things (PR20791). Make sure we do something
-; sensible instead.
-define <1 x i32> @autogen_SD7918() {
-; CHECK-LABEL: autogen_SD7918
-; CHECK: movi d0, #0000000000000000
-; CHECK-NEXT: ret
- %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
- %ZE = zext <1 x i1> %I29 to <1 x i32>
- ret <1 x i32> %ZE
-}
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+;CHECK: @func30
+;CHECK: movi.4h v1, #0x1
+;CHECK: and.8b v0, v0, v1
+;CHECK: ushll.4s v0, v0, #0
+;CHECK: str q0, [x0]
+;CHECK: ret
+
+%T0_30 = type <4 x i1>
+%T1_30 = type <4 x i32>
+define void @func30(%T0_30 %v0, %T1_30* %p1) {
+ %r = zext %T0_30 %v0 to %T1_30
+ store %T1_30 %r, %T1_30* %p1
+ ret void
+}
+
+; Extend from v1i1 was crashing things (PR20791). Make sure we do something
+; sensible instead.
+define <1 x i32> @autogen_SD7918() {
+; CHECK-LABEL: autogen_SD7918
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
+ %ZE = zext <1 x i1> %I29 to <1 x i32>
+ ret <1 x i32> %ZE
+}
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index a8399f92ebe4..9c2a4fd55d1b 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -6,6 +6,7 @@
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=exynos-m1 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
; CHECK-NOT: {{.*}} is not a recognized processor for this target
diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll
index a397c339a2d7..c2721e70190a 100644
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m1 -o - %s | FileCheck %s
%X = type { i64, i64, i64 }
declare void @f(%X*)
diff --git a/test/CodeGen/AArch64/tbz-tbnz.ll b/test/CodeGen/AArch64/tbz-tbnz.ll
index 8863f70444d1..2099333950ea 100644
--- a/test/CodeGen/AArch64/tbz-tbnz.ll
+++ b/test/CodeGen/AArch64/tbz-tbnz.ll
@@ -256,3 +256,106 @@ if.then:
if.end:
ret void
}
+
+define void @test14(i1 %cond) {
+; CHECK-LABEL: @test14
+ br i1 %cond, label %if.end, label %if.then
+
+; CHECK-NOT: and
+; CHECK: tbnz w0, #0
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test15(i1 %cond) {
+; CHECK-LABEL: @test15
+ %cond1 = xor i1 %cond, -1
+ br i1 %cond1, label %if.then, label %if.end
+
+; CHECK-NOT: movn
+; CHECK: tbnz w0, #0
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test16(i64 %in) {
+; CHECK-LABEL: @test16
+ %shl = shl i64 %in, 3
+ %and = and i64 %shl, 32
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %then, label %end
+
+; CHECK-NOT: lsl
+; CHECK: tbnz w0, #2
+
+then:
+ call void @t()
+ br label %end
+
+end:
+ ret void
+}
+
+define void @test17(i64 %in) {
+; CHECK-LABEL: @test17
+ %shr = ashr i64 %in, 3
+ %and = and i64 %shr, 1
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %then, label %end
+
+; CHECK-NOT: lsr
+; CHECK: tbnz w0, #3
+
+then:
+ call void @t()
+ br label %end
+
+end:
+ ret void
+}
+
+define void @test18(i32 %in) {
+; CHECK-LABEL: @test18
+ %shr = ashr i32 %in, 2
+ %cond = icmp sge i32 %shr, 0
+ br i1 %cond, label %then, label %end
+
+; CHECK-NOT: asr
+; CHECK: tbnz w0, #31
+
+then:
+ call void @t()
+ br label %end
+
+end:
+ ret void
+}
+
+define void @test19(i64 %in) {
+; CHECK-LABEL: @test19
+ %shl = lshr i64 %in, 3
+ %trunc = trunc i64 %shl to i32
+ %and = and i32 %trunc, 1
+ %cond = icmp eq i32 %and, 0
+ br i1 %cond, label %then, label %end
+
+; CHECK-NOT: ubfx
+; CHECK: tbnz w0, #3
+
+then:
+ call void @t()
+ br label %end
+
+end:
+ ret void
+}
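Each new test leans on the same bit arithmetic: testing bit B of a shifted value is testing bit B-N (for a left shift by N) or B+N (for a right shift) of the source, so the shift folds into the tb(n)z immediate. In test16, (%in << 3) & 32 is nonzero exactly when bit 2 of %in is set, hence tbnz w0, #2. Restated as a self-contained identity (an assumed equivalence, mirroring the test):

define i1 @bit_test_equivalence(i64 %in) {
  %shl = shl i64 %in, 3
  %and = and i64 %shl, 32        ; bit 5 of (%in << 3) ...
  %a = icmp ne i64 %and, 0
  %direct = and i64 %in, 4       ; ... is bit 2 of %in
  %b = icmp ne i64 %direct, 0
  %same = icmp eq i1 %a, %b      ; always true
  ret i1 %same
}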
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index e2ae3353ae1d..9aea7c773431 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,8 +1,10 @@
-; RUN: llc < %s -march=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=CI
-; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI
+; RUN: llc < %s -march=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=CI --check-prefix=NO-XNACK
+; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI --check-prefix=NO-XNACK
+; RUN: llc < %s -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=XNACK
; GCN-LABEL: {{^}}no_vcc_no_flat:
-; GCN: ; NumSgprs: 8
+; NO-XNACK: ; NumSgprs: 8
+; XNACK: ; NumSgprs: 12
define void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{SGPR7}"()
@@ -10,7 +12,8 @@ entry:
}
; GCN-LABEL: {{^}}vcc_no_flat:
-; GCN: ; NumSgprs: 10
+; NO-XNACK: ; NumSgprs: 10
+; XNACK: ; NumSgprs: 12
define void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC}"()
@@ -19,7 +22,8 @@ entry:
; GCN-LABEL: {{^}}no_vcc_flat:
; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 14
+; VI: ; NumSgprs: 12
+; XNACK: ; NumSgprs: 14
define void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
@@ -28,7 +32,8 @@ entry:
; GCN-LABEL: {{^}}vcc_flat:
; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 14
+; VI: ; NumSgprs: 12
+; XNACK: ; NumSgprs: 14
define void @vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 8347b8c96ec4..84380b421051 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
; FIXME: align on alloca seems to be ignored for private_segment_alignment
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index 141ee2560152..b6f8093313cb 100644
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; ALL-LABEL: {{^}}large_alloca_pixel_shader:
; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll
index 6a04261fe47b..6486c6ab2ffc 100644
--- a/test/CodeGen/AMDGPU/load.ll
+++ b/test/CodeGen/AMDGPU/load.ll
@@ -1,7 +1,8 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck --check-prefix=FUNC --check-prefix=CI-HSA --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
;===------------------------------------------------------------------------===;
; GLOBAL ADDRESS SPACE
@@ -11,7 +12,8 @@
; FUNC-LABEL: {{^}}load_i8:
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ubyte v{{[0-9]+}},
+; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; CI-HSA: flat_load_ubyte
define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8, i8 addrspace(1)* %in
%2 = zext i8 %1 to i32
@@ -23,7 +25,8 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; R600: 8
-; SI: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; CI-HSA: flat_load_sbyte
define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = load i8, i8 addrspace(1)* %in
@@ -35,8 +38,10 @@ entry:
; FUNC-LABEL: {{^}}load_v2i8:
; R600: VTX_READ_8
; R600: VTX_READ_8
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; CI-HSA: flat_load_ubyte
+; CI-HSA: flat_load_ubyte
define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
entry:
%0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
@@ -53,8 +58,10 @@ entry:
; R600-DAG: 8
; R600-DAG: 8
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; CI-HSA: flat_load_sbyte
+; CI-HSA: flat_load_sbyte
define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
entry:
%0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
@@ -68,10 +75,14 @@ entry:
; R600: VTX_READ_8
; R600: VTX_READ_8
; R600: VTX_READ_8
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; CI-HSA: flat_load_ubyte
+; CI-HSA: flat_load_ubyte
+; CI-HSA: flat_load_ubyte
+; CI-HSA: flat_load_ubyte
define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
entry:
%0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
@@ -93,10 +104,14 @@ entry:
; R600-DAG: 8
; R600-DAG: 8
; R600-DAG: 8
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; CI-HSA: flat_load_sbyte
+; CI-HSA: flat_load_sbyte
+; CI-HSA: flat_load_sbyte
+; CI-HSA: flat_load_sbyte
define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
entry:
%0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
@@ -108,7 +123,8 @@ entry:
; Load an i16 value from the global address space.
; FUNC-LABEL: {{^}}load_i16:
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
%0 = load i16 , i16 addrspace(1)* %in
@@ -121,7 +137,8 @@ entry:
; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; R600: 16
-; SI: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; CI-HSA: flat_load_sshort
define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
%0 = load i16, i16 addrspace(1)* %in
@@ -133,8 +150,10 @@ entry:
; FUNC-LABEL: {{^}}load_v2i16:
; R600: VTX_READ_16
; R600: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
%0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -150,8 +169,10 @@ entry:
; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
; R600-DAG: 16
; R600-DAG: 16
-; SI: buffer_load_sshort
-; SI: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; CI-HSA: flat_load_sshort
+; CI-HSA: flat_load_sshort
define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
%0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -165,10 +186,14 @@ entry:
; R600: VTX_READ_16
; R600: VTX_READ_16
; R600: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
+; CI-HSA: flat_load_ushort
+; CI-HSA: flat_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
entry:
%0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
@@ -190,10 +215,14 @@ entry:
; R600-DAG: 16
; R600-DAG: 16
; R600-DAG: 16
-; SI: buffer_load_sshort
-; SI: buffer_load_sshort
-; SI: buffer_load_sshort
-; SI: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; CI-HSA: flat_load_sshort
+; CI-HSA: flat_load_sshort
+; CI-HSA: flat_load_sshort
+; CI-HSA: flat_load_sshort
define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
entry:
%0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
@@ -206,7 +235,8 @@ entry:
; FUNC-LABEL: {{^}}load_i32:
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI: buffer_load_dword v{{[0-9]+}}
+; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
+; CI-HSA: flat_load_dword
define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = load i32, i32 addrspace(1)* %in
@@ -218,7 +248,8 @@ entry:
; FUNC-LABEL: {{^}}load_f32:
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI: buffer_load_dword v{{[0-9]+}}
+; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
+; CI-HSA: flat_load_dword
define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
entry:
%0 = load float, float addrspace(1)* %in
@@ -230,7 +261,8 @@ entry:
; FUNC-LABEL: {{^}}load_v2f32:
; R600: MEM_RAT
; R600: VTX_READ_64
-; SI: buffer_load_dwordx2
+; SI-NOHSA: buffer_load_dwordx2
+; CI-HSA: flat_load_dwordx2
define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
entry:
%0 = load <2 x float>, <2 x float> addrspace(1)* %in
@@ -240,7 +272,8 @@ entry:
; FUNC-LABEL: {{^}}load_i64:
; R600: VTX_READ_64
-; SI: buffer_load_dwordx2
+; SI-NOHSA: buffer_load_dwordx2
+; CI-HSA: flat_load_dwordx2
define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
entry:
%0 = load i64, i64 addrspace(1)* %in
@@ -253,7 +286,8 @@ entry:
; R600: MEM_RAT
; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
; R600: 31
-; SI: buffer_load_dword
+; SI-NOHSA: buffer_load_dword
+; CI-HSA: flat_load_dword
define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
@@ -278,8 +312,10 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; CI-HSA: flat_load_dwordx4
+; CI-HSA: flat_load_dwordx4
define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
entry:
%0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
@@ -293,10 +329,14 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; CI-HSA: flat_load_dwordx4
+; CI-HSA: flat_load_dwordx4
+; CI-HSA: flat_load_dwordx4
+; CI-HSA: flat_load_dwordx4
define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
entry:
%0 = load <16 x i32>, <16 x i32> addrspace(1)* %in
@@ -313,7 +353,8 @@ entry:
; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; R600: 8
-; SI: buffer_load_sbyte v{{[0-9]+}},
+; SI-NOHSA: buffer_load_sbyte v{{[0-9]+}},
+; CI-HSA: flat_load_sbyte v{{[0-9]+}},
define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = load i8, i8 addrspace(2)* %in
@@ -325,7 +366,8 @@ entry:
; Load an aligned i8 value
; FUNC-LABEL: {{^}}load_const_i8_aligned:
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ubyte v{{[0-9]+}},
+; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; CI-HSA: flat_load_ubyte v{{[0-9]+}},
define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = load i8, i8 addrspace(2)* %in
@@ -337,7 +379,8 @@ entry:
; Load an un-aligned i8 value
; FUNC-LABEL: {{^}}load_const_i8_unaligned:
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ubyte v{{[0-9]+}},
+; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; CI-HSA: flat_load_ubyte v{{[0-9]+}},
define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = getelementptr i8, i8 addrspace(2)* %in, i32 1
@@ -352,7 +395,8 @@ entry:
; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; R600: 16
-; SI: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; CI-HSA: flat_load_sshort
define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
%0 = load i16, i16 addrspace(2)* %in
@@ -364,7 +408,8 @@ entry:
; Load an aligned i16 value
; FUNC-LABEL: {{^}}load_const_i16_aligned:
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
%0 = load i16, i16 addrspace(2)* %in
@@ -376,7 +421,8 @@ entry:
; Load an un-aligned i16 value
; FUNC-LABEL: {{^}}load_const_i16_unaligned:
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
%0 = getelementptr i16, i16 addrspace(2)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index a30c25e700ab..551f34339a12 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
declare i32 @llvm.r600.read.tidig.x() #0
declare i32 @llvm.r600.read.tidig.y() #0
@@ -18,8 +19,10 @@ declare i32 @llvm.r600.read.tidig.y() #0
; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_*
; instructions
-; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
-; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}
+; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}
define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
entry:
@@ -50,8 +53,10 @@ done: ; preds = %loop
; Test moving an SMRD instruction to the VALU
; GCN-LABEL: {{^}}smrd_valu:
+; FIXME: We should be using flat load for HSA.
; GCN: buffer_load_dword [[OUT:v[0-9]+]]
-; GCN: buffer_store_dword [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[OUT]]
+; GCN-HSA: flat_store_dword [[OUT]]
define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
entry:
%tmp = icmp ne i32 %a, 0
@@ -77,8 +82,9 @@ endif: ; preds = %else, %if
; Test moving an SMRD with an immediate offset to the VALU
; GCN-LABEL: {{^}}smrd_valu2:
-; GCN-NOT: v_add
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
+; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -91,12 +97,14 @@ entry:
; Use a big offset that will use the SMRD literal offset on CI
; GCN-LABEL: {{^}}smrd_valu_ci_offset:
-; GCN-NOT: v_add
-; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
-; GCN: v_add_i32_e32
-; GCN: buffer_store_dword
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: buffer_store_dword
+; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-HSA: flat_store_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -109,13 +117,14 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2:
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dwordx2
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: buffer_store_dwordx2
+; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -128,15 +137,16 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4:
-; GCN-NOT: v_add
-; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dwordx4
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -152,25 +162,27 @@ entry:
; CI.
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
-
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
+
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -184,35 +196,40 @@ entry:
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
-
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
+
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
; GCN: s_endpgm
define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
@@ -227,9 +244,11 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu2_salu_user:
-; GCN: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; GCN-NOHSA: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
-; GCN: buffer_store_dword [[ADD]]
+; GCN-NOHSA: buffer_store_dword [[ADD]]
+; GCN-HSA: flat_store_dword [[ADD]]
define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -242,7 +261,8 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
+; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -254,8 +274,9 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset:
-; GCN-NOT: v_add
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
+; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -267,8 +288,10 @@ entry:
}
; GCN-LABEL: {{^}}s_load_imm_v8i32:
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.r600.read.tidig.x()
@@ -280,16 +303,18 @@ entry:
}
; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user:
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: buffer_store_dword
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: buffer_store_dword
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.r600.read.tidig.x()
@@ -319,10 +344,14 @@ entry:
}
; GCN-LABEL: {{^}}s_load_imm_v16i32:
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -334,26 +363,30 @@ entry:
}
; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user:
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: buffer_store_dword
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: buffer_store_dword
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
new file mode 100644
index 000000000000..c91a44cf60e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
+
+; On Tonga and Iceland, limited SGPR availability means care must be taken to
+; allocate scratch registers correctly. Check that this test compiles without
+; error.
+; TONGA-LABEL: test
+define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
+entry:
+ %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %aptr = getelementptr <256 x i32>, <256 x i32> addrspace(1)* %in, i32 %tid
+ %a = load <256 x i32>, <256 x i32> addrspace(1)* %aptr
+ call void asm sideeffect "", "~{memory}" ()
+ %outptr = getelementptr <256 x i32>, <256 x i32> addrspace(1)* %in, i32 %tid
+ store <256 x i32> %a, <256 x i32> addrspace(1)* %outptr
+
+; mark 128-bit SGPR registers as used so they are unavailable for the
+; scratch resource descriptor
+ call void asm sideeffect "", "~{SGPR4},~{SGPR8},~{SGPR12},~{SGPR16},~{SGPR20},~{SGPR24},~{SGPR28}" ()
+ call void asm sideeffect "", "~{SGPR32},~{SGPR36},~{SGPR40},~{SGPR44},~{SGPR48},~{SGPR52},~{SGPR56}" ()
+ call void asm sideeffect "", "~{SGPR60},~{SGPR64},~{SGPR68}" ()
+ ret void
+}
+
+declare i32 @llvm.SI.tid() nounwind readnone
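The mechanism this test (and flat-scratch-reg.ll above) relies on is the empty inline-asm clobber: it emits no instructions but marks registers as used, forcing the allocator to route the scratch resource descriptor around them. The core idiom, lifted from the test:

  ; Pins SGPR4 for the whole function without generating any code.
  call void asm sideeffect "", "~{SGPR4}"()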
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index bf502b3ae077..a74b3e441a13 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -109,6 +109,9 @@
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A72-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m1 | FileCheck %s --check-prefix=EXYNOS-M1
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m1 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-M1-FAST
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m1 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A-FAST
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=CORTEX-A7-CHECK
@@ -138,6 +141,9 @@
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=exynos-m1 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=exynos-m1 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+
; ARMv7a
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
@@ -1238,6 +1244,36 @@
; CORTEX-A72-FAST-NOT: .eabi_attribute 22
; CORTEX-A72-FAST: .eabi_attribute 23, 1
+; EXYNOS-M1: .cpu exynos-m1
+; EXYNOS-M1: .eabi_attribute 6, 14
+; EXYNOS-M1: .eabi_attribute 7, 65
+; EXYNOS-M1: .eabi_attribute 8, 1
+; EXYNOS-M1: .eabi_attribute 9, 2
+; EXYNOS-M1: .fpu crypto-neon-fp-armv8
+; EXYNOS-M1: .eabi_attribute 12, 3
+; EXYNOS-M1-NOT: .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; EXYNOS-M1: .eabi_attribute 20, 1
+; EXYNOS-M1: .eabi_attribute 21, 1
+; EXYNOS-M1-NOT: .eabi_attribute 22
+; EXYNOS-M1: .eabi_attribute 23, 3
+; EXYNOS-M1: .eabi_attribute 24, 1
+; EXYNOS-M1: .eabi_attribute 25, 1
+; EXYNOS-M1-NOT: .eabi_attribute 27
+; EXYNOS-M1-NOT: .eabi_attribute 28
+; EXYNOS-M1: .eabi_attribute 36, 1
+; EXYNOS-M1: .eabi_attribute 38, 1
+; EXYNOS-M1: .eabi_attribute 42, 1
+; EXYNOS-M1-NOT: .eabi_attribute 44
+; EXYNOS-M1: .eabi_attribute 68, 3
+
+; EXYNOS-M1-FAST-NOT: .eabi_attribute 19
+;; The exynos-m1 has the ARMv8 FP unit, which always flushes preserving sign.
+; EXYNOS-M1-FAST: .eabi_attribute 20, 2
+; EXYNOS-M1-FAST-NOT: .eabi_attribute 21
+; EXYNOS-M1-FAST-NOT: .eabi_attribute 22
+; EXYNOS-M1-FAST: .eabi_attribute 23, 1
+
; GENERIC-FPU-VFPV3-FP16: .fpu vfpv3-fp16
; GENERIC-FPU-VFPV3-D16-FP16: .fpu vfpv3-d16-fp16
; GENERIC-FPU-VFPV3XD: .fpu vfpv3xd
diff --git a/test/CodeGen/ARM/debugtrap.ll b/test/CodeGen/ARM/debugtrap.ll
index 9ce73939ce56..3d8cdea6cdae 100644
--- a/test/CodeGen/ARM/debugtrap.ll
+++ b/test/CodeGen/ARM/debugtrap.ll
@@ -1,17 +1,17 @@
-; This test ensures the @llvm.debugtrap() call is not removed when generating
-; the 'pop' instruction to restore the callee saved registers on ARM.
-
-; RUN: llc < %s -mtriple=armv7 -O0 -filetype=asm | FileCheck %s
-
-declare void @llvm.debugtrap() nounwind
-declare void @foo() nounwind
-
-define void @test() nounwind {
-entry:
- ; CHECK: bl foo
- ; CHECK-NEXT: pop
- ; CHECK-NEXT: trap
- call void @foo()
- call void @llvm.debugtrap()
- ret void
-}
+; This test ensures the @llvm.debugtrap() call is not removed when generating
+; the 'pop' instruction to restore the callee saved registers on ARM.
+
+; RUN: llc < %s -mtriple=armv7 -O0 -filetype=asm | FileCheck %s
+
+declare void @llvm.debugtrap() nounwind
+declare void @foo() nounwind
+
+define void @test() nounwind {
+entry:
+ ; CHECK: bl foo
+ ; CHECK-NEXT: pop
+ ; CHECK-NEXT: trap
+ call void @foo()
+ call void @llvm.debugtrap()
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/offset.ll b/test/CodeGen/WebAssembly/offset.ll
index 75a0bc9ab6c6..901801d7dbbe 100644
--- a/test/CodeGen/WebAssembly/offset.ll
+++ b/test/CodeGen/WebAssembly/offset.ll
@@ -17,6 +17,28 @@ define i32 @load_i32_with_folded_offset(i32* %p) {
ret i32 %t
}
+; With an inbounds gep, we can fold an offset.
+
+; CHECK-LABEL: load_i32_with_folded_gep_offset:
+; CHECK: i32.load $push0=, 24($0){{$}}
+define i32 @load_i32_with_folded_gep_offset(i32* %p) {
+ %s = getelementptr inbounds i32, i32* %p, i32 6
+ %t = load i32, i32* %s
+ ret i32 %t
+}
+
+; We can't fold a negative offset though, even with an inbounds gep.
+
+; CHECK-LABEL: load_i32_with_unfolded_gep_negative_offset:
+; CHECK: i32.const $push0=, -24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i32.load $push2=, 0($pop1){{$}}
+define i32 @load_i32_with_unfolded_gep_negative_offset(i32* %p) {
+ %s = getelementptr inbounds i32, i32* %p, i32 -6
+ %t = load i32, i32* %s
+ ret i32 %t
+}
+
; Without nuw, and even with nsw, we can't fold an offset.
; CHECK-LABEL: load_i32_with_unfolded_offset:
@@ -31,6 +53,18 @@ define i32 @load_i32_with_unfolded_offset(i32* %p) {
ret i32 %t
}
+; Without inbounds, we can't fold a gep offset.
+
+; CHECK-LABEL: load_i32_with_unfolded_gep_offset:
+; CHECK: i32.const $push0=, 24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i32.load $push2=, 0($pop1){{$}}
+define i32 @load_i32_with_unfolded_gep_offset(i32* %p) {
+ %s = getelementptr i32, i32* %p, i32 6
+ %t = load i32, i32* %s
+ ret i32 %t
+}
+
; Same as above but with i64.
; CHECK-LABEL: load_i64_with_folded_offset:
@@ -45,6 +79,28 @@ define i64 @load_i64_with_folded_offset(i64* %p) {
; Same as above but with i64.
+; CHECK-LABEL: load_i64_with_folded_gep_offset:
+; CHECK: i64.load $push0=, 24($0){{$}}
+define i64 @load_i64_with_folded_gep_offset(i64* %p) {
+ %s = getelementptr inbounds i64, i64* %p, i32 3
+ %t = load i64, i64* %s
+ ret i64 %t
+}
+
+; Same as above but with i64.
+
+; CHECK-LABEL: load_i64_with_unfolded_gep_negative_offset:
+; CHECK: i32.const $push0=, -24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i64.load $push2=, 0($pop1){{$}}
+define i64 @load_i64_with_unfolded_gep_negative_offset(i64* %p) {
+ %s = getelementptr inbounds i64, i64* %p, i32 -3
+ %t = load i64, i64* %s
+ ret i64 %t
+}
+
+; Same as above but with i64.
+
; CHECK-LABEL: load_i64_with_unfolded_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
@@ -57,6 +113,18 @@ define i64 @load_i64_with_unfolded_offset(i64* %p) {
ret i64 %t
}
+; Same as above but with i64.
+
+; CHECK-LABEL: load_i64_with_unfolded_gep_offset:
+; CHECK: i32.const $push0=, 24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i64.load $push2=, 0($pop1){{$}}
+define i64 @load_i64_with_unfolded_gep_offset(i64* %p) {
+ %s = getelementptr i64, i64* %p, i32 3
+ %t = load i64, i64* %s
+ ret i64 %t
+}
+
; Same as above but with store.
; CHECK-LABEL: store_i32_with_folded_offset:
@@ -71,6 +139,28 @@ define void @store_i32_with_folded_offset(i32* %p) {
; Same as above but with store.
+; CHECK-LABEL: store_i32_with_folded_gep_offset:
+; CHECK: i32.store $discard=, 24($0), $pop0{{$}}
+define void @store_i32_with_folded_gep_offset(i32* %p) {
+ %s = getelementptr inbounds i32, i32* %p, i32 6
+ store i32 0, i32* %s
+ ret void
+}
+
+; Same as above but with store.
+
+; CHECK-LABEL: store_i32_with_unfolded_gep_negative_offset:
+; CHECK: i32.const $push0=, -24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i32.store $discard=, 0($pop1), $pop2{{$}}
+define void @store_i32_with_unfolded_gep_negative_offset(i32* %p) {
+ %s = getelementptr inbounds i32, i32* %p, i32 -6
+ store i32 0, i32* %s
+ ret void
+}
+
+; Without nuw, and even with nsw, we can't fold a store offset either.
+
; CHECK-LABEL: store_i32_with_unfolded_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
@@ -83,6 +173,18 @@ define void @store_i32_with_unfolded_offset(i32* %p) {
ret void
}
+; Without inbounds, we can't fold a gep offset into a store either.
+
+; CHECK-LABEL: store_i32_with_unfolded_gep_offset:
+; CHECK: i32.const $push0=, 24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i32.store $discard=, 0($pop1), $pop2{{$}}
+define void @store_i32_with_unfolded_gep_offset(i32* %p) {
+ %s = getelementptr i32, i32* %p, i32 6
+ store i32 0, i32* %s
+ ret void
+}
+
; Same as above but with store with i64.
; CHECK-LABEL: store_i64_with_folded_offset:
@@ -97,6 +199,28 @@ define void @store_i64_with_folded_offset(i64* %p) {
; Same as above but with store with i64.
+; CHECK-LABEL: store_i64_with_folded_gep_offset:
+; CHECK: i64.store $discard=, 24($0), $pop0{{$}}
+define void @store_i64_with_folded_gep_offset(i64* %p) {
+ %s = getelementptr inbounds i64, i64* %p, i32 3
+ store i64 0, i64* %s
+ ret void
+}
+
+; As above, a negative gep offset can't be folded, here with an i64 store.
+
+; CHECK-LABEL: store_i64_with_unfolded_gep_negative_offset:
+; CHECK: i32.const $push0=, -24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i64.store $discard=, 0($pop1), $pop2{{$}}
+define void @store_i64_with_unfolded_gep_negative_offset(i64* %p) {
+ %s = getelementptr inbounds i64, i64* %p, i32 -3
+ store i64 0, i64* %s
+ ret void
+}
+
+; Without nuw, and even with nsw, we can't fold an i64 store offset either.
+
; CHECK-LABEL: store_i64_with_unfolded_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
@@ -109,6 +233,18 @@ define void @store_i64_with_unfolded_offset(i64* %p) {
ret void
}
+; Without inbounds, we can't fold an i64 gep store offset either.
+
+; CHECK-LABEL: store_i64_with_unfolded_gep_offset:
+; CHECK: i32.const $push0=, 24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i64.store $discard=, 0($pop1), $pop2{{$}}
+define void @store_i64_with_unfolded_gep_offset(i64* %p) {
+ %s = getelementptr i64, i64* %p, i32 3
+ store i64 0, i64* %s
+ ret void
+}
+
; When loading from a fixed address, materialize a zero.
; CHECK-LABEL: load_i32_from_numeric_address
@@ -159,6 +295,17 @@ define i32 @load_i8_s_with_folded_offset(i8* %p) {
ret i32 %u
}
+; Fold a gep offset into a sign-extending load.
+
+; CHECK-LABEL: load_i8_s_with_folded_gep_offset:
+; CHECK: i32.load8_s $push0=, 24($0){{$}}
+define i32 @load_i8_s_with_folded_gep_offset(i8* %p) {
+ %s = getelementptr inbounds i8, i8* %p, i32 24
+ %t = load i8, i8* %s
+ %u = sext i8 %t to i32
+ ret i32 %u
+}
+
; Fold an offset into a zero-extending load.
; CHECK-LABEL: load_i8_u_with_folded_offset:
@@ -172,6 +319,17 @@ define i32 @load_i8_u_with_folded_offset(i8* %p) {
ret i32 %u
}
+; Fold a gep offset into a zero-extending load.
+
+; CHECK-LABEL: load_i8_u_with_folded_gep_offset:
+; CHECK: i32.load8_u $push0=, 24($0){{$}}
+define i32 @load_i8_u_with_folded_gep_offset(i8* %p) {
+ %s = getelementptr inbounds i8, i8* %p, i32 24
+ %t = load i8, i8* %s
+ %u = zext i8 %t to i32
+ ret i32 %u
+}
+
; Fold an offset into a truncating store.
; CHECK-LABEL: store_i8_with_folded_offset:
@@ -183,3 +341,43 @@ define void @store_i8_with_folded_offset(i8* %p) {
store i8 0, i8* %s
ret void
}
+
+; Fold a gep offset into a truncating store.
+
+; CHECK-LABEL: store_i8_with_folded_gep_offset:
+; CHECK: i32.store8 $discard=, 24($0), $pop0{{$}}
+define void @store_i8_with_folded_gep_offset(i8* %p) {
+ %s = getelementptr inbounds i8, i8* %p, i32 24
+ store i8 0, i8* %s
+ ret void
+}
+
+; Fold the offsets when lowering aggregate loads and stores.
+
+; CHECK-LABEL: aggregate_load_store:
+; CHECK: i32.load $2=, 0($0){{$}}
+; CHECK: i32.load $3=, 4($0){{$}}
+; CHECK: i32.load $4=, 8($0){{$}}
+; CHECK: i32.load $push0=, 12($0){{$}}
+; CHECK: i32.store $discard=, 12($1), $pop0{{$}}
+; CHECK: i32.store $discard=, 8($1), $4{{$}}
+; CHECK: i32.store $discard=, 4($1), $3{{$}}
+; CHECK: i32.store $discard=, 0($1), $2{{$}}
+define void @aggregate_load_store({i32,i32,i32,i32}* %p, {i32,i32,i32,i32}* %q) {
+ ; volatile so that the load and store stay in order for the CHECK lines above
+ %t = load volatile {i32,i32,i32,i32}, {i32,i32,i32,i32}* %p
+ store volatile {i32,i32,i32,i32} %t, {i32,i32,i32,i32}* %q
+ ret void
+}
+
+; Fold the offsets when lowering aggregate return values.
+
+; CHECK-LABEL: aggregate_return:
+; CHECK: i32.const $push0=, 0{{$}}
+; CHECK: i32.store $push1=, 12($0), $pop0{{$}}
+; CHECK: i32.store $push2=, 8($0), $pop1{{$}}
+; CHECK: i32.store $push3=, 4($0), $pop2{{$}}
+; CHECK: i32.store $discard=, 0($0), $pop3{{$}}
+define {i32,i32,i32,i32} @aggregate_return() {
+ ret {i32,i32,i32,i32} zeroinitializer
+}
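The offset.ll tests above pin down when the WebAssembly backend may fold a constant getelementptr offset into the immediate-offset field of a load or store. A minimal sketch of the rule the tests encode, under the usual assumption that wasm load/store offsets are unsigned immediates (annotations illustrative, not part of the patch):

  ; inbounds + non-negative constant index: fold into the addressing mode
  %s = getelementptr inbounds i32, i32* %p, i32 6   ; byte offset 6 * 4 = 24
  %t = load i32, i32* %s                            ; => i32.load $push0=, 24($0)
  ; a negative or non-inbounds offset keeps an explicit i32.add instead:
  ; the unsigned immediate can't encode -24, and without inbounds the
  ; address arithmetic may wrap, so folding would change behavior.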
diff --git a/test/CodeGen/WinEH/wineh-cloning.ll b/test/CodeGen/WinEH/wineh-cloning.ll
index c13e0a163641..3c1793a3bd7f 100644
--- a/test/CodeGen/WinEH/wineh-cloning.ll
+++ b/test/CodeGen/WinEH/wineh-cloning.ll
@@ -2,6 +2,7 @@
declare i32 @__CxxFrameHandler3(...)
declare i32 @__C_specific_handler(...)
+declare void @ProcessCLRException(...)
declare void @f()
@@ -369,6 +370,50 @@ unreachable:
unreachable
}
+define void @test14() personality void (...)* @ProcessCLRException {
+entry:
+ invoke void @f()
+ to label %cont unwind label %cleanup
+cont:
+ invoke void @f()
+ to label %exit unwind label %switch.outer
+cleanup:
+ %cleanpad = cleanuppad within none []
+ invoke void @f() [ "funclet" (token %cleanpad) ]
+ to label %cleanret unwind label %switch.inner
+switch.inner:
+ %cs.inner = catchswitch within %cleanpad [label %pad.inner] unwind to caller
+pad.inner:
+ %cp.inner = catchpad within %cs.inner [i32 1]
+ catchret from %cp.inner to label %join
+cleanret:
+ cleanupret from %cleanpad unwind to caller
+switch.outer:
+ %cs.outer = catchswitch within none [label %pad.outer] unwind to caller
+pad.outer:
+ %cp.outer = catchpad within %cs.outer [i32 2]
+ catchret from %cp.outer to label %join
+join:
+ %phi = phi i32 [ 1, %pad.inner ], [ 2, %pad.outer ]
+ call void @llvm.foo(i32 %phi)
+ unreachable
+exit:
+ ret void
+}
+; Both catchrets target %join, but the catchret from %cp.inner
+; returns to %cleanpad and the catchret from %cp.outer returns to the
+; main function, so %join needs to get cloned and one of the catchrets
+; needs to be updated to target the clone.
+; CHECK-LABEL: define void @test14()
+; CHECK: catchret from %cp.inner to label %[[Clone1:.+]]
+; CHECK: catchret from %cp.outer to label %[[Clone2:.+]]
+; CHECK: [[Clone1]]:
+; CHECK-NEXT: call void @llvm.foo(i32 1)
+; CHECK-NEXT: unreachable
+; CHECK: [[Clone2]]:
+; CHECK-NEXT: call void @llvm.foo(i32 2)
+; CHECK-NEXT: unreachable
+
;; Debug info (from test12)
; Make sure the DISubprogram doesn't get cloned
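For test14, the cloned IR the CHECK lines expect looks roughly like the following; the block names are hypothetical, since [[Clone1]]/[[Clone2]] match whatever labels the cloning pass actually emits:

  catchret from %cp.inner to label %join.clone1
  ...
join.clone1:                 ; reached only from %pad.inner, so the phi folds to 1
  call void @llvm.foo(i32 1)
  unreachable
join.clone2:                 ; reached only from %pad.outer, so the phi folds to 2
  call void @llvm.foo(i32 2)
  unreachable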
diff --git a/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll b/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
index a74aa2dd4623..dfb98bb1ab39 100644
--- a/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
+++ b/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
@@ -1,9 +1,7 @@
; RUN: llc -mcpu=generic -mtriple=x86_64-mingw32 < %s | FileCheck %s
-; CHECK: pushq %rbp
-; CHECK: subq $32, %rsp
-; CHECK: leaq 32(%rsp), %rbp
-; CHECK: movaps %xmm8, -16(%rbp)
-; CHECK: movaps %xmm7, -32(%rbp)
+; CHECK: subq $40, %rsp
+; CHECK: movaps %xmm8, 16(%rsp)
+; CHECK: movaps %xmm7, (%rsp)
define i32 @a() nounwind {
entry:
diff --git a/test/CodeGen/X86/2011-11-30-or.ll b/test/CodeGen/X86/2011-11-30-or.ll
index 4260e817b415..8378a022eab7 100644
--- a/test/CodeGen/X86/2011-11-30-or.ll
+++ b/test/CodeGen/X86/2011-11-30-or.ll
@@ -2,13 +2,13 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "x86_64-apple-macosx10.6.6"
-
-; Test that the order of operands is correct
-; CHECK: select_func
-; CHECK: pblendvb {{LCPI0_[0-9]*}}(%rip), %xmm1
-; CHECK: ret
-
-define void @select_func(<8 x i16> %in) {
+
+; Test that the order of operands is correct
+; CHECK: select_func
+; CHECK: pblendvb {{LCPI0_[0-9]*}}(%rip), %xmm1
+; CHECK: ret
+
+define void @select_func(<8 x i16> %in) {
entry:
%c.lobit.i.i.i = ashr <8 x i16> %in, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45>
diff --git a/test/CodeGen/X86/avx-cast.ll b/test/CodeGen/X86/avx-cast.ll
index b4798f159455..34c5dfaa0162 100644
--- a/test/CodeGen/X86/avx-cast.ll
+++ b/test/CodeGen/X86/avx-cast.ll
@@ -1,38 +1,27 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; Prefer a blend instruction to a vinsert128 instruction because blends
; are simpler (no lane changes) and therefore will have equal or better
; performance.
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castA:
-; AVX1: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: castA:
-; AVX2: vxorps %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: retq
-
-entry:
+; AVX-LABEL: castA:
+; AVX: ## BB#0:
+; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle.i
}
define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castB:
-; AVX1: vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: castB:
-; AVX2: vxorpd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX2-NEXT: retq
-
-entry:
+; AVX-LABEL: castB:
+; AVX: ## BB#0:
+; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT: retq
%shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x double> %shuffle.i
}
@@ -41,16 +30,16 @@ entry:
define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castC:
-; AVX1: vxorps %xmm1, %xmm1, %xmm1
+; AVX1: ## BB#0:
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: castC:
-; AVX2: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
-
-entry:
%shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x i64> %shuffle.i
}
@@ -59,43 +48,28 @@ entry:
; vzeroupper before the return, so just check for the absence of shuffles.
define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castD:
-; AVX1-NOT: extract
-; AVX1-NOT: blend
-;
-; AVX2-LABEL: castD:
-; AVX2-NOT: extract
-; AVX2-NOT: blend
-
-entry:
+; AVX-LABEL: castD:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %shuffle.i
}
define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castE:
-; AVX1-NOT: extract
-; AVX1-NOT: blend
-;
-; AVX2-LABEL: castE:
-; AVX2-NOT: extract
-; AVX2-NOT: blend
-
-entry:
+; AVX-LABEL: castE:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
ret <2 x i64> %shuffle.i
}
define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castF:
-; AVX1-NOT: extract
-; AVX1-NOT: blend
-;
-; AVX2-LABEL: castF:
-; AVX2-NOT: extract
-; AVX2-NOT: blend
-
-entry:
+; AVX-LABEL: castF:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuffle.i
}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 764e13638485..5a17cdb29216 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -6419,3 +6419,126 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x
ret <8 x i64> %res5
}
+declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsrad $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsrad $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsrad $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsraq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpslld $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpslld $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpslld $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsllq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
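Each masked-shift test added above follows the same three-call idiom so that one FileCheck body covers all three encodings: merge-masking into the passthru operand, the unmasked form (all-ones mask), and zero-masking (zeroinitializer passthru), with the results added together to keep every call live. Schematically, for the qi.512 intrinsic declared above (%merge/%plain/%zeroed are illustrative names; the tests use %res/%res1/%res2):

  %merge  = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 %x3)              ; vpsrlq ... {%k1}
  %plain  = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 -1)               ; vpsrlq, no mask
  %zeroed = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> zeroinitializer, i8 %x3)  ; vpsrlq ... {%k1} {z}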
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 5f3d16d4efbb..064652aa470d 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2906,3 +2906,63 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i8 %x1, <
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv32hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsraw $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsraw $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 1db6756c23a8..6b2cb432f1cd 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -4591,3 +4591,126 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i8 %x1, <
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16>, <8 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll
index 29f17bbc0190..febd3d69dd18 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics.ll
@@ -1,18 +1,18 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
-
-define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
- ; CHECK: test_x86_vbroadcastmw_512
- ; CHECK: vpbroadcastmw2d %k0, %zmm0
- %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) ;
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
-
-define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
- ; CHECK: test_x86_broadcastmb_512
- ; CHECK: vpbroadcastmb2q %k0, %zmm0
- %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) ;
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
-
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
+
+define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
+ ; CHECK: test_x86_vbroadcastmw_512
+ ; CHECK: vpbroadcastmw2d %k0, %zmm0
+ %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) ;
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
+
+define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
+ ; CHECK: test_x86_broadcastmb_512
+ ; CHECK: vpbroadcastmb2q %k0, %zmm0
+ %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) ;
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
+
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index d9e8728c5ca6..8ab34bd8c436 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -5801,3 +5801,589 @@ define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x
%res5 = add <8 x i32> %res3, %res4
ret <8 x i32> %res5
}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 -1)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+}
+declare <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> zeroinitializer, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> zeroinitializer, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psra_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+
+declare <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
index e21ba2a14cf5..1665360e4990 100644
--- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -1,11 +1,14 @@
-; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386
-; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f
+; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386
+; RUN: llc -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s -check-prefix=x8664-sahf
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664-sahf
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s -check-prefix=x8664-sahf
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664
+; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s -check-prefix=x8664-sahf
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664-sahf
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s -check-prefix=x8664-sahf
+
+; TODO: Reenable verify-machineinstrs once the if (!AXDead) // FIXME
+; in X86InstrInfo::copyPhysReg() is resolved.
declare i32 @foo()
declare i32 @bar(i64)
@@ -58,9 +61,11 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: movq %rax, %rdi
; x8664-sahf-NEXT: callq bar
+; x8664-sahf-NEXT: pushq %rax
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
; x8664-sahf-NEXT: addb $127, %al
; x8664-sahf-NEXT: sahf
+; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: jne
%cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
@@ -161,9 +166,11 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
; x8664-sahf-LABEL: test_feed_cmov:
; x8664-sahf: cmpxchgl
-; x8664-sahf: seto %al
+; x8664-sahf: pushq %rax
+; x8664-sahf-NEXT: seto %al
; x8664-sahf-NEXT: lahf
; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]]
+; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: callq foo
; x8664-sahf-NEXT: pushq %rax
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
diff --git a/test/CodeGen/X86/copy-eflags.ll b/test/CodeGen/X86/copy-eflags.ll
new file mode 100644
index 000000000000..796c1ecd8c71
--- /dev/null
+++ b/test/CodeGen/X86/copy-eflags.ll
@@ -0,0 +1,54 @@
+; RUN: llc -o - %s | FileCheck %s
+; This tests for the problem originally reported in http://llvm.org/PR25951
+target triple = "i686-unknown-linux-gnu"
+
+@b = common global i8 0, align 1
+@c = common global i32 0, align 4
+@a = common global i8 0, align 1
+@d = common global i8 0, align 1
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; CHECK-LABEL: func:
+; This tests whether eax is properly saved/restored around the lahf/sahf
+; instruction sequences.
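+; The save sequence is 'seto %al; lahf' (OF into %al, the other arithmetic
+; flags into %ah); the restore is 'addb $127, %al', which recreates OF from
+; %al, followed by 'sahf' to reload SF/ZF/AF/PF/CF from %ah.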
+define i32 @func() {
+entry:
+ %bval = load i8, i8* @b
+ %inc = add i8 %bval, 1
+ store i8 %inc, i8* @b
+ %cval = load i32, i32* @c
+ %inc1 = add nsw i32 %cval, 1
+ store i32 %inc1, i32* @c
+ %aval = load i8, i8* @a
+ %inc2 = add i8 %aval, 1
+ store i8 %inc2, i8* @a
+; The flags produced by the incl of %inc1 are copied to a register, so we
+; need to save+restore eax around the copy. The flags will be reused by
+; %tobool.
+; CHECK: pushl %eax
+; CHECK: seto %al
+; CHECK: lahf
+; CHECK: movl %eax, [[REG:%[a-z]+]]
+; CHECK: popl %eax
+ %cmp = icmp eq i8 %aval, %bval
+ %conv5 = zext i1 %cmp to i8
+ store i8 %conv5, i8* @d
+ %tobool = icmp eq i32 %inc1, 0
+; We restore flags with an 'addb, sahf' sequence, so we need to save+restore
+; eax around it.
+; CHECK: pushl %eax
+; CHECK: movl [[REG]], %eax
+; CHECK: addb $127, %al
+; CHECK: sahf
+; CHECK: popl %eax
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ %conv6 = sext i8 %inc to i32
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %conv6)
+ br label %if.end
+
+if.end:
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture readonly, ...)
diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll
index ec367c86526d..b38797e2d9dd 100644
--- a/test/CodeGen/X86/divrem8_ext.ll
+++ b/test/CodeGen/X86/divrem8_ext.ll
@@ -97,4 +97,23 @@ define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
ret i64 %2
}
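+; An 8-bit divb leaves the quotient in %al and the remainder in %ah, so a
+; single division feeds both zero-extended results below.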
+define i64 @pr25754(i8 %a, i8 %c) {
+; CHECK-LABEL: pr25754
+; CHECK: movzbl {{.+}}, %eax
+; CHECK: divb
+; CHECK: movzbl %ah, %ecx
+; CHECK: movzbl %al, %eax
+; CHECK-32: addl %ecx, %eax
+; CHECK-32: sbbl %edx, %edx
+; CHECK-32: andl $1, %edx
+; CHECK-64: addq %rcx, %rax
+; CHECK: ret
+ %r1 = urem i8 %a, %c
+ %d1 = udiv i8 %a, %c
+ %r2 = zext i8 %r1 to i64
+ %d2 = zext i8 %d1 to i64
+ %ret = add i64 %r2, %d2
+ ret i64 %ret
+}
+
@z = external global i8
diff --git a/test/CodeGen/X86/fold-load-unops.ll b/test/CodeGen/X86/fold-load-unops.ll
index bedda3f297da..d2b03dde8319 100644
--- a/test/CodeGen/X86/fold-load-unops.ll
+++ b/test/CodeGen/X86/fold-load-unops.ll
@@ -2,17 +2,19 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
-; Verify that we're folding the load into the math instruction.
+; Verify that we only fold loads into unary SSE intrinsics when optimizing
+; for size.
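+; (A folded load merges the result into the destination's upper elements,
+; which creates a false dependency on the register's previous value; the
+; separate movss/movsd load zeroes those elements and breaks it.)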
define float @rcpss(float* %a) {
; SSE-LABEL: rcpss:
; SSE: # BB#0:
-; SSE-NEXT: rcpss (%rdi), %xmm0
+; SSE-NEXT: movss (%rdi), %xmm0
+; SSE-NEXT: rcpss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rcpss:
; AVX: # BB#0:
-; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
%ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -24,12 +26,14 @@ define float @rcpss(float* %a) {
define float @rsqrtss(float* %a) {
; SSE-LABEL: rsqrtss:
; SSE: # BB#0:
-; SSE-NEXT: rsqrtss (%rdi), %xmm0
+; SSE-NEXT: movss (%rdi), %xmm0
+; SSE-NEXT: rsqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rsqrtss:
; AVX: # BB#0:
-; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
%ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -41,12 +45,14 @@ define float @rsqrtss(float* %a) {
define float @sqrtss(float* %a) {
; SSE-LABEL: sqrtss:
; SSE: # BB#0:
-; SSE-NEXT: sqrtss (%rdi), %xmm0
+; SSE-NEXT: movss (%rdi), %xmm0
+; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtss:
; AVX: # BB#0:
-; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
%ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -58,12 +64,14 @@ define float @sqrtss(float* %a) {
define double @sqrtsd(double* %a) {
; SSE-LABEL: sqrtsd:
; SSE: # BB#0:
-; SSE-NEXT: sqrtsd (%rdi), %xmm0
+; SSE-NEXT: movsd (%rdi), %xmm0
+; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtsd:
; AVX: # BB#0:
-; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovsd (%rdi), %xmm0
+; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load double, double* %a
%ins = insertelement <2 x double> undef, double %ld, i32 0
@@ -72,9 +80,75 @@ define double @sqrtsd(double* %a) {
ret double %ext
}
+define float @rcpss_size(float* %a) optsize {
+; SSE-LABEL: rcpss_size:
+; SSE: # BB#0:
+; SSE-NEXT: rcpss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: rcpss_size:
+; AVX: # BB#0:
+; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load float, float* %a
+ %ins = insertelement <4 x float> undef, float %ld, i32 0
+ %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
+ %ext = extractelement <4 x float> %res, i32 0
+ ret float %ext
+}
+
+define float @rsqrtss_size(float* %a) optsize {
+; SSE-LABEL: rsqrtss_size:
+; SSE: # BB#0:
+; SSE-NEXT: rsqrtss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: rsqrtss_size:
+; AVX: # BB#0:
+; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load float, float* %a
+ %ins = insertelement <4 x float> undef, float %ld, i32 0
+ %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
+ %ext = extractelement <4 x float> %res, i32 0
+ ret float %ext
+}
+
+define float @sqrtss_size(float* %a) optsize {
+; SSE-LABEL: sqrtss_size:
+; SSE: # BB#0:
+; SSE-NEXT: sqrtss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sqrtss_size:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load float, float* %a
+ %ins = insertelement <4 x float> undef, float %ld, i32 0
+ %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
+ %ext = extractelement <4 x float> %res, i32 0
+ ret float %ext
+}
+
+define double @sqrtsd_size(double* %a) optsize {
+; SSE-LABEL: sqrtsd_size:
+; SSE: # BB#0:
+; SSE-NEXT: sqrtsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sqrtsd_size:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load double, double* %a
+ %ins = insertelement <2 x double> undef, double %ld, i32 0
+ %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
+ %ext = extractelement <2 x double> %res, i32 0
+ ret double %ext
+}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
diff --git a/test/CodeGen/X86/fpcmp-soft-fp.ll b/test/CodeGen/X86/fpcmp-soft-fp.ll
index 58d57017d18a..dac468e5cbf0 100644
--- a/test/CodeGen/X86/fpcmp-soft-fp.ll
+++ b/test/CodeGen/X86/fpcmp-soft-fp.ll
@@ -1,127 +1,127 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium -mtriple=x86-linux-gnu -float-abi=soft | FileCheck %s
-
-define i1 @test1(double %d) #0 {
-entry:
- %cmp = fcmp ule double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test1:
-; CHECK: calll __gtdf2
-; CHECK: setle
-; CHECK: retl
-
-define i1 @test2(double %d) #0 {
-entry:
- %cmp = fcmp ult double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test2:
-; CHECK: calll __gedf2
-; CHECK: sets
-; CHECK: retl
-
-define i1 @test3(double %d) #0 {
-entry:
- %cmp = fcmp ugt double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test3:
-; CHECK: calll __ledf2
-; CHECK: setg
-; CHECK: retl
-
-define i1 @test4(double %d) #0 {
-entry:
- %cmp = fcmp uge double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test4:
-; CHECK: calll __ltdf2
-; CHECK: setns
-; CHECK: retl
-
-define i1 @test5(double %d) #0 {
-entry:
- %cmp = fcmp ole double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test5:
-; CHECK: calll __ledf2
-; CHECK: setle
-; CHECK: retl
-
-define i1 @test6(double %d) #0 {
-entry:
- %cmp = fcmp olt double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test6:
-; CHECK: calll __ltdf2
-; CHECK: sets
-; CHECK: retl
-
-define i1 @test7(double %d) #0 {
-entry:
- %cmp = fcmp ogt double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test7:
-; CHECK: calll __gtdf2
-; CHECK: setg
-; CHECK: retl
-
-define i1 @test8(double %d) #0 {
-entry:
- %cmp = fcmp oge double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test8:
-; CHECK: calll __gedf2
-; CHECK: setns
-; CHECK: retl
-
-define i1 @test9(double %d) #0 {
-entry:
- %cmp = fcmp oeq double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test9:
-; CHECK: calll __eqdf2
-; CHECK: sete
-; CHECK: retl
-
-define i1 @test10(double %d) #0 {
-entry:
- %cmp = fcmp ueq double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test10:
-; CHECK: calll __eqdf2
-; CHECK: sete
-; CHECK: calll __unorddf2
-; CHECK: setne
-; CHECK: retl
-
-define i1 @test11(double %d) #0 {
-entry:
- %cmp = fcmp one double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test11:
-; CHECK: calll __gtdf2
-; CHECK: setg
-; CHECK: calll __ltdf2
-; CHECK: sets
-; CHECK: retl
-
-define i1 @test12(double %d) #0 {
-entry:
- %cmp = fcmp une double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test12:
-; CHECK: calll __nedf2
-; CHECK: setne
-; CHECK: retl
-
-attributes #0 = { "use-soft-float"="true" }
+; RUN: llc < %s -march=x86 -mcpu=pentium -mtriple=x86-linux-gnu -float-abi=soft | FileCheck %s
+
+define i1 @test1(double %d) #0 {
+entry:
+ %cmp = fcmp ule double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test1:
+; CHECK: calll __gtdf2
+; CHECK: setle
+; CHECK: retl
+
+define i1 @test2(double %d) #0 {
+entry:
+ %cmp = fcmp ult double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test2:
+; CHECK: calll __gedf2
+; CHECK: sets
+; CHECK: retl
+
+define i1 @test3(double %d) #0 {
+entry:
+ %cmp = fcmp ugt double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test3:
+; CHECK: calll __ledf2
+; CHECK: setg
+; CHECK: retl
+
+define i1 @test4(double %d) #0 {
+entry:
+ %cmp = fcmp uge double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test4:
+; CHECK: calll __ltdf2
+; CHECK: setns
+; CHECK: retl
+
+define i1 @test5(double %d) #0 {
+entry:
+ %cmp = fcmp ole double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test5:
+; CHECK: calll __ledf2
+; CHECK: setle
+; CHECK: retl
+
+define i1 @test6(double %d) #0 {
+entry:
+ %cmp = fcmp olt double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test6:
+; CHECK: calll __ltdf2
+; CHECK: sets
+; CHECK: retl
+
+define i1 @test7(double %d) #0 {
+entry:
+ %cmp = fcmp ogt double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test7:
+; CHECK: calll __gtdf2
+; CHECK: setg
+; CHECK: retl
+
+define i1 @test8(double %d) #0 {
+entry:
+ %cmp = fcmp oge double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test8:
+; CHECK: calll __gedf2
+; CHECK: setns
+; CHECK: retl
+
+define i1 @test9(double %d) #0 {
+entry:
+ %cmp = fcmp oeq double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test9:
+; CHECK: calll __eqdf2
+; CHECK: sete
+; CHECK: retl
+
+define i1 @test10(double %d) #0 {
+entry:
+ %cmp = fcmp ueq double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test10:
+; CHECK: calll __eqdf2
+; CHECK: sete
+; CHECK: calll __unorddf2
+; CHECK: setne
+; CHECK: retl
+
+define i1 @test11(double %d) #0 {
+entry:
+ %cmp = fcmp one double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test11:
+; CHECK: calll __gtdf2
+; CHECK: setg
+; CHECK: calll __ltdf2
+; CHECK: sets
+; CHECK: retl
+
+define i1 @test12(double %d) #0 {
+entry:
+ %cmp = fcmp une double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test12:
+; CHECK: calll __nedf2
+; CHECK: setne
+; CHECK: retl
+
+attributes #0 = { "use-soft-float"="true" }
diff --git a/test/CodeGen/X86/inline-sse.ll b/test/CodeGen/X86/inline-sse.ll
index 78d6b762b5e5..08819b858293 100644
--- a/test/CodeGen/X86/inline-sse.ll
+++ b/test/CodeGen/X86/inline-sse.ll
@@ -21,11 +21,9 @@ define void @nop() nounwind {
;
; X64-LABEL: nop:
; X64: # BB#0:
-; X64-NEXT: subq $24, %rsp
; X64-NEXT: #APP
; X64-NEXT: #NO_APP
-; X64-NEXT: movaps %xmm0, (%rsp)
-; X64-NEXT: addq $24, %rsp
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
%1 = alloca <4 x float>, align 16
%2 = call <4 x float> asm "", "=x,~{dirflag},~{fpsr},~{flags}"()
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
new file mode 100644
index 000000000000..4e582de22a1f
--- /dev/null
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
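+; Test insertion of constant zero elements: the expected lowering is a
+; zeroed register plus a blend/movsd/shuffle, or pinsr* from a zeroed GPR
+; on SSE4.1 and AVX targets.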
+
+define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
+; SSE-LABEL: insert_v2f64_z1:
+; SSE: # BB#0:
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_v2f64_z1:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: retq
+ %1 = insertelement <2 x double> %a, double 0.0, i32 0
+ ret <2 x double> %1
+}
+
+define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
+; SSE-LABEL: insert_v4f64_0zz3:
+; SSE: # BB#0:
+; SSE-NEXT: xorpd %xmm2, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_v4f64_0zz3:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %1 = insertelement <4 x double> %a, double 0.0, i32 1
+ %2 = insertelement <4 x double> %1, double 0.0, i32 2
+ ret <4 x double> %2
+}
+
+define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
+; SSE2-LABEL: insert_v2i64_z1:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v2i64_z1:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorpd %xmm1, %xmm1
+; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v2i64_z1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorpd %xmm1, %xmm1
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v2i64_z1:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrq $0, %rax, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v2i64_z1:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <2 x i64> %a, i64 0, i32 0
+ ret <2 x i64> %1
+}
+
+define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
+; SSE2-LABEL: insert_v4i64_01z3:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4i64_01z3:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorpd %xmm2, %xmm2
+; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4i64_01z3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorpd %xmm2, %xmm2
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4i64_01z3:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrq $0, %rax, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v4i64_01z3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v4i64_01z3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = insertelement <4 x i64> %a, i64 0, i32 2
+ ret <4 x i64> %1
+}
+
+define <4 x float> @insert_v4f32_01z3(<4 x float> %a) {
+; SSE2-LABEL: insert_v4f32_01z3:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4f32_01z3:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4f32_01z3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4f32_01z3:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v4f32_01z3:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: retq
+ %1 = insertelement <4 x float> %a, float 0.0, i32 2
+ ret <4 x float> %1
+}
+
+define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
+; SSE2-LABEL: insert_v8f32_z12345z7:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v8f32_z12345z7:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm2, %xmm2
+; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v8f32_z12345z7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v8f32_z12345z7:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v8f32_z12345z7:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %1 = insertelement <8 x float> %a, float 0.0, i32 0
+ %2 = insertelement <8 x float> %1, float 0.0, i32 6
+ ret <8 x float> %2
+}
+
+define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) {
+; SSE2-LABEL: insert_v4i32_01z3:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4i32_01z3:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4i32_01z3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4i32_01z3:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrd $2, %eax, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v4i32_01z3:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <4 x i32> %a, i32 0, i32 2
+ ret <4 x i32> %1
+}
+
+define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
+; SSE2-LABEL: insert_v8i32_z12345z7:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v8i32_z12345z7:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm2, %xmm2
+; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v8i32_z12345z7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v8i32_z12345z7:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrd $0, %eax, %xmm0
+; SSE41-NEXT: pinsrd $2, %eax, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v8i32_z12345z7:
+; AVX1: # BB#0:
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v8i32_z12345z7:
+; AVX2: # BB#0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = insertelement <8 x i32> %a, i32 0, i32 0
+ %2 = insertelement <8 x i32> %1, i32 0, i32 6
+ ret <8 x i32> %2
+}
+
+define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
+; SSE-LABEL: insert_v8i16_z12345z7:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pinsrw $0, %eax, %xmm0
+; SSE-NEXT: pinsrw $6, %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_v8i16_z12345z7:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <8 x i16> %a, i16 0, i32 0
+ %2 = insertelement <8 x i16> %1, i16 0, i32 6
+ ret <8 x i16> %2
+}
+
+define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) {
+; SSE-LABEL: insert_v16i16_z12345z789ABCDEz:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pinsrw $0, %eax, %xmm0
+; SSE-NEXT: pinsrw $6, %eax, %xmm0
+; SSE-NEXT: pinsrw $7, %eax, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: insert_v16i16_z12345z789ABCDEz:
+; AVX1: # BB#0:
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v16i16_z12345z789ABCDEz:
+; AVX2: # BB#0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = insertelement <16 x i16> %a, i16 0, i32 0
+ %2 = insertelement <16 x i16> %1, i16 0, i32 6
+ %3 = insertelement <16 x i16> %2, i16 0, i32 15
+ ret <16 x i16> %3
+}
+
+define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
+; SSE2-LABEL: insert_v16i8_z123456789ABCDEz:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v16i8_z123456789ABCDEz:
+; SSE3: # BB#0:
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE3-NEXT: pand %xmm1, %xmm0
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: pandn %xmm2, %xmm1
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE3-NEXT: pand %xmm1, %xmm0
+; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE3-NEXT: pandn %xmm2, %xmm1
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v16i8_z123456789ABCDEz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v16i8_z123456789ABCDEz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrb $0, %eax, %xmm0
+; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v16i8_z123456789ABCDEz:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <16 x i8> %a, i8 0, i32 0
+ %2 = insertelement <16 x i8> %1, i8 0, i32 15
+ ret <16 x i8> %2
+}
+
+define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
+; SSE2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; SSE3: # BB#0:
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE3-NEXT: pand %xmm2, %xmm0
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: movd %eax, %xmm3
+; SSE3-NEXT: pandn %xmm3, %xmm2
+; SSE3-NEXT: por %xmm2, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE3-NEXT: pand %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm3, %xmm4
+; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE3-NEXT: pand %xmm5, %xmm1
+; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE3-NEXT: pandn %xmm3, %xmm5
+; SSE3-NEXT: por %xmm5, %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
+; SSE3-NEXT: pandn %xmm4, %xmm2
+; SSE3-NEXT: por %xmm2, %xmm0
+; SSE3-NEXT: por %xmm2, %xmm1
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; SSSE3-NEXT: pshufb %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[15]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrb $0, %eax, %xmm0
+; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: pinsrb $14, %eax, %xmm1
+; SSE41-NEXT: pinsrb $15, %eax, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; AVX1: # BB#0:
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; AVX2: # BB#0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = insertelement <32 x i8> %a, i8 0, i32 0
+ %2 = insertelement <32 x i8> %1, i8 0, i32 15
+ %3 = insertelement <32 x i8> %2, i8 0, i32 30
+ %4 = insertelement <32 x i8> %3, i8 0, i32 31
+ ret <32 x i8> %4
+}
diff --git a/test/CodeGen/X86/insertps-combine.ll b/test/CodeGen/X86/insertps-combine.ll
new file mode 100644
index 000000000000..655f8f49f838
--- /dev/null
+++ b/test/CodeGen/X86/insertps-combine.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
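+; Test that extract/insert/shuffle chains of elements and zeros combine into
+; insertps (using its zero mask where possible) or blends with a zeroed
+; register.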
+define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
+; SSE-LABEL: shuffle_v4f32_0z27:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0z27:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX-NEXT: retq
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+ %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
+; SSE-LABEL: shuffle_v4f32_0zz4:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0zz4:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: retq
+ %vecext = extractelement <4 x float> %xyzw, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
+; SSE-LABEL: shuffle_v4f32_0z24:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0z24:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: retq
+ %vecext = extractelement <4 x float> %xyzw, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %xyzw, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+ %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuffle_v4f32_0zz0(float %a) {
+; SSE-LABEL: shuffle_v4f32_0zz0:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0zz0:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT: retq
+ %vecinit = insertelement <4 x float> undef, float %a, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
+ %vecinit3 = insertelement <4 x float> %vecinit2, float %a, i32 3
+ ret <4 x float> %vecinit3
+}
+
+define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: shuffle_v4f32_0z6z:
+; SSE: # BB#0:
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0z6z:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; AVX-NEXT: retq
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecext2 = extractelement <4 x float> %B, i32 2
+ %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+ ret <4 x float> %vecinit4
+}
diff --git a/test/CodeGen/X86/materialize-one.ll b/test/CodeGen/X86/materialize-one.ll
new file mode 100644
index 000000000000..49da8008b88c
--- /dev/null
+++ b/test/CodeGen/X86/materialize-one.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
+
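+; With optsize, 32-bit 1 and -1 are materialized as 'xorl %eax, %eax' plus
+; incl/decl (3 bytes) rather than 'movl $imm, %eax' (5 bytes).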
+define i32 @one32() optsize {
+entry:
+ ret i32 1
+
+; CHECK32-LABEL: one32
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: ret
+
+; FIXME: Figure out the best approach in 64-bit mode.
+; CHECK64-LABEL: one32
+; CHECK64: movl $1, %eax
+; CHECK64-NEXT: retq
+}
+
+define i32 @minus_one32() optsize {
+entry:
+ ret i32 -1
+
+; CHECK32-LABEL: minus_one32
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: ret
+}
+
+define i16 @one16() optsize {
+entry:
+ ret i16 1
+
+; CHECK32-LABEL: one16
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: retl
+}
+
+define i16 @minus_one16() optsize {
+entry:
+ ret i16 -1
+
+; CHECK32-LABEL: minus_one16
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: retl
+}
+
+define i32 @test_rematerialization() optsize {
+entry:
+ ; Materialize -1 (thiscall forces it into %ecx).
+ tail call x86_thiscallcc void @f(i32 -1)
+
+ ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+ ; spilling it to the stack.
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+ ; -1 should be re-materialized here instead of getting spilled above.
+ ret i32 -1
+
+; CHECK32-LABEL: test_rematerialization
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NOT: %eax
+; CHECK32: retl
+}
+
+define i32 @test_rematerialization2(i32 %x) optsize {
+entry:
+ ; Materialize -1 (thiscall forces it into %ecx).
+ tail call x86_thiscallcc void @f(i32 -1)
+
+ ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+ ; spilling it to the stack.
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+ ; Define eflags.
+ %a = icmp ne i32 %x, 123
+ %b = zext i1 %a to i32
+ ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
+ ; It must therefore not use the xor-dec lowering.
+ %c = select i1 %a, i32 %b, i32 -1
+ ret i32 %c
+
+; CHECK32-LABEL: test_rematerialization2
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: cmpl
+; CHECK32: setne
+; CHECK32-NOT: xorl
+; CHECK32: movl $-1
+; CHECK32: cmov
+; CHECK32: retl
+}
+
+declare x86_thiscallcc void @f(i32)
diff --git a/test/CodeGen/X86/materialize.ll b/test/CodeGen/X86/materialize.ll
deleted file mode 100644
index 695bf0fa5b98..000000000000
--- a/test/CodeGen/X86/materialize.ll
+++ /dev/null
@@ -1,184 +0,0 @@
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
-; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64
-
-define i32 @one32_nooptsize() {
-entry:
- ret i32 1
-
-; When not optimizing for size, use mov.
-; CHECK32-LABEL: one32_nooptsize:
-; CHECK32: movl $1, %eax
-; CHECK32-NEXT: retl
-; CHECK64-LABEL: one32_nooptsize:
-; CHECK64: movl $1, %eax
-; CHECK64-NEXT: retq
-}
-
-define i32 @one32() optsize {
-entry:
- ret i32 1
-
-; CHECK32-LABEL: one32:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: retl
-
-; FIXME: Figure out the best approach in 64-bit mode.
-; CHECK64-LABEL: one32:
-; CHECK64: movl $1, %eax
-; CHECK64-NEXT: retq
-}
-
-define i32 @one32_minsize() minsize {
-entry:
- ret i32 1
-
-; On 32-bit, xor-inc is preferred over push-pop.
-; CHECK32-LABEL: one32_minsize:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: retl
-
-; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to
-; pop into a 64-bit register even when we just need 32 bits.
-; CHECK64-LABEL: one32_minsize:
-; CHECK64: pushq $1
-; CHECK64: .cfi_adjust_cfa_offset 8
-; CHECK64: popq %rax
-; CHECK64: .cfi_adjust_cfa_offset -8
-; CHECK64-NEXT: retq
-}
-
-define i64 @one64_minsize() minsize {
-entry:
- ret i64 1
-; On 64-bit we don't do xor-inc yet, so push-pop it is.
-; CHECK64-LABEL: one64_minsize:
-; CHECK64: pushq $1
-; CHECK64: .cfi_adjust_cfa_offset 8
-; CHECK64: popq %rax
-; CHECK64: .cfi_adjust_cfa_offset -8
-; CHECK64-NEXT: retq
-
-; On Win64 we can't adjust the stack unless there's a frame pointer.
-; CHECKWIN64-LABEL: one64_minsize:
-; CHECKWIN64: movl $1, %eax
-; CHECKWIN64-NEXT: retq
-}
-
-define i32 @minus_one32() optsize {
-entry:
- ret i32 -1
-
-; CHECK32-LABEL: minus_one32:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: retl
-}
-
-define i32 @minus_one32_minsize() minsize {
-entry:
- ret i32 -1
-
-; xor-dec is preferred over push-pop.
-; CHECK32-LABEL: minus_one32_minsize:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: retl
-}
-
-define i16 @one16() optsize {
-entry:
- ret i16 1
-
-; CHECK32-LABEL: one16:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: retl
-}
-
-define i16 @minus_one16() optsize {
-entry:
- ret i16 -1
-
-; CHECK32-LABEL: minus_one16:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: retl
-}
-
-define i32 @minus_five32() minsize {
-entry:
- ret i32 -5
-
-; CHECK32-LABEL: minus_five32:
-; CHECK32: pushl $-5
-; CHECK32: popl %eax
-; CHECK32: retl
-}
-
-define i64 @minus_five64() minsize {
-entry:
- ret i64 -5
-
-; CHECK64-LABEL: minus_five64:
-; CHECK64: pushq $-5
-; CHECK64: .cfi_adjust_cfa_offset 8
-; CHECK64: popq %rax
-; CHECK64: .cfi_adjust_cfa_offset -8
-; CHECK64: retq
-}
-
-define i32 @rematerialize_minus_one() optsize {
-entry:
- ; Materialize -1 (thiscall forces it into %ecx).
- tail call x86_thiscallcc void @f(i32 -1)
-
- ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
- ; spilling it to the stack.
- tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
- ; -1 should be re-materialized here instead of getting spilled above.
- ret i32 -1
-
-; CHECK32-LABEL: rematerialize_minus_one
-; CHECK32: xorl %ecx, %ecx
-; CHECK32-NEXT: decl %ecx
-; CHECK32: calll
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NOT: %eax
-; CHECK32: retl
-}
-
-define i32 @rematerialize_minus_one_eflags(i32 %x) optsize {
-entry:
- ; Materialize -1 (thiscall forces it into %ecx).
- tail call x86_thiscallcc void @f(i32 -1)
-
- ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
- ; spilling it to the stack.
- tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
- ; Define eflags.
- %a = icmp ne i32 %x, 123
- %b = zext i1 %a to i32
- ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
- ; It must therefore not use the xor-dec lowering.
- %c = select i1 %a, i32 %b, i32 -1
- ret i32 %c
-
-; CHECK32-LABEL: rematerialize_minus_one_eflags
-; CHECK32: xorl %ecx, %ecx
-; CHECK32-NEXT: decl %ecx
-; CHECK32: calll
-; CHECK32: cmpl
-; CHECK32: setne
-; CHECK32-NOT: xorl
-; CHECK32: movl $-1
-; CHECK32: cmov
-; CHECK32: retl
-}
-
-declare x86_thiscallcc void @f(i32)
diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index bf457814079c..441fb02a89e6 100644
--- a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -1,5 +1,8 @@
-; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s
+; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s
+
+; TODO: Reenable verify-machineinstrs once the if (!AXDead) // FIXME in
+; X86InstrInfo::copyPhysReg() is resolved.
; The peephole optimizer can elide some physical register copies such as
; EFLAGS. Make sure the flags are used directly, instead of needlessly using
diff --git a/test/CodeGen/X86/pku.ll b/test/CodeGen/X86/pku.ll
new file mode 100644
index 000000000000..8568cf43abc0
--- /dev/null
+++ b/test/CodeGen/X86/pku.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+declare i32 @llvm.x86.rdpkru()
+declare void @llvm.x86.wrpkru(i32)
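+; wrpkru writes %eax to PKRU and requires %ecx and %edx to be zero;
+; rdpkru requires %ecx to be zero and returns PKRU in %eax.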
+
+define void @test_x86_wrpkru(i32 %src) {
+; CHECK-LABEL: test_x86_wrpkru:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: wrpkru
+; CHECK-NEXT: retq
+ call void @llvm.x86.wrpkru(i32 %src)
+ ret void
+}
+
+define i32 @test_x86_rdpkru() {
+; CHECK-LABEL: test_x86_rdpkru:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: rdpkru
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.x86.rdpkru()
+ ret i32 %res
+}
diff --git a/test/CodeGen/X86/powi.ll b/test/CodeGen/X86/powi.ll
index 17d3e3e7d33c..88b5f4eb14b0 100644
--- a/test/CodeGen/X86/powi.ll
+++ b/test/CodeGen/X86/powi.ll
@@ -29,9 +29,9 @@ define double @pow_wrapper_optsize(double %a) optsize {
define double @pow_wrapper_minsize(double %a) minsize {
; CHECK-LABEL: pow_wrapper_minsize:
; CHECK: # BB#0:
-; CHECK-NEXT: movl $128, %edi
+; CHECK-NEXT: movl $15, %edi
; CHECK-NEXT: jmp
- %ret = tail call double @llvm.powi.f64(double %a, i32 128) nounwind ; <double> [#uses=1]
+ %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
ret double %ret
}
diff --git a/test/CodeGen/X86/pr11415.ll b/test/CodeGen/X86/pr11415.ll
index 73c497014116..6c32a2206a7e 100644
--- a/test/CodeGen/X86/pr11415.ll
+++ b/test/CodeGen/X86/pr11415.ll
@@ -4,17 +4,15 @@
; defining %0 before it was read. This caused us to omit the
; movq -8(%rsp), %rdx
-; CHECK: pushq %rax
; CHECK: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: movq %rax, (%rsp)
-; CHECK-NEXT: movq (%rsp), %rdx
+; CHECK-NEXT: movq %rax, -8(%rsp)
+; CHECK-NEXT: movq -8(%rsp), %rdx
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: movq %rdx, (%rsp)
-; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: movq %rdx, -8(%rsp)
; CHECK-NEXT: ret
define i64 @foo() {
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
index 59866c090543..f6dca609bc05 100644
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@@ -1,41 +1,41 @@
-; RUN: llc -mtriple=x86_64-linux -mcpu=corei7 < %s | FileCheck %s
-; This fixes a missing cases in the MI scheduler's constrainLocalCopy exposed by
-; PR21792
-
-@stuff = external constant [256 x double], align 16
-
-define void @func(<4 x float> %vx) {
-entry:
- %tmp2 = bitcast <4 x float> %vx to <2 x i64>
- %and.i = and <2 x i64> %tmp2, <i64 8727373547504, i64 8727373547504>
- %tmp3 = bitcast <2 x i64> %and.i to <4 x i32>
- %index.sroa.0.0.vec.extract = extractelement <4 x i32> %tmp3, i32 0
- %idx.ext = sext i32 %index.sroa.0.0.vec.extract to i64
- %add.ptr = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext
- %tmp4 = bitcast i8* %add.ptr to double*
- %index.sroa.0.4.vec.extract = extractelement <4 x i32> %tmp3, i32 1
- %idx.ext5 = sext i32 %index.sroa.0.4.vec.extract to i64
- %add.ptr6 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext5
- %tmp5 = bitcast i8* %add.ptr6 to double*
- %index.sroa.0.8.vec.extract = extractelement <4 x i32> %tmp3, i32 2
- %idx.ext14 = sext i32 %index.sroa.0.8.vec.extract to i64
- %add.ptr15 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext14
- %tmp6 = bitcast i8* %add.ptr15 to double*
- %index.sroa.0.12.vec.extract = extractelement <4 x i32> %tmp3, i32 3
- %idx.ext19 = sext i32 %index.sroa.0.12.vec.extract to i64
- %add.ptr20 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext19
- %tmp7 = bitcast i8* %add.ptr20 to double*
- %add.ptr46 = getelementptr inbounds i8, i8* bitcast (double* getelementptr inbounds ([256 x double], [256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext
- %tmp16 = bitcast i8* %add.ptr46 to double*
- %add.ptr51 = getelementptr inbounds i8, i8* bitcast (double* getelementptr inbounds ([256 x double], [256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext5
- %tmp17 = bitcast i8* %add.ptr51 to double*
- call void @toto(double* %tmp4, double* %tmp5, double* %tmp6, double* %tmp7, double* %tmp16, double* %tmp17)
- ret void
-; CHECK-LABEL: func:
-; CHECK: pextrq $1, %xmm0,
-; CHECK-NEXT: movd %xmm0, %r[[AX:..]]
-; CHECK-NEXT: movslq %e[[AX]],
-; CHECK-NEXT: sarq $32, %r[[AX]]
-}
-
-declare void @toto(double*, double*, double*, double*, double*, double*)
+; RUN: llc -mtriple=x86_64-linux -mcpu=corei7 < %s | FileCheck %s
+; This fixes a missing case in the MI scheduler's constrainLocalCopy exposed by
+; PR21792
+
+@stuff = external constant [256 x double], align 16
+
+define void @func(<4 x float> %vx) {
+entry:
+ %tmp2 = bitcast <4 x float> %vx to <2 x i64>
+ %and.i = and <2 x i64> %tmp2, <i64 8727373547504, i64 8727373547504>
+ %tmp3 = bitcast <2 x i64> %and.i to <4 x i32>
+ %index.sroa.0.0.vec.extract = extractelement <4 x i32> %tmp3, i32 0
+ %idx.ext = sext i32 %index.sroa.0.0.vec.extract to i64
+ %add.ptr = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext
+ %tmp4 = bitcast i8* %add.ptr to double*
+ %index.sroa.0.4.vec.extract = extractelement <4 x i32> %tmp3, i32 1
+ %idx.ext5 = sext i32 %index.sroa.0.4.vec.extract to i64
+ %add.ptr6 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext5
+ %tmp5 = bitcast i8* %add.ptr6 to double*
+ %index.sroa.0.8.vec.extract = extractelement <4 x i32> %tmp3, i32 2
+ %idx.ext14 = sext i32 %index.sroa.0.8.vec.extract to i64
+ %add.ptr15 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext14
+ %tmp6 = bitcast i8* %add.ptr15 to double*
+ %index.sroa.0.12.vec.extract = extractelement <4 x i32> %tmp3, i32 3
+ %idx.ext19 = sext i32 %index.sroa.0.12.vec.extract to i64
+ %add.ptr20 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext19
+ %tmp7 = bitcast i8* %add.ptr20 to double*
+ %add.ptr46 = getelementptr inbounds i8, i8* bitcast (double* getelementptr inbounds ([256 x double], [256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext
+ %tmp16 = bitcast i8* %add.ptr46 to double*
+ %add.ptr51 = getelementptr inbounds i8, i8* bitcast (double* getelementptr inbounds ([256 x double], [256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext5
+ %tmp17 = bitcast i8* %add.ptr51 to double*
+ call void @toto(double* %tmp4, double* %tmp5, double* %tmp6, double* %tmp7, double* %tmp16, double* %tmp17)
+ ret void
+; CHECK-LABEL: func:
+; CHECK: pextrq $1, %xmm0,
+; CHECK-NEXT: movd %xmm0, %r[[AX:..]]
+; CHECK-NEXT: movslq %e[[AX]],
+; CHECK-NEXT: sarq $32, %r[[AX]]
+}
+
+declare void @toto(double*, double*, double*, double*, double*, double*)
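The CHECK lines above verify a lowering trick: both sign-extended lane indices come from a single 64-bit extract (movd takes the low lane pair, then movslq recovers the low index and sarq $32 the high one). An illustrative scalar sketch of the same split, not part of the test file:

define i64 @lane_indices(i64 %pair) {
  %lo32 = trunc i64 %pair to i32
  %lo = sext i32 %lo32 to i64    ; movslq: low lane, sign-extended
  %hi = ashr i64 %pair, 32       ; sarq $32: high lane, sign already in place
  %sum = add i64 %lo, %hi        ; combined only so both values are used
  ret i64 %sum
}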
diff --git a/test/CodeGen/X86/pr24139.ll b/test/CodeGen/X86/pr24139.ll
index fbe55abcbf7c..ec56345ba648 100644
--- a/test/CodeGen/X86/pr24139.ll
+++ b/test/CodeGen/X86/pr24139.ll
@@ -1,148 +1,148 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
-
-; Check that we do not get excessive spilling from splitting of constant live ranges.
-
-; CHECK-LABEL: PR24139:
-; CHECK: # 16-byte Spill
-; CHECK-NOT: # 16-byte Spill
-; CHECK: retq
-
-define <2 x double> @PR24139(<2 x double> %arg, <2 x double> %arg1, <2 x double> %arg2) {
- %tmp = bitcast <2 x double> %arg to <4 x float>
- %tmp3 = fmul <4 x float> %tmp, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
- %tmp4 = bitcast <2 x double> %arg to <4 x i32>
- %tmp5 = and <4 x i32> %tmp4, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp6 = or <4 x i32> %tmp5, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
- %tmp7 = bitcast <4 x i32> %tmp6 to <4 x float>
- %tmp8 = fadd <4 x float> %tmp3, %tmp7
- %tmp9 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp8) #2
- %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
- %tmp11 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp9) #2
- %tmp12 = fmul <4 x float> %tmp11, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
- %tmp13 = fsub <4 x float> %tmp, %tmp12
- %tmp14 = fmul <4 x float> %tmp11, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
- %tmp15 = fsub <4 x float> %tmp13, %tmp14
- %tmp16 = fmul <4 x float> %tmp15, %tmp15
- %tmp17 = fmul <4 x float> %tmp15, %tmp16
- %tmp18 = fmul <4 x float> %tmp16, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
- %tmp19 = fadd <4 x float> %tmp18, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
- %tmp20 = fmul <4 x float> %tmp16, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
- %tmp21 = fadd <4 x float> %tmp20, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
- %tmp22 = fmul <4 x float> %tmp16, %tmp19
- %tmp23 = fadd <4 x float> %tmp22, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
- %tmp24 = fmul <4 x float> %tmp16, %tmp21
- %tmp25 = fadd <4 x float> %tmp24, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
- %tmp26 = fmul <4 x float> %tmp16, %tmp23
- %tmp27 = fadd <4 x float> %tmp26, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- %tmp28 = fmul <4 x float> %tmp17, %tmp25
- %tmp29 = fadd <4 x float> %tmp15, %tmp28
- %tmp30 = and <2 x i64> %tmp10, <i64 4294967297, i64 4294967297>
- %tmp31 = bitcast <2 x i64> %tmp30 to <4 x i32>
- %tmp32 = icmp eq <4 x i32> %tmp31, zeroinitializer
- %tmp33 = sext <4 x i1> %tmp32 to <4 x i32>
- %tmp34 = bitcast <4 x i32> %tmp33 to <4 x float>
- %tmp35 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp27, <4 x float> %tmp29, <4 x float> %tmp34) #2
- %tmp36 = and <2 x i64> %tmp10, <i64 8589934594, i64 8589934594>
- %tmp37 = bitcast <2 x i64> %tmp36 to <4 x i32>
- %tmp38 = icmp eq <4 x i32> %tmp37, zeroinitializer
- %tmp39 = sext <4 x i1> %tmp38 to <4 x i32>
- %tmp40 = bitcast <4 x float> %tmp35 to <4 x i32>
- %tmp41 = xor <4 x i32> %tmp40, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp42 = bitcast <4 x i32> %tmp41 to <4 x float>
- %tmp43 = bitcast <4 x i32> %tmp39 to <4 x float>
- %tmp44 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp42, <4 x float> %tmp35, <4 x float> %tmp43) #2
- %tmp45 = bitcast <2 x double> %arg1 to <4 x float>
- %tmp46 = fmul <4 x float> %tmp45, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
- %tmp47 = bitcast <2 x double> %arg1 to <4 x i32>
- %tmp48 = and <4 x i32> %tmp47, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp49 = or <4 x i32> %tmp48, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
- %tmp50 = bitcast <4 x i32> %tmp49 to <4 x float>
- %tmp51 = fadd <4 x float> %tmp46, %tmp50
- %tmp52 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp51) #2
- %tmp53 = bitcast <4 x i32> %tmp52 to <2 x i64>
- %tmp54 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp52) #2
- %tmp55 = fmul <4 x float> %tmp54, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
- %tmp56 = fsub <4 x float> %tmp45, %tmp55
- %tmp57 = fmul <4 x float> %tmp54, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
- %tmp58 = fsub <4 x float> %tmp56, %tmp57
- %tmp59 = fmul <4 x float> %tmp58, %tmp58
- %tmp60 = fmul <4 x float> %tmp58, %tmp59
- %tmp61 = fmul <4 x float> %tmp59, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
- %tmp62 = fadd <4 x float> %tmp61, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
- %tmp63 = fmul <4 x float> %tmp59, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
- %tmp64 = fadd <4 x float> %tmp63, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
- %tmp65 = fmul <4 x float> %tmp59, %tmp62
- %tmp66 = fadd <4 x float> %tmp65, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
- %tmp67 = fmul <4 x float> %tmp59, %tmp64
- %tmp68 = fadd <4 x float> %tmp67, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
- %tmp69 = fmul <4 x float> %tmp59, %tmp66
- %tmp70 = fadd <4 x float> %tmp69, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- %tmp71 = fmul <4 x float> %tmp60, %tmp68
- %tmp72 = fadd <4 x float> %tmp58, %tmp71
- %tmp73 = and <2 x i64> %tmp53, <i64 4294967297, i64 4294967297>
- %tmp74 = bitcast <2 x i64> %tmp73 to <4 x i32>
- %tmp75 = icmp eq <4 x i32> %tmp74, zeroinitializer
- %tmp76 = sext <4 x i1> %tmp75 to <4 x i32>
- %tmp77 = bitcast <4 x i32> %tmp76 to <4 x float>
- %tmp78 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp70, <4 x float> %tmp72, <4 x float> %tmp77) #2
- %tmp79 = and <2 x i64> %tmp53, <i64 8589934594, i64 8589934594>
- %tmp80 = bitcast <2 x i64> %tmp79 to <4 x i32>
- %tmp81 = icmp eq <4 x i32> %tmp80, zeroinitializer
- %tmp82 = sext <4 x i1> %tmp81 to <4 x i32>
- %tmp83 = bitcast <4 x float> %tmp78 to <4 x i32>
- %tmp84 = xor <4 x i32> %tmp83, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp85 = bitcast <4 x i32> %tmp84 to <4 x float>
- %tmp86 = bitcast <4 x i32> %tmp82 to <4 x float>
- %tmp87 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp85, <4 x float> %tmp78, <4 x float> %tmp86) #2
- %tmp88 = fadd <4 x float> %tmp44, %tmp87
- %tmp89 = bitcast <2 x double> %arg2 to <4 x float>
- %tmp90 = fmul <4 x float> %tmp89, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
- %tmp91 = bitcast <2 x double> %arg2 to <4 x i32>
- %tmp92 = and <4 x i32> %tmp91, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp93 = or <4 x i32> %tmp92, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
- %tmp94 = bitcast <4 x i32> %tmp93 to <4 x float>
- %tmp95 = fadd <4 x float> %tmp90, %tmp94
- %tmp96 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp95) #2
- %tmp97 = bitcast <4 x i32> %tmp96 to <2 x i64>
- %tmp98 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp96) #2
- %tmp99 = fmul <4 x float> %tmp98, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
- %tmp100 = fsub <4 x float> %tmp89, %tmp99
- %tmp101 = fmul <4 x float> %tmp98, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
- %tmp102 = fsub <4 x float> %tmp100, %tmp101
- %tmp103 = fmul <4 x float> %tmp102, %tmp102
- %tmp104 = fmul <4 x float> %tmp102, %tmp103
- %tmp105 = fmul <4 x float> %tmp103, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
- %tmp106 = fadd <4 x float> %tmp105, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
- %tmp107 = fmul <4 x float> %tmp103, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
- %tmp108 = fadd <4 x float> %tmp107, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
- %tmp109 = fmul <4 x float> %tmp103, %tmp106
- %tmp110 = fadd <4 x float> %tmp109, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
- %tmp111 = fmul <4 x float> %tmp103, %tmp108
- %tmp112 = fadd <4 x float> %tmp111, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
- %tmp113 = fmul <4 x float> %tmp103, %tmp110
- %tmp114 = fadd <4 x float> %tmp113, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- %tmp115 = fmul <4 x float> %tmp104, %tmp112
- %tmp116 = fadd <4 x float> %tmp102, %tmp115
- %tmp117 = and <2 x i64> %tmp97, <i64 4294967297, i64 4294967297>
- %tmp118 = bitcast <2 x i64> %tmp117 to <4 x i32>
- %tmp119 = icmp eq <4 x i32> %tmp118, zeroinitializer
- %tmp120 = sext <4 x i1> %tmp119 to <4 x i32>
- %tmp121 = bitcast <4 x i32> %tmp120 to <4 x float>
- %tmp122 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp114, <4 x float> %tmp116, <4 x float> %tmp121) #2
- %tmp123 = and <2 x i64> %tmp97, <i64 8589934594, i64 8589934594>
- %tmp124 = bitcast <2 x i64> %tmp123 to <4 x i32>
- %tmp125 = icmp eq <4 x i32> %tmp124, zeroinitializer
- %tmp126 = sext <4 x i1> %tmp125 to <4 x i32>
- %tmp127 = bitcast <4 x float> %tmp122 to <4 x i32>
- %tmp128 = xor <4 x i32> %tmp127, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp129 = bitcast <4 x i32> %tmp128 to <4 x float>
- %tmp130 = bitcast <4 x i32> %tmp126 to <4 x float>
- %tmp131 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp129, <4 x float> %tmp122, <4 x float> %tmp130) #2
- %tmp132 = fadd <4 x float> %tmp88, %tmp131
- %tmp133 = bitcast <4 x float> %tmp132 to <2 x double>
- ret <2 x double> %tmp133
-}
-
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
-declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+
+; Check that we do not get excessive spilling from splitting of constant live ranges.
+
+; CHECK-LABEL: PR24139:
+; CHECK: # 16-byte Spill
+; CHECK-NOT: # 16-byte Spill
+; CHECK: retq
+
+define <2 x double> @PR24139(<2 x double> %arg, <2 x double> %arg1, <2 x double> %arg2) {
+ %tmp = bitcast <2 x double> %arg to <4 x float>
+ %tmp3 = fmul <4 x float> %tmp, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
+ %tmp4 = bitcast <2 x double> %arg to <4 x i32>
+ %tmp5 = and <4 x i32> %tmp4, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %tmp6 = or <4 x i32> %tmp5, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
+ %tmp7 = bitcast <4 x i32> %tmp6 to <4 x float>
+ %tmp8 = fadd <4 x float> %tmp3, %tmp7
+ %tmp9 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp8) #2
+ %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
+ %tmp11 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp9) #2
+ %tmp12 = fmul <4 x float> %tmp11, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
+ %tmp13 = fsub <4 x float> %tmp, %tmp12
+ %tmp14 = fmul <4 x float> %tmp11, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
+ %tmp15 = fsub <4 x float> %tmp13, %tmp14
+ %tmp16 = fmul <4 x float> %tmp15, %tmp15
+ %tmp17 = fmul <4 x float> %tmp15, %tmp16
+ %tmp18 = fmul <4 x float> %tmp16, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
+ %tmp19 = fadd <4 x float> %tmp18, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
+ %tmp20 = fmul <4 x float> %tmp16, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
+ %tmp21 = fadd <4 x float> %tmp20, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
+ %tmp22 = fmul <4 x float> %tmp16, %tmp19
+ %tmp23 = fadd <4 x float> %tmp22, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
+ %tmp24 = fmul <4 x float> %tmp16, %tmp21
+ %tmp25 = fadd <4 x float> %tmp24, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
+ %tmp26 = fmul <4 x float> %tmp16, %tmp23
+ %tmp27 = fadd <4 x float> %tmp26, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+ %tmp28 = fmul <4 x float> %tmp17, %tmp25
+ %tmp29 = fadd <4 x float> %tmp15, %tmp28
+ %tmp30 = and <2 x i64> %tmp10, <i64 4294967297, i64 4294967297>
+ %tmp31 = bitcast <2 x i64> %tmp30 to <4 x i32>
+ %tmp32 = icmp eq <4 x i32> %tmp31, zeroinitializer
+ %tmp33 = sext <4 x i1> %tmp32 to <4 x i32>
+ %tmp34 = bitcast <4 x i32> %tmp33 to <4 x float>
+ %tmp35 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp27, <4 x float> %tmp29, <4 x float> %tmp34) #2
+ %tmp36 = and <2 x i64> %tmp10, <i64 8589934594, i64 8589934594>
+ %tmp37 = bitcast <2 x i64> %tmp36 to <4 x i32>
+ %tmp38 = icmp eq <4 x i32> %tmp37, zeroinitializer
+ %tmp39 = sext <4 x i1> %tmp38 to <4 x i32>
+ %tmp40 = bitcast <4 x float> %tmp35 to <4 x i32>
+ %tmp41 = xor <4 x i32> %tmp40, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %tmp42 = bitcast <4 x i32> %tmp41 to <4 x float>
+ %tmp43 = bitcast <4 x i32> %tmp39 to <4 x float>
+ %tmp44 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp42, <4 x float> %tmp35, <4 x float> %tmp43) #2
+ %tmp45 = bitcast <2 x double> %arg1 to <4 x float>
+ %tmp46 = fmul <4 x float> %tmp45, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
+ %tmp47 = bitcast <2 x double> %arg1 to <4 x i32>
+ %tmp48 = and <4 x i32> %tmp47, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %tmp49 = or <4 x i32> %tmp48, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
+ %tmp50 = bitcast <4 x i32> %tmp49 to <4 x float>
+ %tmp51 = fadd <4 x float> %tmp46, %tmp50
+ %tmp52 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp51) #2
+ %tmp53 = bitcast <4 x i32> %tmp52 to <2 x i64>
+ %tmp54 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp52) #2
+ %tmp55 = fmul <4 x float> %tmp54, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
+ %tmp56 = fsub <4 x float> %tmp45, %tmp55
+ %tmp57 = fmul <4 x float> %tmp54, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
+ %tmp58 = fsub <4 x float> %tmp56, %tmp57
+ %tmp59 = fmul <4 x float> %tmp58, %tmp58
+ %tmp60 = fmul <4 x float> %tmp58, %tmp59
+ %tmp61 = fmul <4 x float> %tmp59, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
+ %tmp62 = fadd <4 x float> %tmp61, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
+ %tmp63 = fmul <4 x float> %tmp59, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
+ %tmp64 = fadd <4 x float> %tmp63, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
+ %tmp65 = fmul <4 x float> %tmp59, %tmp62
+ %tmp66 = fadd <4 x float> %tmp65, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
+ %tmp67 = fmul <4 x float> %tmp59, %tmp64
+ %tmp68 = fadd <4 x float> %tmp67, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
+ %tmp69 = fmul <4 x float> %tmp59, %tmp66
+ %tmp70 = fadd <4 x float> %tmp69, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+ %tmp71 = fmul <4 x float> %tmp60, %tmp68
+ %tmp72 = fadd <4 x float> %tmp58, %tmp71
+ %tmp73 = and <2 x i64> %tmp53, <i64 4294967297, i64 4294967297>
+ %tmp74 = bitcast <2 x i64> %tmp73 to <4 x i32>
+ %tmp75 = icmp eq <4 x i32> %tmp74, zeroinitializer
+ %tmp76 = sext <4 x i1> %tmp75 to <4 x i32>
+ %tmp77 = bitcast <4 x i32> %tmp76 to <4 x float>
+ %tmp78 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp70, <4 x float> %tmp72, <4 x float> %tmp77) #2
+ %tmp79 = and <2 x i64> %tmp53, <i64 8589934594, i64 8589934594>
+ %tmp80 = bitcast <2 x i64> %tmp79 to <4 x i32>
+ %tmp81 = icmp eq <4 x i32> %tmp80, zeroinitializer
+ %tmp82 = sext <4 x i1> %tmp81 to <4 x i32>
+ %tmp83 = bitcast <4 x float> %tmp78 to <4 x i32>
+ %tmp84 = xor <4 x i32> %tmp83, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %tmp85 = bitcast <4 x i32> %tmp84 to <4 x float>
+ %tmp86 = bitcast <4 x i32> %tmp82 to <4 x float>
+ %tmp87 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp85, <4 x float> %tmp78, <4 x float> %tmp86) #2
+ %tmp88 = fadd <4 x float> %tmp44, %tmp87
+ %tmp89 = bitcast <2 x double> %arg2 to <4 x float>
+ %tmp90 = fmul <4 x float> %tmp89, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
+ %tmp91 = bitcast <2 x double> %arg2 to <4 x i32>
+ %tmp92 = and <4 x i32> %tmp91, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %tmp93 = or <4 x i32> %tmp92, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
+ %tmp94 = bitcast <4 x i32> %tmp93 to <4 x float>
+ %tmp95 = fadd <4 x float> %tmp90, %tmp94
+ %tmp96 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp95) #2
+ %tmp97 = bitcast <4 x i32> %tmp96 to <2 x i64>
+ %tmp98 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp96) #2
+ %tmp99 = fmul <4 x float> %tmp98, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
+ %tmp100 = fsub <4 x float> %tmp89, %tmp99
+ %tmp101 = fmul <4 x float> %tmp98, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
+ %tmp102 = fsub <4 x float> %tmp100, %tmp101
+ %tmp103 = fmul <4 x float> %tmp102, %tmp102
+ %tmp104 = fmul <4 x float> %tmp102, %tmp103
+ %tmp105 = fmul <4 x float> %tmp103, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
+ %tmp106 = fadd <4 x float> %tmp105, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
+ %tmp107 = fmul <4 x float> %tmp103, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
+ %tmp108 = fadd <4 x float> %tmp107, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
+ %tmp109 = fmul <4 x float> %tmp103, %tmp106
+ %tmp110 = fadd <4 x float> %tmp109, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
+ %tmp111 = fmul <4 x float> %tmp103, %tmp108
+ %tmp112 = fadd <4 x float> %tmp111, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
+ %tmp113 = fmul <4 x float> %tmp103, %tmp110
+ %tmp114 = fadd <4 x float> %tmp113, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+ %tmp115 = fmul <4 x float> %tmp104, %tmp112
+ %tmp116 = fadd <4 x float> %tmp102, %tmp115
+ %tmp117 = and <2 x i64> %tmp97, <i64 4294967297, i64 4294967297>
+ %tmp118 = bitcast <2 x i64> %tmp117 to <4 x i32>
+ %tmp119 = icmp eq <4 x i32> %tmp118, zeroinitializer
+ %tmp120 = sext <4 x i1> %tmp119 to <4 x i32>
+ %tmp121 = bitcast <4 x i32> %tmp120 to <4 x float>
+ %tmp122 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp114, <4 x float> %tmp116, <4 x float> %tmp121) #2
+ %tmp123 = and <2 x i64> %tmp97, <i64 8589934594, i64 8589934594>
+ %tmp124 = bitcast <2 x i64> %tmp123 to <4 x i32>
+ %tmp125 = icmp eq <4 x i32> %tmp124, zeroinitializer
+ %tmp126 = sext <4 x i1> %tmp125 to <4 x i32>
+ %tmp127 = bitcast <4 x float> %tmp122 to <4 x i32>
+ %tmp128 = xor <4 x i32> %tmp127, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %tmp129 = bitcast <4 x i32> %tmp128 to <4 x float>
+ %tmp130 = bitcast <4 x i32> %tmp126 to <4 x float>
+ %tmp131 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp129, <4 x float> %tmp122, <4 x float> %tmp130) #2
+ %tmp132 = fadd <4 x float> %tmp88, %tmp131
+ %tmp133 = bitcast <4 x float> %tmp132 to <2 x double>
+ ret <2 x double> %tmp133
+}
+
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll
index 8665edf8f1d5..c4da546ed77e 100644
--- a/test/CodeGen/X86/sse3-avx-addsub.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
; Test ADDSUB ISel patterns.
@@ -101,6 +102,62 @@ define <2 x double> @test4(<2 x double> %A, <2 x double> %B) #0 {
ret <2 x double> %vecinit2
}
+define <16 x float> @test5(<16 x float> %A, <16 x float> %B) {
+; SSE-LABEL: test5:
+; SSE: # BB#0:
+; SSE-NEXT: addsubps %xmm4, %xmm0
+; SSE-NEXT: addsubps %xmm5, %xmm1
+; SSE-NEXT: addsubps %xmm6, %xmm2
+; SSE-NEXT: addsubps %xmm7, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test5:
+; AVX1: # BB#0:
+; AVX1-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vaddsubps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test5:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31]
+; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %add = fadd <16 x float> %A, %B
+ %sub = fsub <16 x float> %A, %B
+ %vecinit2 = shufflevector <16 x float> %sub, <16 x float> %add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x float> %vecinit2
+}
+
+define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {
+; SSE-LABEL: test6:
+; SSE: # BB#0:
+; SSE-NEXT: addsubpd %xmm4, %xmm0
+; SSE-NEXT: addsubpd %xmm5, %xmm1
+; SSE-NEXT: addsubpd %xmm6, %xmm2
+; SSE-NEXT: addsubpd %xmm7, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test6:
+; AVX1: # BB#0:
+; AVX1-NEXT: vaddsubpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vaddsubpd %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test6:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vsubpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,9,2,11,4,13,6,15]
+; AVX512-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %add = fadd <8 x double> %A, %B
+ %sub = fsub <8 x double> %A, %B
+ %vecinit2 = shufflevector <8 x double> %sub, <8 x double> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x double> %vecinit2
+}
+
define <4 x float> @test1b(<4 x float> %A, <4 x float>* %B) {
; SSE-LABEL: test1b:
; SSE: # BB#0:
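The ADDSUB pattern these tests exercise is an fadd and an fsub of the same operands whose lanes are then interleaved, even lanes taken from the subtraction and odd lanes from the addition. A minimal 4-wide sketch of the matched IR, illustrative only and not one of the tests:

define <4 x float> @addsub_sketch(<4 x float> %A, <4 x float> %B) {
  %add = fadd <4 x float> %A, %B
  %sub = fsub <4 x float> %A, %B
  ; even lanes from %sub, odd lanes from %add: selects addsubps
  %res = shufflevector <4 x float> %sub, <4 x float> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %res
}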
diff --git a/test/CodeGen/X86/statepoint-far-call.ll b/test/CodeGen/X86/statepoint-far-call.ll
index 2ebf38c5c019..dc49061f6461 100644
--- a/test/CodeGen/X86/statepoint-far-call.ll
+++ b/test/CodeGen/X86/statepoint-far-call.ll
@@ -1,22 +1,22 @@
-; RUN: llc < %s | FileCheck %s
-; Test to check that Statepoints with X64 far-immediate targets
-; are lowered correctly to an indirect call via a scratch register.
-
-target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-win64"
-
-define void @test_far_call() gc "statepoint-example" {
-; CHECK-LABEL: test_far_call
-; CHECK: pushq %rax
-; CHECK: movabsq $140727162896504, %rax
-; CHECK: callq *%rax
-; CHECK: popq %rax
-; CHECK: retq
-
-entry:
- %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* inttoptr (i64 140727162896504 to void ()*), i32 0, i32 0, i32 0, i32 0)
- ret void
-}
-
-declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
-
+; RUN: llc < %s | FileCheck %s
+; Test to check that Statepoints with X64 far-immediate targets
+; are lowered correctly to an indirect call via a scratch register.
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-win64"
+
+define void @test_far_call() gc "statepoint-example" {
+; CHECK-LABEL: test_far_call
+; CHECK: pushq %rax
+; CHECK: movabsq $140727162896504, %rax
+; CHECK: callq *%rax
+; CHECK: popq %rax
+; CHECK: retq
+
+entry:
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* inttoptr (i64 140727162896504 to void ()*), i32 0, i32 0, i32 0, i32 0)
+ ret void
+}
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+
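The indirect call is an encoding requirement: a direct x86-64 call takes only a 32-bit PC-relative displacement, so an absolute target such as 140727162896504, far outside the rel32 range, must be materialized with movabsq and called through a scratch register. The same lowering applies to a plain far call outside the statepoint machinery, as in this illustrative sketch:

define void @far_call_sketch() {
  ; absolute target beyond rel32 reach: movabsq then callq *%rax
  call void inttoptr (i64 140727162896504 to void ()*) ()
  ret void
}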
diff --git a/test/CodeGen/X86/system-intrinsics-64-xsave.ll b/test/CodeGen/X86/system-intrinsics-64-xsave.ll
index feec9516220b..62cd625e4f23 100644
--- a/test/CodeGen/X86/system-intrinsics-64-xsave.ll
+++ b/test/CodeGen/X86/system-intrinsics-64-xsave.ll
@@ -1,41 +1,41 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xsave | FileCheck %s
-
-define void @test_xsave(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsave
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xsave (%rdi)
- call void @llvm.x86.xsave(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsave(i8*, i32, i32)
-
-define void @test_xsave64(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsave64
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xsave64 (%rdi)
- call void @llvm.x86.xsave64(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsave64(i8*, i32, i32)
-
-define void @test_xrstor(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xrstor
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xrstor (%rdi)
- call void @llvm.x86.xrstor(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xrstor(i8*, i32, i32)
-
-define void @test_xrstor64(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xrstor64
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xrstor64 (%rdi)
- call void @llvm.x86.xrstor64(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xrstor64(i8*, i32, i32)
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xsave | FileCheck %s
+
+define void @test_xsave(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsave
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xsave (%rdi)
+ call void @llvm.x86.xsave(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsave(i8*, i32, i32)
+
+define void @test_xsave64(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsave64
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xsave64 (%rdi)
+ call void @llvm.x86.xsave64(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsave64(i8*, i32, i32)
+
+define void @test_xrstor(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xrstor
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xrstor (%rdi)
+ call void @llvm.x86.xrstor(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xrstor(i8*, i32, i32)
+
+define void @test_xrstor64(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xrstor64
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xrstor64 (%rdi)
+ call void @llvm.x86.xrstor64(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xrstor64(i8*, i32, i32)
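The register moves the CHECK lines expect follow the XSAVE ISA, which takes the requested-feature bitmap in EDX:EAX; the intrinsic therefore receives the high and low 32-bit halves separately. A sketch of a caller splitting a 64-bit mask, where the wrapper function is a hypothetical example rather than part of the tests:

define void @xsave_with_mask(i8* %buf, i64 %mask) {
  %hi64 = lshr i64 %mask, 32
  %hi = trunc i64 %hi64 to i32   ; bits 63:32 of the mask, ends up in %edx
  %lo = trunc i64 %mask to i32   ; bits 31:0 of the mask, ends up in %eax
  call void @llvm.x86.xsave(i8* %buf, i32 %hi, i32 %lo)
  ret void
}
declare void @llvm.x86.xsave(i8*, i32, i32)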
diff --git a/test/CodeGen/X86/system-intrinsics-64-xsavec.ll b/test/CodeGen/X86/system-intrinsics-64-xsavec.ll
index 068034886515..c1c5cbd0471e 100644
--- a/test/CodeGen/X86/system-intrinsics-64-xsavec.ll
+++ b/test/CodeGen/X86/system-intrinsics-64-xsavec.ll
@@ -1,21 +1,21 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xsave,+xsavec | FileCheck %s
-
-define void @test_xsavec(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsavec
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xsavec (%rdi)
- call void @llvm.x86.xsavec(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsavec(i8*, i32, i32)
-
-define void @test_xsavec64(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsavec64
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xsavec64 (%rdi)
- call void @llvm.x86.xsavec64(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsavec64(i8*, i32, i32)
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xsave,+xsavec | FileCheck %s
+
+define void @test_xsavec(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsavec
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xsavec (%rdi)
+ call void @llvm.x86.xsavec(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsavec(i8*, i32, i32)
+
+define void @test_xsavec64(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsavec64
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xsavec64 (%rdi)
+ call void @llvm.x86.xsavec64(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsavec64(i8*, i32, i32)
diff --git a/test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll b/test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll
index ee0a5360da8e..49603d4e2160 100644
--- a/test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll
+++ b/test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll
@@ -1,21 +1,21 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xsaveopt | FileCheck %s
-
-define void @test_xsaveopt(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsaveopt
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xsaveopt (%rdi)
- call void @llvm.x86.xsaveopt(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsaveopt(i8*, i32, i32)
-
-define void @test_xsaveopt64(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsaveopt64
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xsaveopt64 (%rdi)
- call void @llvm.x86.xsaveopt64(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsaveopt64(i8*, i32, i32)
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xsaveopt | FileCheck %s
+
+define void @test_xsaveopt(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsaveopt
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xsaveopt (%rdi)
+ call void @llvm.x86.xsaveopt(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsaveopt(i8*, i32, i32)
+
+define void @test_xsaveopt64(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsaveopt64
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xsaveopt64 (%rdi)
+ call void @llvm.x86.xsaveopt64(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsaveopt64(i8*, i32, i32)
diff --git a/test/CodeGen/X86/system-intrinsics-64-xsaves.ll b/test/CodeGen/X86/system-intrinsics-64-xsaves.ll
index 5c1c5be4e7e2..08d90f5a5a89 100644
--- a/test/CodeGen/X86/system-intrinsics-64-xsaves.ll
+++ b/test/CodeGen/X86/system-intrinsics-64-xsaves.ll
@@ -1,41 +1,41 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xsave,+xsaves | FileCheck %s
-
-define void @test_xsaves(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsaves
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xsaves (%rdi)
- call void @llvm.x86.xsaves(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsaves(i8*, i32, i32)
-
-define void @test_xsaves64(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsaves64
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xsaves64 (%rdi)
- call void @llvm.x86.xsaves64(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsaves64(i8*, i32, i32)
-
-define void @test_xrstors(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xrstors
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xrstors (%rdi)
- call void @llvm.x86.xrstors(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xrstors(i8*, i32, i32)
-
-define void @test_xrstors64(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xrstors64
-; CHECK: movl %edx, %eax
-; CHECK: movl %esi, %edx
-; CHECK: xrstors64 (%rdi)
- call void @llvm.x86.xrstors64(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xrstors64(i8*, i32, i32)
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xsave,+xsaves | FileCheck %s
+
+define void @test_xsaves(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsaves
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xsaves (%rdi)
+ call void @llvm.x86.xsaves(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsaves(i8*, i32, i32)
+
+define void @test_xsaves64(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsaves64
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xsaves64 (%rdi)
+ call void @llvm.x86.xsaves64(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsaves64(i8*, i32, i32)
+
+define void @test_xrstors(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xrstors
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xrstors (%rdi)
+ call void @llvm.x86.xrstors(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xrstors(i8*, i32, i32)
+
+define void @test_xrstors64(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xrstors64
+; CHECK: movl %edx, %eax
+; CHECK: movl %esi, %edx
+; CHECK: xrstors64 (%rdi)
+ call void @llvm.x86.xrstors64(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xrstors64(i8*, i32, i32)
diff --git a/test/CodeGen/X86/system-intrinsics-xsave.ll b/test/CodeGen/X86/system-intrinsics-xsave.ll
index ff9fb7e247a4..deaf1bec3a7e 100644
--- a/test/CodeGen/X86/system-intrinsics-xsave.ll
+++ b/test/CodeGen/X86/system-intrinsics-xsave.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+xsave | FileCheck %s
-
-define void @test_xsave(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsave
-; CHECK: movl 8(%esp), %edx
-; CHECK: movl 12(%esp), %eax
-; CHECK: movl 4(%esp), %ecx
-; CHECK: xsave (%ecx)
- call void @llvm.x86.xsave(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsave(i8*, i32, i32)
-
-define void @test_xrstor(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xrstor
-; CHECK: movl 8(%esp), %edx
-; CHECK: movl 12(%esp), %eax
-; CHECK: movl 4(%esp), %ecx
-; CHECK: xrstor (%ecx)
- call void @llvm.x86.xrstor(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xrstor(i8*, i32, i32)
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+xsave | FileCheck %s
+
+define void @test_xsave(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsave
+; CHECK: movl 8(%esp), %edx
+; CHECK: movl 12(%esp), %eax
+; CHECK: movl 4(%esp), %ecx
+; CHECK: xsave (%ecx)
+ call void @llvm.x86.xsave(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsave(i8*, i32, i32)
+
+define void @test_xrstor(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xrstor
+; CHECK: movl 8(%esp), %edx
+; CHECK: movl 12(%esp), %eax
+; CHECK: movl 4(%esp), %ecx
+; CHECK: xrstor (%ecx)
+ call void @llvm.x86.xrstor(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xrstor(i8*, i32, i32)
diff --git a/test/CodeGen/X86/system-intrinsics-xsavec.ll b/test/CodeGen/X86/system-intrinsics-xsavec.ll
index 4a55ea9531b1..a4576078f84a 100644
--- a/test/CodeGen/X86/system-intrinsics-xsavec.ll
+++ b/test/CodeGen/X86/system-intrinsics-xsavec.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+xsave,+xsavec | FileCheck %s
-
-define void @test_xsavec(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsavec
-; CHECK: movl 8(%esp), %edx
-; CHECK: movl 12(%esp), %eax
-; CHECK: movl 4(%esp), %ecx
-; CHECK: xsavec (%ecx)
- call void @llvm.x86.xsavec(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsavec(i8*, i32, i32)
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+xsave,+xsavec | FileCheck %s
+
+define void @test_xsavec(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsavec
+; CHECK: movl 8(%esp), %edx
+; CHECK: movl 12(%esp), %eax
+; CHECK: movl 4(%esp), %ecx
+; CHECK: xsavec (%ecx)
+ call void @llvm.x86.xsavec(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsavec(i8*, i32, i32)
diff --git a/test/CodeGen/X86/system-intrinsics-xsaveopt.ll b/test/CodeGen/X86/system-intrinsics-xsaveopt.ll
index f9bd7acd5a7c..4bef3fd40ab7 100644
--- a/test/CodeGen/X86/system-intrinsics-xsaveopt.ll
+++ b/test/CodeGen/X86/system-intrinsics-xsaveopt.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+xsave,+xsaveopt | FileCheck %s
-
-define void @test_xsaveopt(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsaveopt
-; CHECK: movl 8(%esp), %edx
-; CHECK: movl 12(%esp), %eax
-; CHECK: movl 4(%esp), %ecx
-; CHECK: xsaveopt (%ecx)
- call void @llvm.x86.xsaveopt(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsaveopt(i8*, i32, i32)
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+xsave,+xsaveopt | FileCheck %s
+
+define void @test_xsaveopt(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsaveopt
+; CHECK: movl 8(%esp), %edx
+; CHECK: movl 12(%esp), %eax
+; CHECK: movl 4(%esp), %ecx
+; CHECK: xsaveopt (%ecx)
+ call void @llvm.x86.xsaveopt(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsaveopt(i8*, i32, i32)
diff --git a/test/CodeGen/X86/system-intrinsics-xsaves.ll b/test/CodeGen/X86/system-intrinsics-xsaves.ll
index ca1c5c1a9ed0..840bbbced2cd 100644
--- a/test/CodeGen/X86/system-intrinsics-xsaves.ll
+++ b/test/CodeGen/X86/system-intrinsics-xsaves.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+xsave,+xsaves | FileCheck %s
-
-define void @test_xsaves(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xsaves
-; CHECK: movl 8(%esp), %edx
-; CHECK: movl 12(%esp), %eax
-; CHECK: movl 4(%esp), %ecx
-; CHECK: xsaves (%ecx)
- call void @llvm.x86.xsaves(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xsaves(i8*, i32, i32)
-
-define void @test_xrstors(i8* %ptr, i32 %hi, i32 %lo) {
-; CHECK-LABEL: test_xrstors
-; CHECK: movl 8(%esp), %edx
-; CHECK: movl 12(%esp), %eax
-; CHECK: movl 4(%esp), %ecx
-; CHECK: xrstors (%ecx)
- call void @llvm.x86.xrstors(i8* %ptr, i32 %hi, i32 %lo)
- ret void;
-}
-declare void @llvm.x86.xrstors(i8*, i32, i32)
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+xsave,+xsaves | FileCheck %s
+
+define void @test_xsaves(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xsaves
+; CHECK: movl 8(%esp), %edx
+; CHECK: movl 12(%esp), %eax
+; CHECK: movl 4(%esp), %ecx
+; CHECK: xsaves (%ecx)
+ call void @llvm.x86.xsaves(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xsaves(i8*, i32, i32)
+
+define void @test_xrstors(i8* %ptr, i32 %hi, i32 %lo) {
+; CHECK-LABEL: test_xrstors
+; CHECK: movl 8(%esp), %edx
+; CHECK: movl 12(%esp), %eax
+; CHECK: movl 4(%esp), %ecx
+; CHECK: xrstors (%ecx)
+ call void @llvm.x86.xrstors(i8* %ptr, i32 %hi, i32 %lo)
+ ret void;
+}
+declare void @llvm.x86.xrstors(i8*, i32, i32)
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 6d4f8287cab6..4f72c66ecba2 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -1,15 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86 -mattr=+mmx,+sse4.2 -mtriple=i686-apple-darwin9 | FileCheck %s
+
; MMX insertelement is not available; these are promoted to XMM.
; (Without SSE they are split into two ints, and the code is much better.)
-define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
-entry:
-; CHECK: mmx_movzl
-; CHECK: pinsrd
-; CHECK: pinsrd
- %tmp = bitcast x86_mmx %x to <2 x i32>
- %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 ; <<2 x i32>> [#uses=1]
- %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 ; <<2 x i32>> [#uses=1]
- %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
- ret x86_mmx %tmp9
+define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
+; CHECK-LABEL: mmx_movzl:
+; CHECK: ## BB#0:
+; CHECK-NEXT: subl $20, %esp
+; CHECK-NEXT: movq %mm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: movl $32, %eax
+; CHECK-NEXT: pinsrd $0, %eax, %xmm0
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pinsrd $1, %eax, %xmm0
+; CHECK-NEXT: pinsrd $2, %eax, %xmm0
+; CHECK-NEXT: pinsrd $3, %eax, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%esp)
+; CHECK-NEXT: movq (%esp), %mm0
+; CHECK-NEXT: addl $20, %esp
+; CHECK-NEXT: retl
+ %tmp = bitcast x86_mmx %x to <2 x i32>
+ %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 ; <<2 x i32>> [#uses=1]
+ %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 ; <<2 x i32>> [#uses=1]
+ %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
+ ret x86_mmx %tmp9
}
diff --git a/test/CodeGen/X86/vec_partial.ll b/test/CodeGen/X86/vec_partial.ll
index 709f326e5027..469667a28a76 100644
--- a/test/CodeGen/X86/vec_partial.ll
+++ b/test/CodeGen/X86/vec_partial.ll
@@ -1,32 +1,32 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-
-; PR11580
-define <3 x float> @addf3(<3 x float> %x) {
-; CHECK-LABEL: addf3
-; CHECK: # BB#0:
-; CHECK-NEXT: addps .LCPI0_0(%rip), %xmm0
-; CHECK-NEXT: retq
-entry:
- %add = fadd <3 x float> %x, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- ret <3 x float> %add
-}
-
-; PR11580
-define <4 x float> @cvtf3_f4(<3 x float> %x) {
-; CHECK-LABEL: cvtf3_f4
-; CHECK: # BB#0:
-; CHECK-NEXT: retq
-entry:
- %extractVec = shufflevector <3 x float> %x, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
- ret <4 x float> %extractVec
-}
-
-; PR11580
-define <3 x float> @cvtf4_f3(<4 x float> %x) {
-; CHECK-LABEL: cvtf4_f3
-; CHECK: # BB#0:
-; CHECK-NEXT: retq
-entry:
- %extractVec = shufflevector <4 x float> %x, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %extractVec
-}
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; PR11580
+define <3 x float> @addf3(<3 x float> %x) {
+; CHECK-LABEL: addf3
+; CHECK: # BB#0:
+; CHECK-NEXT: addps .LCPI0_0(%rip), %xmm0
+; CHECK-NEXT: retq
+entry:
+ %add = fadd <3 x float> %x, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+ ret <3 x float> %add
+}
+
+; PR11580
+define <4 x float> @cvtf3_f4(<3 x float> %x) {
+; CHECK-LABEL: cvtf3_f4
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+entry:
+ %extractVec = shufflevector <3 x float> %x, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+ ret <4 x float> %extractVec
+}
+
+; PR11580
+define <3 x float> @cvtf4_f3(<4 x float> %x) {
+; CHECK-LABEL: cvtf4_f3
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+entry:
+ %extractVec = shufflevector <4 x float> %x, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ ret <3 x float> %extractVec
+}
diff --git a/test/CodeGen/X86/vec_reassociate.ll b/test/CodeGen/X86/vec_reassociate.ll
index bf2053f78424..0d3373528f58 100644
--- a/test/CodeGen/X86/vec_reassociate.ll
+++ b/test/CodeGen/X86/vec_reassociate.ll
@@ -1,119 +1,119 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s
-
-define <4 x i32> @add_4i32(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @add_4i32
- ;CHECK: # BB#0:
- ;CHECK-NEXT: paddd %xmm1, %xmm0
- ;CHECK-NEXT: retq
- %1 = add <4 x i32> %a0, <i32 1, i32 -2, i32 3, i32 -4>
- %2 = add <4 x i32> %a1, <i32 -1, i32 2, i32 -3, i32 4>
- %3 = add <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @add_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @add_4i32_commute
- ;CHECK: # BB#0:
- ;CHECK-NEXT: paddd %xmm1, %xmm0
- ;CHECK-NEXT: retq
- %1 = add <4 x i32> <i32 1, i32 -2, i32 3, i32 -4>, %a0
- %2 = add <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, %a1
- %3 = add <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @mul_4i32(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @mul_4i32
- ;CHECK: # BB#0:
- ;CHECK-NEXT: pmulld %xmm1, %xmm0
- ;CHECK-NEXT: pmulld .LCPI2_0(%rip), %xmm0
- ;CHECK-NEXT: retq
- %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
- %2 = mul <4 x i32> %a1, <i32 4, i32 3, i32 2, i32 1>
- %3 = mul <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @mul_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @mul_4i32_commute
- ;CHECK: # BB#0:
- ;CHECK-NEXT: pmulld %xmm1, %xmm0
- ;CHECK-NEXT: pmulld .LCPI3_0(%rip), %xmm0
- ;CHECK-NEXT: retq
- %1 = mul <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %a0
- %2 = mul <4 x i32> <i32 4, i32 3, i32 2, i32 1>, %a1
- %3 = mul <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @and_4i32(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @and_4i32
- ;CHECK: # BB#0:
- ;CHECK-NEXT: andps %xmm1, %xmm0
- ;CHECK-NEXT: andps .LCPI4_0(%rip), %xmm0
- ;CHECK-NEXT: retq
- %1 = and <4 x i32> %a0, <i32 -2, i32 -2, i32 3, i32 3>
- %2 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 1, i32 1>
- %3 = and <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @and_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @and_4i32_commute
- ;CHECK: # BB#0:
- ;CHECK-NEXT: andps %xmm1, %xmm0
- ;CHECK-NEXT: andps .LCPI5_0(%rip), %xmm0
- ;CHECK-NEXT: retq
- %1 = and <4 x i32> <i32 -2, i32 -2, i32 3, i32 3>, %a0
- %2 = and <4 x i32> <i32 -1, i32 -1, i32 1, i32 1>, %a1
- %3 = and <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @or_4i32(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @or_4i32
- ;CHECK: # BB#0:
- ;CHECK-NEXT: orps %xmm1, %xmm0
- ;CHECK-NEXT: orps .LCPI6_0(%rip), %xmm0
- ;CHECK-NEXT: retq
- %1 = or <4 x i32> %a0, <i32 -2, i32 -2, i32 3, i32 3>
- %2 = or <4 x i32> %a1, <i32 -1, i32 -1, i32 1, i32 1>
- %3 = or <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @or_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @or_4i32_commute
- ;CHECK: # BB#0:
- ;CHECK-NEXT: orps %xmm1, %xmm0
- ;CHECK-NEXT: orps .LCPI7_0(%rip), %xmm0
- ;CHECK-NEXT: retq
- %1 = or <4 x i32> <i32 -2, i32 -2, i32 3, i32 3>, %a0
- %2 = or <4 x i32> <i32 -1, i32 -1, i32 1, i32 1>, %a1
- %3 = or <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @xor_4i32(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @xor_4i32
- ;CHECK: # BB#0:
- ;CHECK-NEXT: xorps %xmm1, %xmm0
- ;CHECK-NEXT: xorps .LCPI8_0(%rip), %xmm0
- ;CHECK-NEXT: retq
- %1 = xor <4 x i32> %a0, <i32 -2, i32 -2, i32 3, i32 3>
- %2 = xor <4 x i32> %a1, <i32 -1, i32 -1, i32 1, i32 1>
- %3 = xor <4 x i32> %1, %2
- ret <4 x i32> %3
-}
-
-define <4 x i32> @xor_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: @xor_4i32_commute
- ;CHECK: # BB#0:
- ;CHECK-NEXT: xorps %xmm1, %xmm0
- ;CHECK-NEXT: xorps .LCPI9_0(%rip), %xmm0
- ;CHECK-NEXT: retq
- %1 = xor <4 x i32> <i32 -2, i32 -2, i32 3, i32 3>, %a0
- %2 = xor <4 x i32> <i32 -1, i32 -1, i32 1, i32 1>, %a1
- %3 = xor <4 x i32> %1, %2
- ret <4 x i32> %3
-}
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s
+
+define <4 x i32> @add_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @add_4i32
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: paddd %xmm1, %xmm0
+ ;CHECK-NEXT: retq
+ %1 = add <4 x i32> %a0, <i32 1, i32 -2, i32 3, i32 -4>
+ %2 = add <4 x i32> %a1, <i32 -1, i32 2, i32 -3, i32 4>
+ %3 = add <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @add_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @add_4i32_commute
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: paddd %xmm1, %xmm0
+ ;CHECK-NEXT: retq
+ %1 = add <4 x i32> <i32 1, i32 -2, i32 3, i32 -4>, %a0
+ %2 = add <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, %a1
+ %3 = add <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @mul_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @mul_4i32
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: pmulld %xmm1, %xmm0
+ ;CHECK-NEXT: pmulld .LCPI2_0(%rip), %xmm0
+ ;CHECK-NEXT: retq
+ %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+ %2 = mul <4 x i32> %a1, <i32 4, i32 3, i32 2, i32 1>
+ %3 = mul <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @mul_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @mul_4i32_commute
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: pmulld %xmm1, %xmm0
+ ;CHECK-NEXT: pmulld .LCPI3_0(%rip), %xmm0
+ ;CHECK-NEXT: retq
+ %1 = mul <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %a0
+ %2 = mul <4 x i32> <i32 4, i32 3, i32 2, i32 1>, %a1
+ %3 = mul <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @and_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @and_4i32
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: andps %xmm1, %xmm0
+ ;CHECK-NEXT: andps .LCPI4_0(%rip), %xmm0
+ ;CHECK-NEXT: retq
+ %1 = and <4 x i32> %a0, <i32 -2, i32 -2, i32 3, i32 3>
+ %2 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 1, i32 1>
+ %3 = and <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @and_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @and_4i32_commute
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: andps %xmm1, %xmm0
+ ;CHECK-NEXT: andps .LCPI5_0(%rip), %xmm0
+ ;CHECK-NEXT: retq
+ %1 = and <4 x i32> <i32 -2, i32 -2, i32 3, i32 3>, %a0
+ %2 = and <4 x i32> <i32 -1, i32 -1, i32 1, i32 1>, %a1
+ %3 = and <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @or_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @or_4i32
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: orps %xmm1, %xmm0
+ ;CHECK-NEXT: orps .LCPI6_0(%rip), %xmm0
+ ;CHECK-NEXT: retq
+ %1 = or <4 x i32> %a0, <i32 -2, i32 -2, i32 3, i32 3>
+ %2 = or <4 x i32> %a1, <i32 -1, i32 -1, i32 1, i32 1>
+ %3 = or <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @or_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @or_4i32_commute
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: orps %xmm1, %xmm0
+ ;CHECK-NEXT: orps .LCPI7_0(%rip), %xmm0
+ ;CHECK-NEXT: retq
+ %1 = or <4 x i32> <i32 -2, i32 -2, i32 3, i32 3>, %a0
+ %2 = or <4 x i32> <i32 -1, i32 -1, i32 1, i32 1>, %a1
+ %3 = or <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @xor_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @xor_4i32
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: xorps %xmm1, %xmm0
+ ;CHECK-NEXT: xorps .LCPI8_0(%rip), %xmm0
+ ;CHECK-NEXT: retq
+ %1 = xor <4 x i32> %a0, <i32 -2, i32 -2, i32 3, i32 3>
+ %2 = xor <4 x i32> %a1, <i32 -1, i32 -1, i32 1, i32 1>
+ %3 = xor <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
+
+define <4 x i32> @xor_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: @xor_4i32_commute
+ ;CHECK: # BB#0:
+ ;CHECK-NEXT: xorps %xmm1, %xmm0
+ ;CHECK-NEXT: xorps .LCPI9_0(%rip), %xmm0
+ ;CHECK-NEXT: retq
+ %1 = xor <4 x i32> <i32 -2, i32 -2, i32 3, i32 3>, %a0
+ %2 = xor <4 x i32> <i32 -1, i32 -1, i32 1, i32 1>, %a1
+ %3 = xor <4 x i32> %1, %2
+ ret <4 x i32> %3
+}
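All of these tests rely on the backend reassociating the two constant operands into one: (a0 op C1) op (a1 op C2) becomes (a0 op a1) op (C1 op C2). In the add tests the constants are exact negations, so the folded constant is zero and only the paddd survives; for the other ops it remains as the single .LCPI memory operand. The folded form of add_4i32, shown as an illustration rather than a test:

define <4 x i32> @add_4i32_folded(<4 x i32> %a0, <4 x i32> %a1) {
  ; <1,-2,3,-4> + <-1,2,-3,4> is all zeros, so no constant operand is left
  %1 = add <4 x i32> %a0, %a1
  ret <4 x i32> %1
}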
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 35c3b708fd08..53dbb32235ae 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1005,6 +1005,145 @@ define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_0zz4:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_0zz4:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm2, %xmm2
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE3-NEXT: movaps %xmm2, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0zz4:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_0zz4:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = zero,zero,zero,xmm1[0]
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0zz4:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,zero,xmm1[0]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>
+ %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuffle1
+}
+
+define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_0zz6:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_0zz6:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0zz6:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_0zz6:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0zz6:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
+ %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
+ ret <4 x float> %shuffle1
+}
+
+define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_0z24:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4f32_0z24:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0z24:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4f32_0z24:
+; SSE41: # BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0z24:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
+ %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ ret <4 x float> %shuffle1
+}
+
define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_4zzz:
; SSE2: # BB#0:
diff --git a/test/CodeGen/X86/win64_frame.ll b/test/CodeGen/X86/win64_frame.ll
index 27d78dbe5479..fd7156ed2feb 100644
--- a/test/CodeGen/X86/win64_frame.ll
+++ b/test/CodeGen/X86/win64_frame.ll
@@ -128,11 +128,9 @@ entry:
; CHECK: .seh_setframe 5, 0
; CHECK: .seh_endprologue
- %call = call i64 asm sideeffect "pushf\0A\09popq $0\0A", "=r,~{dirflag},~{fpsr},~{flags}"()
- ; CHECK-NEXT: #APP
+ %call = call i64 @llvm.x86.flags.read.u64()
; CHECK-NEXT: pushfq
; CHECK-NEXT: popq %rax
- ; CHECK: #NO_APP
ret i64 %call
; CHECK-NEXT: popq %rbp
@@ -187,5 +185,6 @@ define i64 @f10(i64* %foo, i64 %bar, i64 %baz) {
}
declare i8* @llvm.returnaddress(i32) nounwind readnone
+declare i64 @llvm.x86.flags.read.u64()
declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/X86/wineh-coreclr.ll b/test/CodeGen/X86/wineh-coreclr.ll
index b61876827cac..a7e40c036e73 100644
--- a/test/CodeGen/X86/wineh-coreclr.ll
+++ b/test/CodeGen/X86/wineh-coreclr.ll
@@ -26,34 +26,34 @@ declare i8 addrspace(1)* @llvm.eh.exceptionpointer.p1i8(token)
; }
; f(8);
; }
-
+;
; CHECK-LABEL: test1: # @test1
-; CHECK-NEXT: [[L_begin:.*func_begin.*]]:
+; CHECK-NEXT: [[test1_begin:.*func_begin.*]]:
define void @test1() personality i8* bitcast (void ()* @ProcessCLRException to i8*) {
entry:
; CHECK: # %entry
; CHECK: leaq [[FPOffset:[0-9]+]](%rsp), %rbp
; CHECK: .seh_endprologue
; CHECK: movq %rsp, [[PSPSymOffset:[0-9]+]](%rsp)
-; CHECK: [[L_before_f1:.+]]:
+; CHECK: [[test1_before_f1:.+]]:
; CHECK-NEXT: movl $1, %ecx
; CHECK-NEXT: callq f
-; CHECK-NEXT: [[L_after_f1:.+]]:
+; CHECK-NEXT: [[test1_after_f1:.+]]:
invoke void @f(i32 1)
- to label %inner_try unwind label %finally.pad
+ to label %inner_try unwind label %finally
inner_try:
; CHECK: # %inner_try
-; CHECK: [[L_before_f2:.+]]:
+; CHECK: [[test1_before_f2:.+]]:
; CHECK-NEXT: movl $2, %ecx
; CHECK-NEXT: callq f
-; CHECK-NEXT: [[L_after_f2:.+]]:
+; CHECK-NEXT: [[test1_after_f2:.+]]:
invoke void @f(i32 2)
- to label %finally.clone unwind label %catch1.pad
-catch1.pad:
- %cs1 = catchswitch within none [label %catch1.body, label %catch2.body] unwind label %finally.pad
-catch1.body:
- %catch1 = catchpad within %cs1 [i32 1]
-; CHECK: .seh_proc [[L_catch1:[^ ]+]]
+ to label %finally.clone unwind label %exn.dispatch
+exn.dispatch:
+ %catchswitch = catchswitch within none [label %catch1, label %catch2] unwind label %finally
+catch1:
+ %catch.pad1 = catchpad within %catchswitch [i32 1]
+; CHECK: .seh_proc [[test1_catch1:[^ ]+]]
; CHECK: .seh_stackalloc [[FuncletFrameSize:[0-9]+]]
; ^ all funclets use the same frame size
; CHECK: movq [[PSPSymOffset]](%rcx), %rcx
@@ -64,19 +64,19 @@ catch1.body:
; CHECK: movq %rdx, %rcx
; ^ exception pointer passed in rdx
; CHECK-NEXT: callq g
- %exn1 = call i8 addrspace(1)* @llvm.eh.exceptionpointer.p1i8(token %catch1)
- call void @g(i8 addrspace(1)* %exn1) [ "funclet"(token %catch1) ]
-; CHECK: [[L_before_f3:.+]]:
+ %exn1 = call i8 addrspace(1)* @llvm.eh.exceptionpointer.p1i8(token %catch.pad1)
+ call void @g(i8 addrspace(1)* %exn1) [ "funclet"(token %catch.pad1) ]
+; CHECK: [[test1_before_f3:.+]]:
; CHECK-NEXT: movl $3, %ecx
; CHECK-NEXT: callq f
-; CHECK-NEXT: [[L_after_f3:.+]]:
- invoke void @f(i32 3) [ "funclet"(token %catch1) ]
- to label %catch1.ret unwind label %finally.pad
+; CHECK-NEXT: [[test1_after_f3:.+]]:
+ invoke void @f(i32 3) [ "funclet"(token %catch.pad1) ]
+ to label %catch1.ret unwind label %finally
catch1.ret:
- catchret from %catch1 to label %finally.clone
-catch2.body:
- %catch2 = catchpad within %cs1 [i32 2]
-; CHECK: .seh_proc [[L_catch2:[^ ]+]]
+ catchret from %catch.pad1 to label %finally.clone
+catch2:
+ %catch.pad2 = catchpad within %catchswitch [i32 2]
+; CHECK: .seh_proc [[test1_catch2:[^ ]+]]
; CHECK: .seh_stackalloc [[FuncletFrameSize:[0-9]+]]
; ^ all funclets use the same frame size
; CHECK: movq [[PSPSymOffset]](%rcx), %rcx
@@ -87,25 +87,25 @@ catch2.body:
; CHECK: movq %rdx, %rcx
; ^ exception pointer passed in rdx
; CHECK-NEXT: callq g
- %exn2 = call i8 addrspace(1)* @llvm.eh.exceptionpointer.p1i8(token %catch2)
- call void @g(i8 addrspace(1)* %exn2) [ "funclet"(token %catch2) ]
-; CHECK: [[L_before_f4:.+]]:
+ %exn2 = call i8 addrspace(1)* @llvm.eh.exceptionpointer.p1i8(token %catch.pad2)
+ call void @g(i8 addrspace(1)* %exn2) [ "funclet"(token %catch.pad2) ]
+; CHECK: [[test1_before_f4:.+]]:
; CHECK-NEXT: movl $4, %ecx
; CHECK-NEXT: callq f
-; CHECK-NEXT: [[L_after_f4:.+]]:
- invoke void @f(i32 4) [ "funclet"(token %catch2) ]
- to label %try_in_catch unwind label %finally.pad
+; CHECK-NEXT: [[test1_after_f4:.+]]:
+ invoke void @f(i32 4) [ "funclet"(token %catch.pad2) ]
+ to label %try_in_catch unwind label %finally
try_in_catch:
; CHECK: # %try_in_catch
-; CHECK: [[L_before_f5:.+]]:
+; CHECK: [[test1_before_f5:.+]]:
; CHECK-NEXT: movl $5, %ecx
; CHECK-NEXT: callq f
-; CHECK-NEXT: [[L_after_f5:.+]]:
- invoke void @f(i32 5) [ "funclet"(token %catch2) ]
- to label %catch2.ret unwind label %fault.pad
-fault.pad:
-; CHECK: .seh_proc [[L_fault:[^ ]+]]
- %fault = cleanuppad within none [i32 undef]
+; CHECK-NEXT: [[test1_after_f5:.+]]:
+ invoke void @f(i32 5) [ "funclet"(token %catch.pad2) ]
+ to label %catch2.ret unwind label %fault
+fault:
+; CHECK: .seh_proc [[test1_fault:[^ ]+]]
+ %fault.pad = cleanuppad within %catch.pad2 [i32 undef]
; CHECK: .seh_stackalloc [[FuncletFrameSize:[0-9]+]]
; ^ all funclets use the same frame size
; CHECK: movq [[PSPSymOffset]](%rcx), %rcx
@@ -113,22 +113,22 @@ fault.pad:
; CHECK: movq %rcx, [[PSPSymOffset]](%rsp)
; CHECK: leaq [[FPOffset]](%rcx), %rbp
; CHECK: .seh_endprologue
-; CHECK: [[L_before_f6:.+]]:
+; CHECK: [[test1_before_f6:.+]]:
; CHECK-NEXT: movl $6, %ecx
; CHECK-NEXT: callq f
-; CHECK-NEXT: [[L_after_f6:.+]]:
- invoke void @f(i32 6) [ "funclet"(token %fault) ]
- to label %fault.ret unwind label %finally.pad
+; CHECK-NEXT: [[test1_after_f6:.+]]:
+ invoke void @f(i32 6) [ "funclet"(token %fault.pad) ]
+ to label %fault.ret unwind label %finally
fault.ret:
- cleanupret from %fault unwind label %finally.pad
+ cleanupret from %fault.pad unwind label %finally
catch2.ret:
- catchret from %catch2 to label %finally.clone
+ catchret from %catch.pad2 to label %finally.clone
finally.clone:
call void @f(i32 7)
br label %tail
-finally.pad:
-; CHECK: .seh_proc [[L_finally:[^ ]+]]
- %finally = cleanuppad within none []
+finally:
+; CHECK: .seh_proc [[test1_finally:[^ ]+]]
+ %finally.pad = cleanuppad within none []
; CHECK: .seh_stackalloc [[FuncletFrameSize:[0-9]+]]
; ^ all funclets use the same frame size
; CHECK: movq [[PSPSymOffset]](%rcx), %rcx
@@ -138,130 +138,555 @@ finally.pad:
; CHECK: .seh_endprologue
; CHECK-NEXT: movl $7, %ecx
; CHECK-NEXT: callq f
- call void @f(i32 7) [ "funclet"(token %finally) ]
- cleanupret from %finally unwind to caller
+ call void @f(i32 7) [ "funclet"(token %finally.pad) ]
+ cleanupret from %finally.pad unwind to caller
tail:
call void @f(i32 8)
ret void
-; CHECK: [[L_end:.*func_end.*]]:
+; CHECK: [[test1_end:.*func_end.*]]:
}
-; FIXME: Verify that the new clauses are correct and re-enable these checks.
-
; Now check for EH table in xdata (following standard xdata)
-; CHECKX-LABEL: .section .xdata
+; CHECK-LABEL: .section .xdata
; standard xdata comes here
-; CHECKX: .long 4{{$}}
+; CHECK: .long 4{{$}}
; ^ number of funclets
-; CHECKX-NEXT: .long [[L_catch1]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_catch1]]-[[test1_begin]]
; ^ offset from L_begin to start of 1st funclet
-; CHECKX-NEXT: .long [[L_catch2]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_catch2]]-[[test1_begin]]
; ^ offset from L_begin to start of 2nd funclet
-; CHECKX-NEXT: .long [[L_fault]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_fault]]-[[test1_begin]]
; ^ offset from L_begin to start of 3rd funclet
-; CHECKX-NEXT: .long [[L_finally]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
; ^ offset from L_begin to start of 4th funclet
-; CHECKX-NEXT: .long [[L_end]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_end]]-[[test1_begin]]
; ^ offset from L_begin to end of last funclet
-; CHECKX-NEXT: .long 7
+; CHECK-NEXT: .long 7
; ^ number of EH clauses
; Clause 1: call f(2) is guarded by catch1
-; CHECKX-NEXT: .long 0
+; CHECK-NEXT: .long 0
; ^ flags (0 => catch handler)
-; CHECKX-NEXT: .long ([[L_before_f2]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_before_f2]]-[[test1_begin]])+1
; ^ offset of start of clause
-; CHECKX-NEXT: .long ([[L_after_f2]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_after_f2]]-[[test1_begin]])+1
; ^ offset of end of clause
-; CHECKX-NEXT: .long [[L_catch1]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_catch1]]-[[test1_begin]]
; ^ offset of start of handler
-; CHECKX-NEXT: .long [[L_catch2]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_catch2]]-[[test1_begin]]
; ^ offset of end of handler
-; CHECKX-NEXT: .long 1
+; CHECK-NEXT: .long 1
; ^ type token of catch (from catchpad)
; Clause 2: call f(2) is also guarded by catch2
-; CHECKX-NEXT: .long 0
+; CHECK-NEXT: .long 0
; ^ flags (0 => catch handler)
-; CHECKX-NEXT: .long ([[L_before_f2]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_before_f2]]-[[test1_begin]])+1
; ^ offset of start of clause
-; CHECKX-NEXT: .long ([[L_after_f2]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_after_f2]]-[[test1_begin]])+1
; ^ offset of end of clause
-; CHECKX-NEXT: .long [[L_catch2]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_catch2]]-[[test1_begin]]
; ^ offset of start of handler
-; CHECKX-NEXT: .long [[L_fault]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_fault]]-[[test1_begin]]
; ^ offset of end of handler
-; CHECKX-NEXT: .long 2
+; CHECK-NEXT: .long 2
; ^ type token of catch (from catchpad)
; Clause 3: calls f(1) and f(2) are guarded by finally
-; CHECKX-NEXT: .long 2
+; CHECK-NEXT: .long 2
; ^ flags (2 => finally handler)
-; CHECKX-NEXT: .long ([[L_before_f1]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_before_f1]]-[[test1_begin]])+1
; ^ offset of start of clause
-; CHECKX-NEXT: .long ([[L_after_f2]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_after_f2]]-[[test1_begin]])+1
; ^ offset of end of clause
-; CHECKX-NEXT: .long [[L_finally]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
; ^ offset of start of handler
-; CHECKX-NEXT: .long [[L_end]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_end]]-[[test1_begin]]
; ^ offset of end of handler
-; CHECKX-NEXT: .long 0
+; CHECK-NEXT: .long 0
; ^ type token slot (null for finally)
; Clause 4: call f(3) is guarded by finally
; This is a "duplicate" because the protected range (f(3))
; is in funclet catch1 but the finally's immediate parent
; is the main function, not that funclet.
-; CHECKX-NEXT: .long 10
+; CHECK-NEXT: .long 10
; ^ flags (2 => finally handler | 8 => duplicate)
-; CHECKX-NEXT: .long ([[L_before_f3]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_before_f3]]-[[test1_begin]])+1
; ^ offset of start of clause
-; CHECKX-NEXT: .long ([[L_after_f3]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_after_f3]]-[[test1_begin]])+1
; ^ offset of end of clause
-; CHECKX-NEXT: .long [[L_finally]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
; ^ offset of start of handler
-; CHECKX-NEXT: .long [[L_end]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_end]]-[[test1_begin]]
; ^ offset of end of handler
-; CHECKX-NEXT: .long 0
+; CHECK-NEXT: .long 0
; ^ type token slot (null for finally)
; Clause 5: call f(5) is guarded by fault
-; CHECKX-NEXT: .long 4
+; CHECK-NEXT: .long 4
; ^ flags (4 => fault handler)
-; CHECKX-NEXT: .long ([[L_before_f5]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_before_f5]]-[[test1_begin]])+1
; ^ offset of start of clause
-; CHECKX-NEXT: .long ([[L_after_f5]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_after_f5]]-[[test1_begin]])+1
; ^ offset of end of clause
-; CHECKX-NEXT: .long [[L_fault]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_fault]]-[[test1_begin]]
; ^ offset of start of handler
-; CHECKX-NEXT: .long [[L_finally]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
; ^ offset of end of handler
-; CHECKX-NEXT: .long 0
+; CHECK-NEXT: .long 0
; ^ type token slot (null for fault)
; Clause 6: calls f(4) and f(5) are guarded by finally
; This is a "duplicate" because the protected range (f(4)-f(5))
; is in funclet catch2 but the finally's immediate parent
; is the main function, not that funclet.
-; CHECKX-NEXT: .long 10
+; CHECK-NEXT: .long 10
; ^ flags (2 => finally handler | 8 => duplicate)
-; CHECKX-NEXT: .long ([[L_before_f4]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_before_f4]]-[[test1_begin]])+1
; ^ offset of start of clause
-; CHECKX-NEXT: .long ([[L_after_f5]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_after_f5]]-[[test1_begin]])+1
; ^ offset of end of clause
-; CHECKX-NEXT: .long [[L_finally]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
; ^ offset of start of handler
-; CHECKX-NEXT: .long [[L_end]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_end]]-[[test1_begin]]
; ^ offset of end of handler
-; CHECKX-NEXT: .long 0
+; CHECK-NEXT: .long 0
; ^ type token slot (null for finally)
; Clause 7: call f(6) is guarded by finally
; This is a "duplicate" because the protected range (f(3))
; is in funclet catch1 but the finally's immediate parent
; is the main function, not that funclet.
-; CHECKX-NEXT: .long 10
+; CHECK-NEXT: .long 10
; ^ flags (2 => finally handler | 8 => duplicate)
-; CHECKX-NEXT: .long ([[L_before_f6]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_before_f6]]-[[test1_begin]])+1
; ^ offset of start of clause
-; CHECKX-NEXT: .long ([[L_after_f6]]-[[L_begin]])+1
+; CHECK-NEXT: .long ([[test1_after_f6]]-[[test1_begin]])+1
; ^ offset of end of clause
-; CHECKX-NEXT: .long [[L_finally]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
; ^ offset of start of handler
-; CHECKX-NEXT: .long [[L_end]]-[[L_begin]]
+; CHECK-NEXT: .long [[test1_end]]-[[test1_begin]]
; ^ offset of end of handler
-; CHECKX-NEXT: .long 0
+; CHECK-NEXT: .long 0
; ^ type token slot (null for finally)
+
+; Test with a cleanup that has no cleanupret, and thus needs its unwind dest
+; inferred from an inner catchswitch
+;
+; Corresponds to C# along the lines of:
+; void test2() {
+; try {
+; try {
+; f(1);
+; } fault {
+; try {
+; f(2);
+; } catch(type1) {
+; }
+; __unreachable();
+; }
+; } catch(type2) {
+; }
+; }
+;
+; CHECK-LABEL: test2: # @test2
+; CHECK-NEXT: [[test2_begin:.*func_begin.*]]:
+define void @test2() personality i8* bitcast (void ()* @ProcessCLRException to i8*) {
+entry:
+; CHECK: .seh_endprologue
+; CHECK: [[test2_before_f1:.+]]:
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: callq f
+; CHECK-NEXT: [[test2_after_f1:.+]]:
+ invoke void @f(i32 1)
+ to label %exit unwind label %fault
+fault:
+; CHECK: .seh_proc [[test2_fault:[^ ]+]]
+ %fault.pad = cleanuppad within none [i32 undef]
+; CHECK: .seh_endprologue
+; CHECK: [[test2_before_f2:.+]]:
+; CHECK-NEXT: movl $2, %ecx
+; CHECK-NEXT: callq f
+; CHECK-NEXT: [[test2_after_f2:.+]]:
+ invoke void @f(i32 2) ["funclet"(token %fault.pad)]
+ to label %unreachable unwind label %exn.dispatch.inner
+exn.dispatch.inner:
+ %catchswitch.inner = catchswitch within %fault.pad [label %catch1] unwind label %exn.dispatch.outer
+catch1:
+ %catch.pad1 = catchpad within %catchswitch.inner [i32 1]
+; CHECK: .seh_proc [[test2_catch1:[^ ]+]]
+ catchret from %catch.pad1 to label %unreachable
+exn.dispatch.outer:
+ %catchswitch.outer = catchswitch within none [label %catch2] unwind to caller
+catch2:
+ %catch.pad2 = catchpad within %catchswitch.outer [i32 2]
+; CHECK: .seh_proc [[test2_catch2:[^ ]+]]
+ catchret from %catch.pad2 to label %exit
+exit:
+ ret void
+unreachable:
+ unreachable
+; CHECK: [[test2_end:.*func_end.*]]:
+}
+
+; Now check for EH table in xdata (following standard xdata)
+; CHECK-LABEL: .section .xdata
+; standard xdata comes here
+; CHECK: .long 3{{$}}
+; ^ number of funclets
+; CHECK-NEXT: .long [[test2_fault]]-[[test2_begin]]
+; ^ offset from L_begin to start of 1st funclet
+; CHECK-NEXT: .long [[test2_catch1]]-[[test2_begin]]
+; ^ offset from L_begin to start of 2nd funclet
+; CHECK-NEXT: .long [[test2_catch2]]-[[test2_begin]]
+; ^ offset from L_begin to start of 3rd funclet
+; CHECK-NEXT: .long [[test2_end]]-[[test2_begin]]
+; ^ offset from L_begin to end of last funclet
+; CHECK-NEXT: .long 4
+; ^ number of EH clauses
+; Clause 1: call f(1) is guarded by fault
+; CHECK-NEXT: .long 4
+; ^ flags (4 => fault handler)
+; CHECK-NEXT: .long ([[test2_before_f1]]-[[test2_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test2_after_f1]]-[[test2_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test2_fault]]-[[test2_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test2_catch1]]-[[test2_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
+; Clause 2: call f(1) is also guarded by catch2
+; CHECK-NEXT: .long 0
+; ^ flags (0 => catch handler)
+; CHECK-NEXT: .long ([[test2_before_f1]]-[[test2_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test2_after_f1]]-[[test2_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test2_catch2]]-[[test2_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test2_end]]-[[test2_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 2
+; ^ type token of catch (from catchpad)
+; Clause 3: call f(2) is guarded by catch1
+; CHECK-NEXT: .long 0
+; ^ flags (0 => catch handler)
+; CHECK-NEXT: .long ([[test2_before_f2]]-[[test2_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test2_after_f2]]-[[test2_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test2_catch1]]-[[test2_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test2_catch2]]-[[test2_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 1
+; ^ type token of catch (from catchpad)
+; Clause 4: call f(2) is also guarded by catch2
+; This is a "duplicate" because the protected range (f(2))
+; is in funclet fault but catch2's immediate parent
+; is the main function, not that funclet.
+; CHECK-NEXT: .long 8
+; ^ flags (0 => catch handler | 8 => duplicate)
+; CHECK-NEXT: .long ([[test2_before_f2]]-[[test2_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test2_after_f2]]-[[test2_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test2_catch2]]-[[test2_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test2_end]]-[[test2_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 2
+; ^ type token of catch (from catchpad)
+
+; Test with several cleanups that need to infer their unwind dests from each
+; other; the inner one must make the inference from an invoke, ignoring calls
+; that don't really unwind and unwind-to-caller catchswitches, as well as some
+; internal invokes/catchswitches.
+;
+; Corresponds to something like:
+; void test3() {
+; try {
+; f(1);
+; } fault { // fault1
+; try {
+; try {
+; f(2);
+; __unreachable();
+; } fault { // fault2
+; try {
+; f(3);
+; } fault { // fault3
+; try {
+; f(4);
+; } fault { // fault4
+; f(5); // no unwind edge (e.g. front-end knew it wouldn't throw but
+; didn't bother to specify nounwind)
+; try {
+; try {
+; f(6);
+; } catch(type 1) {
+; goto __unreachable;
+; }
+; } catch (type 2) { // marked "unwinds to caller" because we allow
+; // that if the unwind won't be taken (see
+; // SimplifyUnreachable & RemoveUnwindEdge)
+; goto __unreachable;
+; }
+; f(7);
+; __unreachable();
+; }
+; }
+; }
+; } fault { // fault5
+; }
+; }
+; }
+;
+; CHECK-LABEL: test3: # @test3
+; CHECK-NEXT: [[test3_begin:.*func_begin.*]]:
+define void @test3() personality i8* bitcast (void ()* @ProcessCLRException to i8*) {
+entry:
+; CHECK: .seh_endprologue
+; CHECK: [[test3_before_f1:.+]]:
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: callq f
+; CHECK-NEXT: [[test3_after_f1:.+]]:
+ invoke void @f(i32 1)
+ to label %exit unwind label %fault1
+fault1:
+ ; check lines below since this gets reordered to end-of-func
+ %fault.pad1 = cleanuppad within none [i32 undef]
+ invoke void @f(i32 2) ["funclet"(token %fault.pad1)]
+ to label %unreachable unwind label %fault2
+fault2:
+ ; check lines below since this gets reordered to end-of-func
+ %fault.pad2 = cleanuppad within %fault.pad1 [i32 undef]
+ invoke void @f(i32 3) ["funclet"(token %fault.pad2)]
+ to label %unreachable unwind label %fault3
+fault3:
+ ; check lines below since this gets reordered to end-of-func
+ %fault.pad3 = cleanuppad within %fault.pad2 [i32 undef]
+ invoke void @f(i32 4) ["funclet"(token %fault.pad3)]
+ to label %unreachable unwind label %fault4
+fault4:
+; CHECK: .seh_proc [[test3_fault4:[^ ]+]]
+ %fault.pad4 = cleanuppad within %fault.pad3 [i32 undef]
+; CHECK: .seh_endprologue
+ call void @f(i32 5) ["funclet"(token %fault.pad4)]
+; CHECK: [[test3_before_f6:.+]]:
+; CHECK-NEXT: movl $6, %ecx
+; CHECK-NEXT: callq f
+; CHECK-NEXT: [[test3_after_f6:.+]]:
+ invoke void @f(i32 6) ["funclet"(token %fault.pad4)]
+ to label %fault4.cont unwind label %exn.dispatch1
+fault4.cont:
+; CHECK: # %fault4.cont
+; CHECK: [[test3_before_f7:.+]]:
+; CHECK-NEXT: movl $7, %ecx
+; CHECK-NEXT: callq f
+; CHECK-NEXT: [[test3_after_f7:.+]]:
+ invoke void @f(i32 7) ["funclet"(token %fault.pad4)]
+ to label %unreachable unwind label %fault5
+exn.dispatch1:
+ %catchswitch1 = catchswitch within %fault.pad4 [label %catch1] unwind label %exn.dispatch2
+catch1:
+ %catch.pad1 = catchpad within %catchswitch1 [i32 1]
+; CHECK: .seh_proc [[test3_catch1:[^ ]+]]
+ catchret from %catch.pad1 to label %unreachable
+exn.dispatch2:
+ %catchswitch2 = catchswitch within %fault.pad4 [label %catch2] unwind to caller
+catch2:
+ %catch.pad2 = catchpad within %catchswitch2 [i32 2]
+; CHECK: .seh_proc [[test3_catch2:[^ ]+]]
+ catchret from %catch.pad2 to label %unreachable
+fault5:
+; CHECK: .seh_proc [[test3_fault5:[^ ]+]]
+ %fault.pad5 = cleanuppad within %fault.pad1 [i32 undef]
+; CHECK: .seh_endprologue
+ cleanupret from %fault.pad5 unwind to caller
+exit:
+ ret void
+unreachable:
+ unreachable
+; CHECK: .seh_proc [[test3_fault3:[^ ]+]]
+; CHECK: # %fault3
+; CHECK: .seh_endprologue
+; CHECK: [[test3_before_f4:.+]]:
+; CHECK-NEXT: movl $4, %ecx
+; CHECK-NEXT: callq f
+; CHECK-NEXT: [[test3_after_f4:.+]]:
+; CHECK: .seh_proc [[test3_fault2:[^ ]+]]
+; CHECK: # %fault2
+; CHECK: .seh_endprologue
+; CHECK: [[test3_before_f3:.+]]:
+; CHECK-NEXT: movl $3, %ecx
+; CHECK-NEXT: callq f
+; CHECK-NEXT: [[test3_after_f3:.+]]:
+; CHECK: .seh_proc [[test3_fault1:[^ ]+]]
+; CHECK: # %fault1
+; CHECK: .seh_endprologue
+; CHECK: [[test3_before_f2:.+]]:
+; CHECK-NEXT: movl $2, %ecx
+; CHECK-NEXT: callq f
+; CHECK-NEXT: [[test3_after_f2:.+]]:
+; CHECK: [[test3_end:.*func_end.*]]:
+}
+
+; Now check for EH table in xdata (following standard xdata)
+; CHECK-LABEL: .section .xdata
+; standard xdata comes here
+; CHECK: .long 7{{$}}
+; ^ number of funclets
+; CHECK-NEXT: .long [[test3_fault4]]-[[test3_begin]]
+; ^ offset from L_begin to start of 1st funclet
+; CHECK-NEXT: .long [[test3_catch1]]-[[test3_begin]]
+; ^ offset from L_begin to start of 2nd funclet
+; CHECK-NEXT: .long [[test3_catch2]]-[[test3_begin]]
+; ^ offset from L_begin to start of 3rd funclet
+; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
+; ^ offset from L_begin to start of 4th funclet
+; CHECK-NEXT: .long [[test3_fault3]]-[[test3_begin]]
+; ^ offset from L_begin to start of 5th funclet
+; CHECK-NEXT: .long [[test3_fault2]]-[[test3_begin]]
+; ^ offset from L_begin to start of 6th funclet
+; CHECK-NEXT: .long [[test3_fault1]]-[[test3_begin]]
+; ^ offset from L_begin to start of 7th funclet
+; CHECK-NEXT: .long [[test3_end]]-[[test3_begin]]
+; ^ offset from L_begin to end of last funclet
+; CHECK-NEXT: .long 10
+; ^ number of EH clauses
+; Clause 1: call f(1) is guarded by fault1
+; CHECK-NEXT: .long 4
+; ^ flags (4 => fault handler)
+; CHECK-NEXT: .long ([[test3_before_f1]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f1]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_fault1]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_end]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
+; Clause 2: call f(6) is guarded by catch1
+; CHECK-NEXT: .long 0
+; ^ flags (0 => catch handler)
+; CHECK-NEXT: .long ([[test3_before_f6]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f6]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_catch1]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_catch2]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 1
+; ^ type token of catch (from catchpad)
+; Clause 3: call f(6) is also guarded by catch2
+; CHECK-NEXT: .long 0
+; ^ flags (0 => catch handler)
+; CHECK-NEXT: .long ([[test3_before_f6]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f6]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_catch2]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 2
+; ^ type token of catch (from catchpad)
+; Clause 4: call f(7) is guarded by fault5
+; This is a "duplicate" because the protected range (f(6)-f(7))
+; is in funclet fault4 but fault5's immediate parent
+; is fault1, not that funclet.
+; CHECK-NEXT: .long 12
+; ^ flags (4 => fault handler | 8 => duplicate)
+; CHECK-NEXT: .long ([[test3_before_f7]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f7]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_fault3]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
+; Clause 5: call f(4) is guarded by fault4
+; CHECK-NEXT: .long 4
+; ^ flags (4 => fault handler)
+; CHECK-NEXT: .long ([[test3_before_f4]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f4]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_fault4]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_catch1]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
+; Clause 6: call f(4) is also guarded by fault5
+; This is a "duplicate" because the protected range (f(4))
+; is in funclet fault3 but fault5's immediate parent
+; is fault1, not that funclet.
+; CHECK-NEXT: .long 12
+; ^ flags (4 => fault handler | 8 => duplicate)
+; CHECK-NEXT: .long ([[test3_before_f4]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f4]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_fault3]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
+; Clause 7: call f(3) is guarded by fault3
+; CHECK-NEXT: .long 4
+; ^ flags (4 => fault handler)
+; CHECK-NEXT: .long ([[test3_before_f3]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f3]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_fault3]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_fault2]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
+; Clause 8: call f(3) is guarded by fault5
+; This is a "duplicate" because the protected range (f(3))
+; is in funclet fault2 but fault5's immediate parent
+; is fault1, not that funclet.
+; CHECK-NEXT: .long 12
+; ^ flags (4 => fault handler | 8 => duplicate)
+; CHECK-NEXT: .long ([[test3_before_f3]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f3]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_fault3]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
+; Clause 9: call f(2) is guarded by fault2
+; CHECK-NEXT: .long 4
+; ^ flags (4 => fault handler)
+; CHECK-NEXT: .long ([[test3_before_f2]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f2]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_fault2]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_fault1]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
+; Clause 10: call f(2) is guarded by fault5
+; CHECK-NEXT: .long 4
+; ^ flags (4 => fault handler)
+; CHECK-NEXT: .long ([[test3_before_f2]]-[[test3_begin]])+1
+; ^ offset of start of clause
+; CHECK-NEXT: .long ([[test3_after_f2]]-[[test3_begin]])+1
+; ^ offset of end of clause
+; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
+; ^ offset of start of handler
+; CHECK-NEXT: .long [[test3_fault3]]-[[test3_begin]]
+; ^ offset of end of handler
+; CHECK-NEXT: .long 0
+; ^ type token slot (null for fault)
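+
+; For quick reference, the clause flag values checked throughout this file
+; combine by simple bitwise OR (a summary of the encoding these tests
+; exercise, not an authoritative definition of the CoreCLR EH table format):
+;   0 => catch handler     0 | 8 (duplicate) => 8
+;   2 => finally handler   2 | 8 (duplicate) => 10
+;   4 => fault handler     4 | 8 (duplicate) => 12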
diff --git a/test/CodeGen/X86/x86-32-intrcc.ll b/test/CodeGen/X86/x86-32-intrcc.ll
index 908da3d11206..99d0044c6de6 100644
--- a/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/test/CodeGen/X86/x86-32-intrcc.ll
@@ -1,79 +1,79 @@
-; RUN: llc -mtriple=i686-unknown-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -O0 < %s | FileCheck %s -check-prefix=CHECK0
-
-%struct.interrupt_frame = type { i32, i32, i32, i32, i32 }
-
-@llvm.used = appending global [3 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_clobbers to i8*)], section "llvm.metadata"
-
-; Spills eax, putting original esp at +4.
-; No stack adjustment if declared with no error code
-define x86_intrcc void @test_isr_no_ecode(%struct.interrupt_frame* %frame) {
- ; CHECK-LABEL: test_isr_no_ecode:
- ; CHECK: pushl %eax
- ; CHECK: movl 12(%esp), %eax
- ; CHECK: popl %eax
- ; CHECK: iretl
- ; CHECK0-LABEL: test_isr_no_ecode:
- ; CHECK0: pushl %eax
- ; CHECK0: leal 4(%esp), %eax
- ; CHECK0: movl 8(%eax), %eax
- ; CHECK0: popl %eax
- ; CHECK0: iretl
- %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
- %flags = load i32, i32* %pflags, align 4
- call void asm sideeffect "", "r"(i32 %flags)
- ret void
-}
-
-; Spills eax and ecx, putting original esp at +8. Stack is adjusted up another 4 bytes
-; before return, popping the error code.
-define x86_intrcc void @test_isr_ecode(%struct.interrupt_frame* %frame, i32 %ecode) {
- ; CHECK-LABEL: test_isr_ecode
- ; CHECK: pushl %ecx
- ; CHECK: pushl %eax
- ; CHECK: movl 8(%esp), %eax
- ; CHECK: movl 20(%esp), %ecx
- ; CHECK: popl %eax
- ; CHECK: popl %ecx
- ; CHECK: addl $4, %esp
- ; CHECK: iretl
- ; CHECK0-LABEL: test_isr_ecode
- ; CHECK0: pushl %ecx
- ; CHECK0: pushl %eax
- ; CHECK0: movl 8(%esp), %eax
- ; CHECK0: leal 12(%esp), %ecx
- ; CHECK0: movl 8(%ecx), %ecx
- ; CHECK0: popl %eax
- ; CHECK0: popl %ecx
- ; CHECK0: addl $4, %esp
- ; CHECK0: iretl
- %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
- %flags = load i32, i32* %pflags, align 4
- call x86_fastcallcc void asm sideeffect "", "r,r"(i32 %flags, i32 %ecode)
- ret void
-}
-
-; All clobbered registers must be saved
-define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i32 %ecode) {
- call void asm sideeffect "", "~{eax},~{ebx},~{ebp}"()
- ; CHECK-LABEL: test_isr_clobbers
- ; CHECK-SSE-NEXT: pushl %ebp
- ; CHECK-SSE-NEXT: pushl %ebx
- ; CHECK-SSE-NEXT; pushl %eax
- ; CHECK-SSE-NEXT: popl %eax
- ; CHECK-SSE-NEXT: popl %ebx
- ; CHECK-SSE-NEXT: popl %ebp
- ; CHECK-SSE-NEXT: addl $4, %esp
- ; CHECK-SSE-NEXT: iretl
- ; CHECK0-LABEL: test_isr_clobbers
- ; CHECK0-SSE-NEXT: pushl %ebp
- ; CHECK0-SSE-NEXT: pushl %ebx
- ; CHECK0-SSE-NEXT; pushl %eax
- ; CHECK0-SSE-NEXT: popl %eax
- ; CHECK0-SSE-NEXT: popl %ebx
- ; CHECK0-SSE-NEXT: popl %ebp
- ; CHECK0-SSE-NEXT: addl $4, %esp
- ; CHECK0-SSE-NEXT: iretl
- ret void
-}
-
+; RUN: llc -mtriple=i686-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -O0 < %s | FileCheck %s -check-prefix=CHECK0
+
+%struct.interrupt_frame = type { i32, i32, i32, i32, i32 }
+
+@llvm.used = appending global [3 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_clobbers to i8*)], section "llvm.metadata"
+
+; Spills eax, putting original esp at +4.
+; No stack adjustment if declared with no error code
+define x86_intrcc void @test_isr_no_ecode(%struct.interrupt_frame* %frame) {
+ ; CHECK-LABEL: test_isr_no_ecode:
+ ; CHECK: pushl %eax
+ ; CHECK: movl 12(%esp), %eax
+ ; CHECK: popl %eax
+ ; CHECK: iretl
+ ; CHECK0-LABEL: test_isr_no_ecode:
+ ; CHECK0: pushl %eax
+ ; CHECK0: leal 4(%esp), %eax
+ ; CHECK0: movl 8(%eax), %eax
+ ; CHECK0: popl %eax
+ ; CHECK0: iretl
+ %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
+ %flags = load i32, i32* %pflags, align 4
+ call void asm sideeffect "", "r"(i32 %flags)
+ ret void
+}
+
+; Spills eax and ecx, putting original esp at +8. Stack is adjusted up another 4 bytes
+; before return, popping the error code.
+define x86_intrcc void @test_isr_ecode(%struct.interrupt_frame* %frame, i32 %ecode) {
+ ; CHECK-LABEL: test_isr_ecode
+ ; CHECK: pushl %ecx
+ ; CHECK: pushl %eax
+ ; CHECK: movl 8(%esp), %eax
+ ; CHECK: movl 20(%esp), %ecx
+ ; CHECK: popl %eax
+ ; CHECK: popl %ecx
+ ; CHECK: addl $4, %esp
+ ; CHECK: iretl
+ ; CHECK0-LABEL: test_isr_ecode
+ ; CHECK0: pushl %ecx
+ ; CHECK0: pushl %eax
+ ; CHECK0: movl 8(%esp), %eax
+ ; CHECK0: leal 12(%esp), %ecx
+ ; CHECK0: movl 8(%ecx), %ecx
+ ; CHECK0: popl %eax
+ ; CHECK0: popl %ecx
+ ; CHECK0: addl $4, %esp
+ ; CHECK0: iretl
+ %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
+ %flags = load i32, i32* %pflags, align 4
+ call x86_fastcallcc void asm sideeffect "", "r,r"(i32 %flags, i32 %ecode)
+ ret void
+}
+
+; All clobbered registers must be saved
+define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i32 %ecode) {
+ call void asm sideeffect "", "~{eax},~{ebx},~{ebp}"()
+ ; CHECK-LABEL: test_isr_clobbers
+ ; CHECK-SSE-NEXT: pushl %ebp
+ ; CHECK-SSE-NEXT: pushl %ebx
+ ; CHECK-SSE-NEXT: pushl %eax
+ ; CHECK-SSE-NEXT: popl %eax
+ ; CHECK-SSE-NEXT: popl %ebx
+ ; CHECK-SSE-NEXT: popl %ebp
+ ; CHECK-SSE-NEXT: addl $4, %esp
+ ; CHECK-SSE-NEXT: iretl
+ ; CHECK0-LABEL: test_isr_clobbers
+ ; CHECK0-SSE-NEXT: pushl %ebp
+ ; CHECK0-SSE-NEXT: pushl %ebx
+ ; CHECK0-SSE-NEXT: pushl %eax
+ ; CHECK0-SSE-NEXT: popl %eax
+ ; CHECK0-SSE-NEXT: popl %ebx
+ ; CHECK0-SSE-NEXT: popl %ebp
+ ; CHECK0-SSE-NEXT: addl $4, %esp
+ ; CHECK0-SSE-NEXT: iretl
+ ret void
+}
+
diff --git a/test/CodeGen/X86/x86-64-flags-intrinsics.ll b/test/CodeGen/X86/x86-64-flags-intrinsics.ll
new file mode 100644
index 000000000000..4c5032aedbca
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-flags-intrinsics.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "x86_64-pc-win32"
+
+declare i64 @llvm.x86.flags.read.u64()
+declare void @llvm.x86.flags.write.u64(i64)
+
+define i64 @read_flags() {
+entry:
+ %flags = call i64 @llvm.x86.flags.read.u64()
+ ret i64 %flags
+}
+
+; CHECK-LABEL: read_flags:
+; CHECK: pushq %rbp
+; CHECK: .seh_pushreg 5
+; CHECK: movq %rsp, %rbp
+; CHECK: .seh_setframe 5, 0
+; CHECK: .seh_endprologue
+; CHECK-NEXT: pushfq
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: popq %rbp
+
+define void @write_flags(i64 %arg) {
+entry:
+ call void @llvm.x86.flags.write.u64(i64 %arg)
+ ret void
+}
+
+; CHECK-LABEL: write_flags:
+; CHECK: pushq %rbp
+; CHECK: .seh_pushreg 5
+; CHECK: movq %rsp, %rbp
+; CHECK: .seh_setframe 5, 0
+; CHECK: .seh_endprologue
+; CHECK-NEXT: pushq %rcx
+; CHECK-NEXT: popfq
+; CHECK-NEXT: popq %rbp
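+
+; A minimal usage sketch (commented out so it is not compiled or checked;
+; the function name is hypothetical): the two intrinsics pair up to save
+; and later restore RFLAGS, lowering to pushfq/popq and pushq/popfq as
+; checked above.
+;
+; define void @save_restore_flags() {
+; entry:
+;   %saved = call i64 @llvm.x86.flags.read.u64()
+;   call void @llvm.x86.flags.write.u64(i64 %saved)
+;   ret void
+; }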
diff --git a/test/CodeGen/X86/x86-64-intrcc.ll b/test/CodeGen/X86/x86-64-intrcc.ll
index 8f70b391fa10..429209c063ca 100644
--- a/test/CodeGen/X86/x86-64-intrcc.ll
+++ b/test/CodeGen/X86/x86-64-intrcc.ll
@@ -1,86 +1,86 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s -check-prefix=CHECK0
-
-%struct.interrupt_frame = type { i64, i64, i64, i64, i64 }
-
-@llvm.used = appending global [3 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_clobbers to i8*)], section "llvm.metadata"
-
-; Spills rax, putting original esp at +8.
-; No stack adjustment if declared with no error code
-define x86_intrcc void @test_isr_no_ecode(%struct.interrupt_frame* %frame) {
- ; CHECK-LABEL: test_isr_no_ecode:
- ; CHECK: pushq %rax
- ; CHECK: movq 24(%rsp), %rax
- ; CHECK: popq %rax
- ; CHECK: iretq
- ; CHECK0-LABEL: test_isr_no_ecode:
- ; CHECK0: pushq %rax
- ; CHECK0: leaq 8(%rsp), %rax
- ; CHECK0: movq 16(%rax), %rax
- ; CHECK0: popq %rax
- ; CHECK0: iretq
- %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
- %flags = load i64, i64* %pflags, align 4
- call void asm sideeffect "", "r"(i64 %flags)
- ret void
-}
-
-; Spills rax and rcx, putting original rsp at +16. Stack is adjusted up another 8 bytes
-; before return, popping the error code.
-define x86_intrcc void @test_isr_ecode(%struct.interrupt_frame* %frame, i64 %ecode) {
- ; CHECK-LABEL: test_isr_ecode
- ; CHECK: pushq %rax
- ; CHECK: pushq %rcx
- ; CHECK: movq 16(%rsp), %rax
- ; CHECK: movq 40(%rsp), %rcx
- ; CHECK: popq %rcx
- ; CHECK: popq %rax
- ; CHECK: addq $8, %rsp
- ; CHECK: iretq
- ; CHECK0-LABEL: test_isr_ecode
- ; CHECK0: pushq %rax
- ; CHECK0: pushq %rcx
- ; CHECK0: movq 16(%rsp), %rax
- ; CHECK0: leaq 24(%rsp), %rcx
- ; CHECK0: movq 16(%rcx), %rcx
- ; CHECK0: popq %rcx
- ; CHECK0: popq %rax
- ; CHECK0: addq $8, %rsp
- ; CHECK0: iretq
- %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
- %flags = load i64, i64* %pflags, align 4
- call void asm sideeffect "", "r,r"(i64 %flags, i64 %ecode)
- ret void
-}
-
-; All clobbered registers must be saved
-define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i64 %ecode) {
- call void asm sideeffect "", "~{rax},~{rbx},~{rbp},~{r11},~{xmm0}"()
- ; CHECK-LABEL: test_isr_clobbers
- ; CHECK-SSE-NEXT: pushq %rax
- ; CHECK-SSE-NEXT; pushq %r11
- ; CHECK-SSE-NEXT: pushq %rbp
- ; CHECK-SSE-NEXT: pushq %rbx
- ; CHECK-SSE-NEXT: movaps %xmm0
- ; CHECK-SSE-NEXT: movaps %xmm0
- ; CHECK-SSE-NEXT: popq %rbx
- ; CHECK-SSE-NEXT: popq %rbp
- ; CHECK-SSE-NEXT: popq %r11
- ; CHECK-SSE-NEXT: popq %rax
- ; CHECK-SSE-NEXT: addq $8, %rsp
- ; CHECK-SSE-NEXT: iretq
- ; CHECK0-LABEL: test_isr_clobbers
- ; CHECK0-SSE-NEXT: pushq %rax
- ; CHECK0-SSE-NEXT; pushq %r11
- ; CHECK0-SSE-NEXT: pushq %rbp
- ; CHECK0-SSE-NEXT: pushq %rbx
- ; CHECK0-SSE-NEXT: movaps %xmm0
- ; CHECK0-SSE-NEXT: movaps %xmm0
- ; CHECK0-SSE-NEXT: popq %rbx
- ; CHECK0-SSE-NEXT: popq %rbp
- ; CHECK0-SSE-NEXT: popq %r11
- ; CHECK0-SSE-NEXT: popq %rax
- ; CHECK0-SSE-NEXT: addq $8, %rsp
- ; CHECK0-SSE-NEXT: iretq
- ret void
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s -check-prefix=CHECK0
+
+%struct.interrupt_frame = type { i64, i64, i64, i64, i64 }
+
+@llvm.used = appending global [3 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_clobbers to i8*)], section "llvm.metadata"
+
+; Spills rax, putting original rsp at +8.
+; No stack adjustment if declared with no error code
+define x86_intrcc void @test_isr_no_ecode(%struct.interrupt_frame* %frame) {
+ ; CHECK-LABEL: test_isr_no_ecode:
+ ; CHECK: pushq %rax
+ ; CHECK: movq 24(%rsp), %rax
+ ; CHECK: popq %rax
+ ; CHECK: iretq
+ ; CHECK0-LABEL: test_isr_no_ecode:
+ ; CHECK0: pushq %rax
+ ; CHECK0: leaq 8(%rsp), %rax
+ ; CHECK0: movq 16(%rax), %rax
+ ; CHECK0: popq %rax
+ ; CHECK0: iretq
+ %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
+ %flags = load i64, i64* %pflags, align 4
+ call void asm sideeffect "", "r"(i64 %flags)
+ ret void
+}
+
+; Spills rax and rcx, putting original rsp at +16. Stack is adjusted up another 8 bytes
+; before return, popping the error code.
+define x86_intrcc void @test_isr_ecode(%struct.interrupt_frame* %frame, i64 %ecode) {
+ ; CHECK-LABEL: test_isr_ecode
+ ; CHECK: pushq %rax
+ ; CHECK: pushq %rcx
+ ; CHECK: movq 16(%rsp), %rax
+ ; CHECK: movq 40(%rsp), %rcx
+ ; CHECK: popq %rcx
+ ; CHECK: popq %rax
+ ; CHECK: addq $8, %rsp
+ ; CHECK: iretq
+ ; CHECK0-LABEL: test_isr_ecode
+ ; CHECK0: pushq %rax
+ ; CHECK0: pushq %rcx
+ ; CHECK0: movq 16(%rsp), %rax
+ ; CHECK0: leaq 24(%rsp), %rcx
+ ; CHECK0: movq 16(%rcx), %rcx
+ ; CHECK0: popq %rcx
+ ; CHECK0: popq %rax
+ ; CHECK0: addq $8, %rsp
+ ; CHECK0: iretq
+ %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
+ %flags = load i64, i64* %pflags, align 4
+ call void asm sideeffect "", "r,r"(i64 %flags, i64 %ecode)
+ ret void
+}
+
+; All clobbered registers must be saved
+define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i64 %ecode) {
+ call void asm sideeffect "", "~{rax},~{rbx},~{rbp},~{r11},~{xmm0}"()
+ ; CHECK-LABEL: test_isr_clobbers
+ ; CHECK-SSE-NEXT: pushq %rax
+ ; CHECK-SSE-NEXT: pushq %r11
+ ; CHECK-SSE-NEXT: pushq %rbp
+ ; CHECK-SSE-NEXT: pushq %rbx
+ ; CHECK-SSE-NEXT: movaps %xmm0
+ ; CHECK-SSE-NEXT: movaps %xmm0
+ ; CHECK-SSE-NEXT: popq %rbx
+ ; CHECK-SSE-NEXT: popq %rbp
+ ; CHECK-SSE-NEXT: popq %r11
+ ; CHECK-SSE-NEXT: popq %rax
+ ; CHECK-SSE-NEXT: addq $8, %rsp
+ ; CHECK-SSE-NEXT: iretq
+ ; CHECK0-LABEL: test_isr_clobbers
+ ; CHECK0-SSE-NEXT: pushq %rax
+ ; CHECK0-SSE-NEXT: pushq %r11
+ ; CHECK0-SSE-NEXT: pushq %rbp
+ ; CHECK0-SSE-NEXT: pushq %rbx
+ ; CHECK0-SSE-NEXT: movaps %xmm0
+ ; CHECK0-SSE-NEXT: movaps %xmm0
+ ; CHECK0-SSE-NEXT: popq %rbx
+ ; CHECK0-SSE-NEXT: popq %rbp
+ ; CHECK0-SSE-NEXT: popq %r11
+ ; CHECK0-SSE-NEXT: popq %rax
+ ; CHECK0-SSE-NEXT: addq $8, %rsp
+ ; CHECK0-SSE-NEXT: iretq
+ ret void
}
\ No newline at end of file
diff --git a/test/CodeGen/X86/x86-flags-intrinsics.ll b/test/CodeGen/X86/x86-flags-intrinsics.ll
new file mode 100644
index 000000000000..325de7d5f1e7
--- /dev/null
+++ b/test/CodeGen/X86/x86-flags-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "i686-pc-win32"
+
+declare i32 @llvm.x86.flags.read.u32()
+declare void @llvm.x86.flags.write.u32(i32)
+
+define i32 @read_flags() {
+entry:
+ %flags = call i32 @llvm.x86.flags.read.u32()
+ ret i32 %flags
+}
+
+; CHECK-LABEL: _read_flags:
+; CHECK: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushfl
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: popl %ebp
+
+define x86_fastcallcc void @write_flags(i32 inreg %arg) {
+entry:
+ call void @llvm.x86.flags.write.u32(i32 %arg)
+ ret void
+}
+
+; CHECK-LABEL: @write_flags@4:
+; CHECK: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: popfl
+; CHECK-NEXT: popl %ebp
diff --git a/test/CodeGen/X86/x86-win64-shrink-wrapping.ll b/test/CodeGen/X86/x86-win64-shrink-wrapping.ll
index 395de686d2e2..5d9b2ba3267a 100644
--- a/test/CodeGen/X86/x86-win64-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-win64-shrink-wrapping.ll
@@ -11,10 +11,8 @@ target triple = "x86_64--windows-gnu"
; etc.) prior to the return and this is forbidden for Win64.
; CHECK-LABEL: loopInfoSaveOutsideLoop:
; CHECK: push
-; CHECK: push
; CHECK-NOT: popq
; CHECK: popq
-; CHECK: popq
; CHECK-NOT: popq
; CHECK-NEXT: retq
define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) #0 {
@@ -57,7 +55,6 @@ if.end: ; preds = %if.else, %for.end
;
; Prologue code.
; Make sure we save the CSR used in the inline asm: rbx.
-; CHECK: pushq %rbp
; CHECK: pushq %rbx
;
; DISABLE: testl %ecx, %ecx
@@ -79,7 +76,6 @@ if.end: ; preds = %if.else, %for.end
; DISABLE: jmp [[EPILOG_BB:.LBB[0-9_]+]]
;
; ENABLE-NEXT: popq %rbx
-; ENABLE-NEXT: popq %rbp
; ENABLE-NEXT: retq
;
; CHECK: [[ELSE_LABEL]]: # %if.else
diff --git a/test/DebugInfo/COFF/asm.ll b/test/DebugInfo/COFF/asm.ll
index f3e52df54be0..b67100c87fdb 100644
--- a/test/DebugInfo/COFF/asm.ll
+++ b/test/DebugInfo/COFF/asm.ll
@@ -130,15 +130,12 @@
; X64-NEXT: .L{{.*}}:{{$}}
; X64-NEXT: [[START:.*]]:{{$}}
; X64: # BB
-; X64: pushq %rbp
-; X64-NEXT: subq $32, %rsp
-; X64-NEXT: leaq 32(%rsp), %rbp
+; X64: subq $40, %rsp
; X64-NEXT: [[ASM_LINE:.*]]:{{$}}
; X64: [[CALL_LINE:.*]]:{{$}}
; X64: callq g
; X64-NEXT: [[EPILOG_AND_RET:.*]]:
-; X64: addq $32, %rsp
-; X64-NEXT: popq %rbp
+; X64: addq $40, %rsp
; X64-NEXT: ret
; X64-NEXT: [[END_OF_F:.*]]:
;
@@ -225,22 +222,22 @@
; OBJ64: ProcStart {
; OBJ64-NEXT: DisplayName: f
; OBJ64-NEXT: Section: f
-; OBJ64-NEXT: CodeSize: 0x17
+; OBJ64-NEXT: CodeSize: 0xE
; OBJ64-NEXT: }
; OBJ64-NEXT: ProcEnd
; OBJ64-NEXT: ]
; OBJ64: FunctionLineTable [
; OBJ64-NEXT: Name: f
; OBJ64-NEXT: Flags: 0x1
-; OBJ64-NEXT: CodeSize: 0x17
+; OBJ64-NEXT: CodeSize: 0xE
; OBJ64-NEXT: FilenameSegment [
; OBJ64-NEXT: Filename: D:\asm.c
; OBJ64-NEXT: +0x0: 3
; FIXME: An empty __asm stmt creates an extra entry.
; See PR18679 for the details.
-; OBJ64-NEXT: +0xA: 4
-; OBJ64-NEXT: +0xC: 5
-; OBJ64-NEXT: +0x11: 6
+; OBJ64-NEXT: +0x4: 4
+; OBJ64-NEXT: +0x4: 5
+; OBJ64-NEXT: +0x9: 6
; OBJ64-NEXT: ColStart: 0
; OBJ64-NEXT: ColEnd: 0
; OBJ64-NEXT: ColStart: 0
diff --git a/test/DebugInfo/debugmacinfo.test b/test/DebugInfo/debugmacinfo.test
index 3f95169a7a4e..b8dd56a22bdf 100644
--- a/test/DebugInfo/debugmacinfo.test
+++ b/test/DebugInfo/debugmacinfo.test
@@ -1,27 +1,27 @@
-RUN: llvm-dwarfdump -debug-dump=macro %p/Inputs/dwarfdump-macro.o \
-RUN: | FileCheck %s -check-prefix TEST_MACINFO
-RUN: llvm-dwarfdump -debug-dump=line %p/Inputs/dwarfdump-macro.o \
-RUN: | FileCheck %s -check-prefix TEST_LINE
-
-
-; This test verifies that llvm-dwarfdump tools know how to read .debug_macinfo
-; section. It also checks that the file numbers fits with those in the
-; .debug_line section.
-TEST_MACINFO: .debug_macinfo contents:
-TEST_MACINFO: DW_MACINFO_define - lineno: 0 macro: M3 Value3
-TEST_MACINFO: DW_MACINFO_start_file - lineno: 0 filenum: 1
-TEST_MACINFO: DW_MACINFO_start_file - lineno: 0 filenum: 2
-TEST_MACINFO: DW_MACINFO_define - lineno: 1 macro: M4 Value4
-TEST_MACINFO: DW_MACINFO_end_file
-TEST_MACINFO: DW_MACINFO_define - lineno: 1 macro: M1 Value1
-TEST_MACINFO: DW_MACINFO_start_file - lineno: 2 filenum: 3
-TEST_MACINFO: DW_MACINFO_undef - lineno: 4 macro: M1
-TEST_MACINFO: DW_MACINFO_define - lineno: 5 macro: M1 NewValue1
-TEST_MACINFO: DW_MACINFO_end_file
-TEST_MACINFO: DW_MACINFO_define - lineno: 3 macro: M2(x,y) ((x)+(y)* Value2)
-TEST_MACINFO: DW_MACINFO_end_file
-
-TEST_LINE: .debug_line contents:
-TEST_LINE: file_names[ 1] 0 0x00000000 0x00000000 dwarfdump-macro.cc
-TEST_LINE: file_names[ 2] 1 0x00000000 0x00000000 dwarfdump-macro-cmd.h
-TEST_LINE: file_names[ 3] 0 0x00000000 0x00000000 dwarfdump-macro.h
+RUN: llvm-dwarfdump -debug-dump=macro %p/Inputs/dwarfdump-macro.o \
+RUN: | FileCheck %s -check-prefix TEST_MACINFO
+RUN: llvm-dwarfdump -debug-dump=line %p/Inputs/dwarfdump-macro.o \
+RUN: | FileCheck %s -check-prefix TEST_LINE
+
+
+; This test verifies that the llvm-dwarfdump tool knows how to read the
+; .debug_macinfo section. It also checks that the file numbers fit with those
+; in the .debug_line section.
+TEST_MACINFO: .debug_macinfo contents:
+TEST_MACINFO: DW_MACINFO_define - lineno: 0 macro: M3 Value3
+TEST_MACINFO: DW_MACINFO_start_file - lineno: 0 filenum: 1
+TEST_MACINFO: DW_MACINFO_start_file - lineno: 0 filenum: 2
+TEST_MACINFO: DW_MACINFO_define - lineno: 1 macro: M4 Value4
+TEST_MACINFO: DW_MACINFO_end_file
+TEST_MACINFO: DW_MACINFO_define - lineno: 1 macro: M1 Value1
+TEST_MACINFO: DW_MACINFO_start_file - lineno: 2 filenum: 3
+TEST_MACINFO: DW_MACINFO_undef - lineno: 4 macro: M1
+TEST_MACINFO: DW_MACINFO_define - lineno: 5 macro: M1 NewValue1
+TEST_MACINFO: DW_MACINFO_end_file
+TEST_MACINFO: DW_MACINFO_define - lineno: 3 macro: M2(x,y) ((x)+(y)* Value2)
+TEST_MACINFO: DW_MACINFO_end_file
+
+TEST_LINE: .debug_line contents:
+TEST_LINE: file_names[ 1] 0 0x00000000 0x00000000 dwarfdump-macro.cc
+TEST_LINE: file_names[ 2] 1 0x00000000 0x00000000 dwarfdump-macro-cmd.h
+TEST_LINE: file_names[ 3] 0 0x00000000 0x00000000 dwarfdump-macro.h
diff --git a/test/JitListener/multiple.ll b/test/JitListener/multiple.ll
index 1f69ddae4f53..1d44ec475373 100644
--- a/test/JitListener/multiple.ll
+++ b/test/JitListener/multiple.ll
@@ -3,48 +3,48 @@
; This test was created using the following file:
;
-; 1: int foo(int a) {
-; 2: return a;
+; 1: int foo(int a) {
+; 2: return a;
; 3: }
; 4:
-; 5: int bar(int a) {
-; 6: if (a == 0) {
-; 7: return 0;
-; 8: }
-; 9: return 100/a;
-; 10: }
-; 11:
-; 12: int fubar(int a) {
-; 13: switch (a) {
-; 14: case 0:
-; 15: return 10;
-; 16: case 1:
-; 17: return 20;
-; 18: default:
-; 19: return 30;
-; 20: }
+; 5: int bar(int a) {
+; 6: if (a == 0) {
+; 7: return 0;
+; 8: }
+; 9: return 100/a;
+; 10: }
+; 11:
+; 12: int fubar(int a) {
+; 13: switch (a) {
+; 14: case 0:
+; 15: return 10;
+; 16: case 1:
+; 17: return 20;
+; 18: default:
+; 19: return 30;
+; 20: }
; 21: }
;
-; CHECK: Method load [1]: bar, Size = {{[0-9]+}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
-
-; CHECK: Method load [2]: foo, Size = {{[0-9]+}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[1,2]}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[1,2]}}
-
-; CHECK: Method load [3]: fubar, Size = {{[0-9]+}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
-; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
-
-; CHECK: Method unload [1]
-; CHECK: Method unload [2]
+; CHECK: Method load [1]: bar, Size = {{[0-9]+}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[5,6,7,9]}}
+
+; CHECK: Method load [2]: foo, Size = {{[0-9]+}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[1,2]}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[1,2]}}
+
+; CHECK: Method load [3]: fubar, Size = {{[0-9]+}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+; CHECK: Line info @ {{[0-9]+}}: multiple.c, line {{[12,13,15,17,19]}}
+
+; CHECK: Method unload [1]
+; CHECK: Method unload [2]
; CHECK: Method unload [3]
; ModuleID = 'multiple.c'
diff --git a/test/JitListener/simple.ll b/test/JitListener/simple.ll
index bfa11b7e533e..cbaa5160c6c0 100644
--- a/test/JitListener/simple.ll
+++ b/test/JitListener/simple.ll
@@ -3,14 +3,14 @@
; This test was created using the following file:
;
-; 1: int foo(int a) {
-; 2: return a;
+; 1: int foo(int a) {
+; 2: return a;
; 3: }
;
-; CHECK: Method load [1]: foo, Size = {{[0-9]+}}
-; CHECK: Line info @ {{[0-9]+}}: simple.c, line 1
-; CHECK: Line info @ {{[0-9]+}}: simple.c, line 2
+; CHECK: Method load [1]: foo, Size = {{[0-9]+}}
+; CHECK: Line info @ {{[0-9]+}}: simple.c, line 1
+; CHECK: Line info @ {{[0-9]+}}: simple.c, line 2
; CHECK: Method unload [1]
; ModuleID = 'simple.c'
diff --git a/test/MC/ARM/gas-compl-copr-reg.s b/test/MC/ARM/gas-compl-copr-reg.s
index ab0b02395831..362fd11e2183 100644
--- a/test/MC/ARM/gas-compl-copr-reg.s
+++ b/test/MC/ARM/gas-compl-copr-reg.s
@@ -5,10 +5,10 @@
ldc p12, cr4, [r0, #4]
stc p14, cr6, [r2, #-224]
-@ RUN: llvm-mc -triple=armv7-linux-gnueabi -show-encoding < %s | FileCheck %s
-
-@ CHECK: ldc p12, c4, [r0, #4] @ encoding: [0x01,0x4c,0x90,0xed]
-@ CHECK: stc p14, c6, [r2, #-224] @ encoding: [0x38,0x6e,0x02,0xed]
-
- ldc p12, cr4, [r0, #4]
- stc p14, cr6, [r2, #-224]
+@ RUN: llvm-mc -triple=armv7-linux-gnueabi -show-encoding < %s | FileCheck %s
+
+@ CHECK: ldc p12, c4, [r0, #4] @ encoding: [0x01,0x4c,0x90,0xed]
+@ CHECK: stc p14, c6, [r2, #-224] @ encoding: [0x38,0x6e,0x02,0xed]
+
+ ldc p12, cr4, [r0, #4]
+ stc p14, cr6, [r2, #-224]
diff --git a/test/Transforms/EarlyCSE/AArch64/ldstN.ll b/test/Transforms/EarlyCSE/AArch64/ldstN.ll
index cc1af31429e1..b457621a9b5c 100644
--- a/test/Transforms/EarlyCSE/AArch64/ldstN.ll
+++ b/test/Transforms/EarlyCSE/AArch64/ldstN.ll
@@ -1,18 +1,18 @@
-; RUN: opt -S -early-cse < %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>*)
-
-; Although the store and the ld4 are using the same pointer, the
-; data can not be reused because ld4 accesses multiple elements.
-define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @foo() {
-entry:
- store <4 x i16> undef, <4 x i16>* undef, align 8
- %0 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* undef)
- ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %0
-; CHECK-LABEL: @foo(
-; CHECK: store
-; CHECK-NEXT: call
-; CHECK-NEXT: ret
-}
+; RUN: opt -S -early-cse < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>*)
+
+; Although the store and the ld4 are using the same pointer, the
+; data can not be reused because ld4 accesses multiple elements.
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @foo() {
+entry:
+ store <4 x i16> undef, <4 x i16>* undef, align 8
+ %0 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* undef)
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %0
+; CHECK-LABEL: @foo(
+; CHECK: store
+; CHECK-NEXT: call
+; CHECK-NEXT: ret
+}
diff --git a/test/Transforms/InferFunctionAttrs/annotate.ll b/test/Transforms/InferFunctionAttrs/annotate.ll
index 963f484eb55e..1cb7ab137c02 100644
--- a/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -16,9 +16,20 @@ declare i32* @realloc(i32*, i32)
declare i32 @strcpy(...)
; CHECK: declare i32 @strcpy(...)
+; operator new routines
+declare i8* @_Znwj(i64)
+; CHECK: declare noalias nonnull i8* @_Znwj(i64)
+declare i8* @_Znwm(i64)
+; CHECK: declare noalias nonnull i8* @_Znwm(i64)
+
+declare void @memset_pattern16(i8*, i8*, i64)
+; CHECK: declare void @memset_pattern16(i8*, i8*, i64)
+; CHECK-POSIX: declare void @memset_pattern16(i8*, i8* readonly, i64) [[G2:#[0-9]+]]
+
declare i32 @gettimeofday(i8*, i8*)
; CHECK-POSIX: declare i32 @gettimeofday(i8* nocapture, i8* nocapture) [[G0:#[0-9]+]]
; CHECK: attributes [[G0]] = { nounwind }
; CHECK: attributes [[G1]] = { nounwind readonly }
; CHECK-POSIX: attributes [[G0]] = { nounwind }
+; CHECK-POSIX: attributes [[G2]] = { argmemonly }
diff --git a/test/Transforms/InstCombine/double-float-shrink-1.ll b/test/Transforms/InstCombine/double-float-shrink-1.ll
index 63a02bbd8572..319ea3259830 100644
--- a/test/Transforms/InstCombine/double-float-shrink-1.ll
+++ b/test/Transforms/InstCombine/double-float-shrink-1.ll
@@ -9,273 +9,286 @@ target triple = "x86_64-unknown-linux-gnu"
; the -enable-double-float-shrink option.
; PR17850: http://llvm.org/bugs/show_bug.cgi?id=17850
-define float @acos_test(float %f) {
+define float @acos_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @acos(double %conv)
+ %call = call fast double @acos(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: acos_test
-; CHECK: call float @acosf(float %f)
+; CHECK-LABEL: acos_test1
+; CHECK: call fast float @acosf(float %f)
}
define double @acos_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @acos(double %conv)
+ %call = call fast double @acos(double %conv)
ret double %call
; CHECK-LABEL: acos_test2
-; CHECK: call double @acos(double %conv)
+; CHECK: call fast double @acos(double %conv)
}
-define float @acosh_test(float %f) {
+define float @acosh_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @acosh(double %conv)
+ %call = call fast double @acosh(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: acosh_test
-; CHECK: call float @acoshf(float %f)
+; CHECK-LABEL: acosh_test1
+; CHECK: call fast float @acoshf(float %f)
}
define double @acosh_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @acosh(double %conv)
+ %call = call fast double @acosh(double %conv)
ret double %call
; CHECK-LABEL: acosh_test2
-; CHECK: call double @acosh(double %conv)
+; CHECK: call fast double @acosh(double %conv)
}
-define float @asin_test(float %f) {
+define float @asin_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @asin(double %conv)
+ %call = call fast double @asin(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: asin_test
-; CHECK: call float @asinf(float %f)
+; CHECK-LABEL: asin_test1
+; CHECK: call fast float @asinf(float %f)
}
define double @asin_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @asin(double %conv)
+ %call = call fast double @asin(double %conv)
ret double %call
; CHECK-LABEL: asin_test2
-; CHECK: call double @asin(double %conv)
+; CHECK: call fast double @asin(double %conv)
}
-define float @asinh_test(float %f) {
+define float @asinh_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @asinh(double %conv)
+ %call = call fast double @asinh(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: asinh_test
-; CHECK: call float @asinhf(float %f)
+; CHECK-LABEL: asinh_test1
+; CHECK: call fast float @asinhf(float %f)
}
define double @asinh_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @asinh(double %conv)
+ %call = call fast double @asinh(double %conv)
ret double %call
; CHECK-LABEL: asinh_test2
-; CHECK: call double @asinh(double %conv)
+; CHECK: call fast double @asinh(double %conv)
}
-define float @atan_test(float %f) {
+define float @atan_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @atan(double %conv)
+ %call = call fast double @atan(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: atan_test
-; CHECK: call float @atanf(float %f)
+; CHECK-LABEL: atan_test1
+; CHECK: call fast float @atanf(float %f)
}
define double @atan_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @atan(double %conv)
+ %call = call fast double @atan(double %conv)
ret double %call
; CHECK-LABEL: atan_test2
-; CHECK: call double @atan(double %conv)
+; CHECK: call fast double @atan(double %conv)
}
-define float @atanh_test(float %f) {
+
+define float @atanh_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @atanh(double %conv)
+ %call = call fast double @atanh(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: atanh_test
-; CHECK: call float @atanhf(float %f)
+; CHECK-LABEL: atanh_test1
+; CHECK: call fast float @atanhf(float %f)
}
define double @atanh_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @atanh(double %conv)
+ %call = call fast double @atanh(double %conv)
ret double %call
; CHECK-LABEL: atanh_test2
-; CHECK: call double @atanh(double %conv)
+; CHECK: call fast double @atanh(double %conv)
}
-define float @cbrt_test(float %f) {
+
+define float @cbrt_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @cbrt(double %conv)
+ %call = call fast double @cbrt(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: cbrt_test
-; CHECK: call float @cbrtf(float %f)
+; CHECK-LABEL: cbrt_test1
+; CHECK: call fast float @cbrtf(float %f)
}
define double @cbrt_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @cbrt(double %conv)
+ %call = call fast double @cbrt(double %conv)
ret double %call
; CHECK-LABEL: cbrt_test2
-; CHECK: call double @cbrt(double %conv)
+; CHECK: call fast double @cbrt(double %conv)
}
-define float @exp_test(float %f) {
+
+define float @exp_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @exp(double %conv)
+ %call = call fast double @exp(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: exp_test
-; CHECK: call float @expf(float %f)
+; CHECK-LABEL: exp_test1
+; CHECK: call fast float @expf(float %f)
}
define double @exp_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @exp(double %conv)
+ %call = call fast double @exp(double %conv)
ret double %call
; CHECK-LABEL: exp_test2
-; CHECK: call double @exp(double %conv)
+; CHECK: call fast double @exp(double %conv)
}
-define float @expm1_test(float %f) {
+
+define float @expm1_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @expm1(double %conv)
+ %call = call fast double @expm1(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: expm1_test
-; CHECK: call float @expm1f(float %f)
+; CHECK-LABEL: expm1_test1
+; CHECK: call fast float @expm1f(float %f)
}
define double @expm1_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @expm1(double %conv)
+ %call = call fast double @expm1(double %conv)
ret double %call
; CHECK-LABEL: expm1_test2
-; CHECK: call double @expm1(double %conv)
+; CHECK: call fast double @expm1(double %conv)
}
-define float @exp10_test(float %f) {
+
+; exp10f() doesn't exist for this triple, so it doesn't shrink.
+
+define float @exp10_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @exp10(double %conv)
+ %call = call fast double @exp10(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: exp10_test
-; CHECK: call double @exp10(double %conv)
+; CHECK-LABEL: exp10_test1
+; CHECK: call fast double @exp10(double %conv)
}
define double @exp10_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @exp10(double %conv)
+ %call = call fast double @exp10(double %conv)
ret double %call
; CHECK-LABEL: exp10_test2
-; CHECK: call double @exp10(double %conv)
+; CHECK: call fast double @exp10(double %conv)
}
-define float @log_test(float %f) {
+
+define float @log_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @log(double %conv)
+ %call = call fast double @log(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: log_test
-; CHECK: call float @logf(float %f)
+; CHECK-LABEL: log_test1
+; CHECK: call fast float @logf(float %f)
}
define double @log_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @log(double %conv)
+ %call = call fast double @log(double %conv)
ret double %call
; CHECK-LABEL: log_test2
-; CHECK: call double @log(double %conv)
+; CHECK: call fast double @log(double %conv)
}
-define float @log10_test(float %f) {
+
+define float @log10_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @log10(double %conv)
+ %call = call fast double @log10(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: log10_test
-; CHECK: call float @log10f(float %f)
+; CHECK-LABEL: log10_test1
+; CHECK: call fast float @log10f(float %f)
}
define double @log10_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @log10(double %conv)
+ %call = call fast double @log10(double %conv)
ret double %call
; CHECK-LABEL: log10_test2
-; CHECK: call double @log10(double %conv)
+; CHECK: call fast double @log10(double %conv)
}
-define float @log1p_test(float %f) {
+
+define float @log1p_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @log1p(double %conv)
+ %call = call fast double @log1p(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: log1p_test
-; CHECK: call float @log1pf(float %f)
+; CHECK-LABEL: log1p_test1
+; CHECK: call fast float @log1pf(float %f)
}
define double @log1p_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @log1p(double %conv)
+ %call = call fast double @log1p(double %conv)
ret double %call
; CHECK-LABEL: log1p_test2
-; CHECK: call double @log1p(double %conv)
+; CHECK: call fast double @log1p(double %conv)
}
-define float @log2_test(float %f) {
+
+define float @log2_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @log2(double %conv)
+ %call = call fast double @log2(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: log2_test
-; CHECK: call float @log2f(float %f)
+; CHECK-LABEL: log2_test1
+; CHECK: call fast float @log2f(float %f)
}
define double @log2_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @log2(double %conv)
+ %call = call fast double @log2(double %conv)
ret double %call
; CHECK-LABEL: log2_test2
-; CHECK: call double @log2(double %conv)
+; CHECK: call fast double @log2(double %conv)
}
-define float @logb_test(float %f) {
+
+define float @logb_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @logb(double %conv)
+ %call = call fast double @logb(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: logb_test
-; CHECK: call float @logbf(float %f)
+; CHECK-LABEL: logb_test1
+; CHECK: call fast float @logbf(float %f)
}
define double @logb_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @logb(double %conv)
+ %call = call fast double @logb(double %conv)
ret double %call
; CHECK-LABEL: logb_test2
-; CHECK: call double @logb(double %conv)
+; CHECK: call fast double @logb(double %conv)
}
-define float @sin_test(float %f) {
+
+define float @sin_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @sin(double %conv)
+ %call = call fast double @sin(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: sin_test
-; CHECK: call float @sinf(float %f)
+; CHECK-LABEL: sin_test1
+; CHECK: call fast float @sinf(float %f)
}
define double @sin_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @sin(double %conv)
+ %call = call fast double @sin(double %conv)
ret double %call
; CHECK-LABEL: sin_test2
-; CHECK: call double @sin(double %conv)
+; CHECK: call fast double @sin(double %conv)
}
-define float @sqrt_test(float %f) {
+define float @sqrt_test1(float %f) {
%conv = fpext float %f to double
%call = call double @sqrt(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: sqrt_test
+; CHECK-LABEL: sqrt_test1
; CHECK: call float @sqrtf(float %f)
}
@@ -287,12 +300,12 @@ define double @sqrt_test2(float %f) {
; CHECK: call double @sqrt(double %conv)
}
-define float @sqrt_int_test(float %f) {
+define float @sqrt_int_test1(float %f) {
%conv = fpext float %f to double
%call = call double @llvm.sqrt.f64(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: sqrt_int_test
+; CHECK-LABEL: sqrt_int_test1
; CHECK: call float @llvm.sqrt.f32(float %f)
}
@@ -304,39 +317,55 @@ define double @sqrt_int_test2(float %f) {
; CHECK: call double @llvm.sqrt.f64(double %conv)
}
-define float @tan_test(float %f) {
+define float @tan_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @tan(double %conv)
+ %call = call fast double @tan(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: tan_test
-; CHECK: call float @tanf(float %f)
+; CHECK-LABEL: tan_test1
+; CHECK: call fast float @tanf(float %f)
}
define double @tan_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @tan(double %conv)
+ %call = call fast double @tan(double %conv)
ret double %call
; CHECK-LABEL: tan_test2
-; CHECK: call double @tan(double %conv)
+; CHECK: call fast double @tan(double %conv)
}
-define float @tanh_test(float %f) {
+define float @tanh_test1(float %f) {
%conv = fpext float %f to double
- %call = call double @tanh(double %conv)
+ %call = call fast double @tanh(double %conv)
%conv1 = fptrunc double %call to float
ret float %conv1
-; CHECK-LABEL: tanh_test
-; CHECK: call float @tanhf(float %f)
+; CHECK-LABEL: tanh_test1
+; CHECK: call fast float @tanhf(float %f)
}
define double @tanh_test2(float %f) {
%conv = fpext float %f to double
- %call = call double @tanh(double %conv)
+ %call = call fast double @tanh(double %conv)
ret double %call
; CHECK-LABEL: tanh_test2
-; CHECK: call double @tanh(double %conv)
+; CHECK: call fast double @tanh(double %conv)
}
+; 'arcp' on an fmax() is meaningless. This test just proves that
+; flags are propagated for shrunken *binary* double FP calls.
+define float @max1(float %a, float %b) {
+ %c = fpext float %a to double
+ %d = fpext float %b to double
+ %e = call arcp double @fmax(double %c, double %d)
+ %f = fptrunc double %e to float
+ ret float %f
+
+; CHECK-LABEL: max1(
+; CHECK-NEXT: call arcp float @fmaxf(float %a, float %b)
+; CHECK-NEXT: ret
+}
+
+declare double @fmax(double, double)
+
declare double @tanh(double) #1
declare double @tan(double) #1
diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll
index fd563481b3ed..5bdf48b85ce7 100644
--- a/test/Transforms/InstCombine/fast-math.ll
+++ b/test/Transforms/InstCombine/fast-math.ll
@@ -733,13 +733,12 @@ declare fp128 @fminl(fp128, fp128)
; This should always be set when unsafe-fp-math is true, but
; alternate the attributes for additional test coverage.
; 'nsz' is implied by the definition of fmax or fmin itself.
-attributes #1 = { "no-nans-fp-math" = "true" }
; Shrink and remove the call.
-define float @max1(float %a, float %b) #0 {
+define float @max1(float %a, float %b) {
%c = fpext float %a to double
%d = fpext float %b to double
- %e = call double @fmax(double %c, double %d)
+ %e = call fast double @fmax(double %c, double %d)
%f = fptrunc double %e to float
ret float %f
@@ -749,8 +748,8 @@ define float @max1(float %a, float %b) #0 {
; CHECK-NEXT: ret
}
-define float @max2(float %a, float %b) #1 {
- %c = call float @fmaxf(float %a, float %b)
+define float @max2(float %a, float %b) {
+ %c = call nnan float @fmaxf(float %a, float %b)
ret float %c
; CHECK-LABEL: max2(
@@ -760,8 +759,8 @@ define float @max2(float %a, float %b) #1 {
}
-define double @max3(double %a, double %b) #0 {
- %c = call double @fmax(double %a, double %b)
+define double @max3(double %a, double %b) {
+ %c = call fast double @fmax(double %a, double %b)
ret double %c
; CHECK-LABEL: max3(
@@ -770,8 +769,8 @@ define double @max3(double %a, double %b) #0 {
; CHECK-NEXT: ret
}
-define fp128 @max4(fp128 %a, fp128 %b) #1 {
- %c = call fp128 @fmaxl(fp128 %a, fp128 %b)
+define fp128 @max4(fp128 %a, fp128 %b) {
+ %c = call nnan fp128 @fmaxl(fp128 %a, fp128 %b)
ret fp128 %c
; CHECK-LABEL: max4(
@@ -781,10 +780,10 @@ define fp128 @max4(fp128 %a, fp128 %b) #1 {
}
; Shrink and remove the call.
-define float @min1(float %a, float %b) #1 {
+define float @min1(float %a, float %b) {
%c = fpext float %a to double
%d = fpext float %b to double
- %e = call double @fmin(double %c, double %d)
+ %e = call nnan double @fmin(double %c, double %d)
%f = fptrunc double %e to float
ret float %f
@@ -794,8 +793,8 @@ define float @min1(float %a, float %b) #1 {
; CHECK-NEXT: ret
}
-define float @min2(float %a, float %b) #0 {
- %c = call float @fminf(float %a, float %b)
+define float @min2(float %a, float %b) {
+ %c = call fast float @fminf(float %a, float %b)
ret float %c
; CHECK-LABEL: min2(
@@ -804,8 +803,8 @@ define float @min2(float %a, float %b) #0 {
; CHECK-NEXT: ret
}
-define double @min3(double %a, double %b) #1 {
- %c = call double @fmin(double %a, double %b)
+define double @min3(double %a, double %b) {
+ %c = call nnan double @fmin(double %a, double %b)
ret double %c
; CHECK-LABEL: min3(
@@ -814,8 +813,8 @@ define double @min3(double %a, double %b) #1 {
; CHECK-NEXT: ret
}
-define fp128 @min4(fp128 %a, fp128 %b) #0 {
- %c = call fp128 @fminl(fp128 %a, fp128 %b)
+define fp128 @min4(fp128 %a, fp128 %b) {
+ %c = call fast fp128 @fminl(fp128 %a, fp128 %b)
ret fp128 %c
; CHECK-LABEL: min4(
diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index c75c771407e5..4223660db3d6 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -72,3 +72,56 @@ define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
ret <8 x float> %i1
}
+; PR26015: https://llvm.org/bugs/show_bug.cgi?id=26015
+; The widening shuffle must be inserted before any uses.
+
+define <8 x i16> @pr26015(<4 x i16> %t0) {
+; CHECK-LABEL: @pr26015(
+; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: %[[EXT:.*]] = extractelement <4 x i16> %t0, i32 2
+; CHECK-NEXT: %t2 = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 %[[EXT]], i32 3
+; CHECK-NEXT: %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
+; CHECK-NEXT: %t5 = shufflevector <8 x i16> %t3, <8 x i16> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
+; CHECK-NEXT: ret <8 x i16> %t5
+ %t1 = extractelement <4 x i16> %t0, i32 2
+ %t2 = insertelement <8 x i16> zeroinitializer, i16 %t1, i32 3
+ %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
+ %t4 = extractelement <4 x i16> %t0, i32 3
+ %t5 = insertelement <8 x i16> %t3, i16 %t4, i32 7
+ ret <8 x i16> %t5
+}
+
+; PR25999: https://llvm.org/bugs/show_bug.cgi?id=25999
+; TODO: The widening shuffle could be inserted at the start of the function to allow the first extract to use it.
+
+define <8 x i16> @pr25999(<4 x i16> %t0, i1 %b) {
+; CHECK-LABEL: @pr25999(
+; CHECK-NEXT: %t1 = extractelement <4 x i16> %t0, i32 2
+; CHECK-NEXT: br i1 %b, label %if, label %end
+; CHECK: if:
+; CHECK-NEXT: %[[WIDEVEC:.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: %t2 = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 %t1, i32 3
+; CHECK-NEXT: %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
+; CHECK-NEXT: %t5 = shufflevector <8 x i16> %t3, <8 x i16> %[[WIDEVEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
+; CHECK-NEXT: ret <8 x i16> %t5
+; CHECK: end:
+; CHECK-NEXT: %a1 = add i16 %t1, 4
+; CHECK-NEXT: %t6 = insertelement <8 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a1, i32 0
+; CHECK-NEXT: ret <8 x i16> %t6
+
+ %t1 = extractelement <4 x i16> %t0, i32 2
+ br i1 %b, label %if, label %end
+
+if:
+ %t2 = insertelement <8 x i16> zeroinitializer, i16 %t1, i32 3
+ %t3 = insertelement <8 x i16> %t2, i16 0, i32 6
+ %t4 = extractelement <4 x i16> %t0, i32 3
+ %t5 = insertelement <8 x i16> %t3, i16 %t4, i32 7
+ ret <8 x i16> %t5
+
+end:
+ %a1 = add i16 %t1, 4
+ %t6 = insertelement <8 x i16> zeroinitializer, i16 %a1, i32 0
+ ret <8 x i16> %t6
+}
+
diff --git a/test/Transforms/InstCombine/token.ll b/test/Transforms/InstCombine/token.ll
index 0929cf7ebee1..f96b85b4f225 100644
--- a/test/Transforms/InstCombine/token.ll
+++ b/test/Transforms/InstCombine/token.ll
@@ -85,5 +85,22 @@ unreachable:
; CHECK: %Y = zext i8 %B to i32
; CHECK: %phi = phi i32 [ %X, %bb ], [ %Y, %cont ], [ %Y, %cont2 ]
+declare void @foo()
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+
+define void @test4(i8 addrspace(1)* %obj) gc "statepoint-example" {
+bb:
+ unreachable
+
+unreachable:
+ call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: define void @test4(
+; CHECK: unreachable:
+; CHECK: call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+; CHECK: ret void
+
declare void @g(i32)
diff --git a/test/Transforms/InstSimplify/call.ll b/test/Transforms/InstSimplify/call.ll
index 465529aef66a..b360ecb84342 100644
--- a/test/Transforms/InstSimplify/call.ll
+++ b/test/Transforms/InstSimplify/call.ll
@@ -144,7 +144,7 @@ cast.end: ; preds = %cast.notnull, %entr
; CHECK: br i1 false, label %cast.end, label %cast.notnull
}
-declare noalias i8* @_Znwm(i64)
+declare nonnull noalias i8* @_Znwm(i64)
%"struct.std::nothrow_t" = type { i8 }
@_ZSt7nothrow = external global %"struct.std::nothrow_t"
diff --git a/test/Transforms/LICM/funclet.ll b/test/Transforms/LICM/funclet.ll
new file mode 100644
index 000000000000..ef4be2969151
--- /dev/null
+++ b/test/Transforms/LICM/funclet.ll
@@ -0,0 +1,107 @@
+; RUN: opt < %s -licm -S | FileCheck %s
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc18.0.0"
+
+define void @test1(i32* %s, i1 %b) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %0 = call i32 @pure_computation()
+ br i1 %b, label %try.cont, label %while.body
+
+while.body: ; preds = %while.cond
+ invoke void @may_throw()
+ to label %while.cond unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %while.body
+ %.lcssa1 = phi i32 [ %0, %while.body ]
+ %cs = catchswitch within none [label %catch] unwind to caller
+
+catch: ; preds = %catch.dispatch
+ %cp = catchpad within %cs [i8* null, i32 64, i8* null]
+ store i32 %.lcssa1, i32* %s
+ catchret from %cp to label %try.cont
+
+try.cont: ; preds = %catch, %while.cond
+ ret void
+}
+
+; CHECK-LABEL: define void @test1(
+; CHECK: %[[CALL:.*]] = call i32 @pure_computation()
+; CHECK: phi i32 [ %[[CALL]]
+
+define void @test2(i32* %s, i1 %b) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %0 = call i32 @pure_computation()
+ br i1 %b, label %try.cont, label %while.body
+
+while.body: ; preds = %while.cond
+ invoke void @may_throw()
+ to label %while.cond unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %while.body
+ %.lcssa1 = phi i32 [ %0, %while.body ]
+ %cp = cleanuppad within none []
+ store i32 %.lcssa1, i32* %s
+ cleanupret from %cp unwind to caller
+
+try.cont:                                         ; preds = %while.cond
+ ret void
+}
+
+; CHECK-LABEL: define void @test2(
+; CHECK: %[[CP:.*]] = cleanuppad within none []
+; CHECK-NEXT: %[[CALL:.*]] = call i32 @pure_computation() [ "funclet"(token %[[CP]]) ]
+; CHECK-NEXT: store i32 %[[CALL]], i32* %s
+; CHECK-NEXT: cleanupret from %[[CP]] unwind to caller
+
+define void @test3(i1 %a, i1 %b, i1 %c) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %.frame = alloca i8, align 4
+ %.frame2 = alloca i8, align 4
+ %bc = bitcast i8* %.frame to i32*
+ %bc2 = bitcast i8* %.frame2 to i32*
+ br i1 %a, label %try.success.or.caught, label %forbody
+
+catch.object.Throwable: ; preds = %catch.dispatch
+ %cp = catchpad within %cs [i8* null, i32 64, i8* null]
+ unreachable
+
+try.success.or.caught:                            ; preds = %forcond.backedge, %entry
+ ret void
+
+postinvoke: ; preds = %forbody
+ br i1 %b, label %else, label %forcond.backedge
+
+forcond.backedge: ; preds = %else, %postinvoke
+ br i1 %c, label %try.success.or.caught, label %forbody
+
+catch.dispatch: ; preds = %else, %forbody
+ %cs = catchswitch within none [label %catch.object.Throwable] unwind to caller
+
+forbody:                                          ; preds = %forcond.backedge, %entry
+ store i32 1, i32* %bc, align 4
+ store i32 2, i32* %bc2, align 4
+ invoke void @may_throw()
+ to label %postinvoke unwind label %catch.dispatch
+
+else: ; preds = %postinvoke
+ invoke void @may_throw()
+ to label %forcond.backedge unwind label %catch.dispatch
+}
+
+; CHECK-LABEL: define void @test3(
+; CHECK: catchswitch within none
+; CHECK: store i32 1, i32* %bc, align 4
+; CHECK: store i32 2, i32* %bc2, align 4
+
+declare void @may_throw()
+
+declare i32 @pure_computation() nounwind argmemonly readonly
+
+declare i32 @__CxxFrameHandler3(...)
diff --git a/test/Transforms/LICM/sinking.ll b/test/Transforms/LICM/sinking.ll
index 02bf5846a64c..6e9e8d4b7b6f 100644
--- a/test/Transforms/LICM/sinking.ll
+++ b/test/Transforms/LICM/sinking.ll
@@ -1,6 +1,6 @@
; RUN: opt < %s -basicaa -licm -S | FileCheck %s
-declare i32 @strlen(i8*) readonly
+declare i32 @strlen(i8*) readonly nounwind
declare void @foo()
@@ -20,7 +20,7 @@ Out: ; preds = %Loop
; CHECK-NEXT: ret i32 %A
}
-declare double @sin(double) readnone
+declare double @sin(double) readnone nounwind
; Sink readnone function out of loop with unknown memory behavior.
define double @test2(double %X) {
diff --git a/test/Transforms/MemCpyOpt/fca2memcpy.ll b/test/Transforms/MemCpyOpt/fca2memcpy.ll
new file mode 100644
index 000000000000..75a1a8f96e2b
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/fca2memcpy.ll
@@ -0,0 +1,72 @@
+; RUN: opt -memcpyopt -S < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+%S = type { i8*, i32 }
+
+define void @copy(%S* %src, %S* %dst) {
+; CHECK-LABEL: copy
+; CHECK-NOT: load
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64
+; CHECK-NEXT: ret void
+ %1 = load %S, %S* %src
+ store %S %1, %S* %dst
+ ret void
+}
+
+define void @noaliassrc(%S* noalias %src, %S* %dst) {
+; CHECK-LABEL: noaliassrc
+; CHECK-NOT: load
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK-NEXT: ret void
+ %1 = load %S, %S* %src
+ store %S %1, %S* %dst
+ ret void
+}
+
+define void @noaliasdst(%S* %src, %S* noalias %dst) {
+; CHECK-LABEL: noaliasdst
+; CHECK-NOT: load
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK-NEXT: ret void
+ %1 = load %S, %S* %src
+ store %S %1, %S* %dst
+ ret void
+}
+
+define void @destroysrc(%S* %src, %S* %dst) {
+; CHECK-LABEL: destroysrc
+; CHECK-NOT: call
+; CHECK: ret void
+ %1 = load %S, %S* %src
+ store %S zeroinitializer, %S* %src
+ store %S %1, %S* %dst
+ ret void
+}
+
+define void @destroynoaliassrc(%S* noalias %src, %S* %dst) {
+; CHECK-LABEL: destroynoaliassrc
+; CHECK-NOT: load
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK-NEXT: store %S zeroinitializer, %S* %src
+; CHECK-NEXT: ret void
+ %1 = load %S, %S* %src
+ store %S zeroinitializer, %S* %src
+ store %S %1, %S* %dst
+ ret void
+}
+
+define void @copyalias(%S* %src, %S* %dst) {
+; CHECK-LABEL: copyalias
+; CHECK-NEXT: [[LOAD:%[a-z0-9\.]+]] = load %S, %S* %src
+; CHECK-NOT: load
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64
+; CHECK-NEXT: store %S [[LOAD]], %S* %dst
+; CHECK-NEXT: ret void
+ %1 = load %S, %S* %src
+ %2 = load %S, %S* %src
+ store %S %1, %S* %dst
+ store %S %2, %S* %dst
+ ret void
+}
diff --git a/test/Transforms/PlaceSafepoints/leaf-function.ll b/test/Transforms/PlaceSafepoints/leaf-function.ll
new file mode 100644
index 000000000000..2f4193827ae7
--- /dev/null
+++ b/test/Transforms/PlaceSafepoints/leaf-function.ll
@@ -0,0 +1,35 @@
+; RUN: opt %s -S -place-safepoints | FileCheck %s
+
+declare void @foo() "gc-leaf-function"
+declare void @bar()
+
+; Calls of functions with the "gc-leaf-function" attribute shouldn't be turned
+; into a safepoint. An entry safepoint should get inserted, though.
+define void @test_leaf_function() gc "statepoint-example" {
+; CHECK-LABEL: test_leaf_function
+; CHECK: gc.statepoint.p0f_isVoidf
+; CHECK-NOT: statepoint
+; CHECK-NOT: gc.result
+entry:
+ call void @foo()
+ ret void
+}
+
+define void @test_leaf_function_call() gc "statepoint-example" {
+; CHECK-LABEL: test_leaf_function_call
+; CHECK: gc.statepoint.p0f_isVoidf
+; CHECK-NOT: statepoint
+; CHECK-NOT: gc.result
+entry:
+ call void @bar() "gc-leaf-function"
+ ret void
+}
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+ call void @do_safepoint()
+ ret void
+}
diff --git a/test/Transforms/PlaceSafepoints/statepoint-coreclr.ll b/test/Transforms/PlaceSafepoints/statepoint-coreclr.ll
index dc468966877f..0228549025ef 100644
--- a/test/Transforms/PlaceSafepoints/statepoint-coreclr.ll
+++ b/test/Transforms/PlaceSafepoints/statepoint-coreclr.ll
@@ -1,31 +1,31 @@
-; RUN: opt %s -S -place-safepoints | FileCheck %s
-
-; Basic test to make sure that safepoints are placed
-; for CoreCLR GC
-
-declare void @foo()
-
-define void @test_simple_call() gc "coreclr" {
-; CHECK-LABEL: test_simple_call
-entry:
- br label %other
-other:
-; CHECK-LABEL: other
-; CHECK: statepoint
-; CHECK-NOT: gc.result
- call void @foo()
- ret void
-}
-
-; This function is inlined when inserting a poll. To avoid recursive
-; issues, make sure we don't place safepoints in it.
-declare void @do_safepoint()
-define void @gc.safepoint_poll() {
-; CHECK-LABEL: gc.safepoint_poll
-; CHECK-LABEL: entry
-; CHECK-NEXT: do_safepoint
-; CHECK-NEXT: ret void
-entry:
- call void @do_safepoint()
- ret void
-}
+; RUN: opt %s -S -place-safepoints | FileCheck %s
+
+; Basic test to make sure that safepoints are placed
+; for CoreCLR GC
+
+declare void @foo()
+
+define void @test_simple_call() gc "coreclr" {
+; CHECK-LABEL: test_simple_call
+entry:
+ br label %other
+other:
+; CHECK-LABEL: other
+; CHECK: statepoint
+; CHECK-NOT: gc.result
+ call void @foo()
+ ret void
+}
+
+; This function is inlined when inserting a poll. To avoid recursive
+; issues, make sure we don't place safepoints in it.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+; CHECK-LABEL: entry
+; CHECK-NEXT: do_safepoint
+; CHECK-NEXT: ret void
+entry:
+ call void @do_safepoint()
+ ret void
+}
diff --git a/test/Transforms/Reassociate/factorize-again.ll b/test/Transforms/Reassociate/factorize-again.ll
new file mode 100644
index 000000000000..87e77945dfb8
--- /dev/null
+++ b/test/Transforms/Reassociate/factorize-again.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -reassociate < %s | FileCheck %s
+
+; CHECK-LABEL: main
+; CHECK: %2 = fsub
+; CHECK: %3 = fsub
+; CHECK: fadd fast float %3, %2
+define void @main(float, float) {
+wrapper_entry:
+ %2 = fsub float undef, %0
+ %3 = fsub float undef, %1
+ %4 = call float @llvm.rsqrt.f32(float undef)
+ %5 = fmul fast float undef, %4
+ %6 = fmul fast float %2, %4
+ %7 = fmul fast float %3, %4
+ %8 = fmul fast float %5, undef
+ %9 = fmul fast float %6, undef
+ %10 = fmul fast float %7, undef
+ %11 = fadd fast float %8, %9
+ %12 = fadd fast float %11, %10
+ %13 = call float @foo2(float %12, float 0.000000e+00)
+ %mul36 = fmul fast float %13, 1.500000e+00
+ call void @foo1(i32 4, float %mul36)
+ ret void
+}
+
+declare void @foo1(i32, float)
+
+declare float @foo2(float, float) #1
+
+declare float @llvm.rsqrt.f32(float) #1
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readnone }
+
diff --git a/test/Transforms/Reassociate/secondary.ll b/test/Transforms/Reassociate/secondary.ll
index 388cd6bcb6fe..a52000ada537 100644
--- a/test/Transforms/Reassociate/secondary.ll
+++ b/test/Transforms/Reassociate/secondary.ll
@@ -6,7 +6,7 @@
; CHECK: define
; CHECK-NOT: undef
-; CHECK: %factor = mul i32 %tmp3.neg, 2
+; CHECK: %factor = mul i32 %tmp3, -2
; CHECK-NOT: undef
; CHECK: }
diff --git a/test/Transforms/SimplifyCFG/empty-catchpad.ll b/test/Transforms/SimplifyCFG/empty-catchpad.ll
new file mode 100644
index 000000000000..2926cd3f7dc7
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/empty-catchpad.ll
@@ -0,0 +1,115 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+declare void @f()
+declare void @llvm.foo(i32) nounwind
+declare void @ProcessCLRException()
+
+define void @test1() personality void ()* @ProcessCLRException {
+entry:
+ invoke void @f()
+ to label %exit unwind label %exn.dispatch
+exn.dispatch:
+ %cs = catchswitch within none [label %pad1, label %pad2] unwind to caller
+pad1:
+ %cp1 = catchpad within %cs [i32 1]
+ call void @llvm.foo(i32 1)
+ catchret from %cp1 to label %exit
+pad2:
+ %cp2 = catchpad within %cs [i32 2]
+ unreachable
+exit:
+ ret void
+}
+; Remove the unreachable catchpad in %pad2; leave %pad1 as-is
+; CHECK-LABEL: define void @test1()
+; CHECK: %cs = catchswitch within none [label %pad1] unwind to caller
+; CHECK-NOT: catchpad
+; CHECK: %cp1 = catchpad within %cs [i32 1]
+; CHECK-NOT: catchpad
+
+; Remove both catchpads and the catchswitch from exn.dispatch
+; CHECK-LABEL: define void @test2()
+define void @test2() personality void ()* @ProcessCLRException {
+entry:
+ invoke void @f()
+ to label %via.cleanup unwind label %exn.dispatch
+ ; CHECK-NOT: invoke
+ ; CHECK: call void @f()
+via.cleanup:
+ invoke void @f()
+ to label %via.catchswitch unwind label %cleanup.inner
+cleanup.inner:
+ %cp.inner = cleanuppad within none []
+ call void @llvm.foo(i32 0)
+ cleanupret from %cp.inner unwind label %exn.dispatch
+ ; CHECK: cleanupret from %cp.inner unwind to caller
+via.catchswitch:
+ invoke void @f()
+ to label %exit unwind label %dispatch.inner
+dispatch.inner:
+ %cs.inner = catchswitch within none [label %pad.inner] unwind label %exn.dispatch
+ ; CHECK: %cs.inner = catchswitch within none [label %pad.inner] unwind to caller
+pad.inner:
+ %catch.inner = catchpad within %cs.inner [i32 0]
+ ; CHECK: %catch.inner = catchpad within %cs.inner
+ call void @llvm.foo(i32 1)
+ catchret from %catch.inner to label %exit
+exn.dispatch:
+ %cs = catchswitch within none [label %pad1, label %pad2] unwind to caller
+ ; CHECK-NOT: catchswitch within
+ ; CHECK-NOT: catchpad
+pad1:
+ catchpad within %cs [i32 1]
+ unreachable
+pad2:
+ catchpad within %cs [i32 2]
+ unreachable
+exit:
+ ret void
+}
+
+; Same as @test2, but the exn.dispatch catchswitch has an unwind dest
+; that its preds need to be redirected to
+; CHECK-LABEL: define void @test3()
+define void @test3() personality void ()* @ProcessCLRException {
+entry:
+ invoke void @f()
+ to label %via.cleanup unwind label %exn.dispatch
+ ; CHECK: invoke void @f()
+ ; CHECK-NEXT: to label %via.cleanup unwind label %cleanup
+via.cleanup:
+ invoke void @f()
+ to label %via.catchswitch unwind label %cleanup.inner
+cleanup.inner:
+ %cp.inner = cleanuppad within none []
+ call void @llvm.foo(i32 0)
+ cleanupret from %cp.inner unwind label %exn.dispatch
+ ; CHECK: cleanupret from %cp.inner unwind label %cleanup
+via.catchswitch:
+ invoke void @f()
+ to label %exit unwind label %dispatch.inner
+dispatch.inner:
+ %cs.inner = catchswitch within none [label %pad.inner] unwind label %exn.dispatch
+ ; CHECK: %cs.inner = catchswitch within none [label %pad.inner] unwind label %cleanup
+pad.inner:
+ %catch.inner = catchpad within %cs.inner [i32 0]
+ ; CHECK: %catch.inner = catchpad within %cs.inner
+ call void @llvm.foo(i32 1)
+ catchret from %catch.inner to label %exit
+exn.dispatch:
+ %cs = catchswitch within none [label %pad1, label %pad2] unwind label %cleanup
+ ; CHECK-NOT: catchswitch within
+ ; CHECK-NOT: catchpad
+pad1:
+ catchpad within %cs [i32 1]
+ unreachable
+pad2:
+ catchpad within %cs [i32 2]
+ unreachable
+cleanup:
+ %cp = cleanuppad within none []
+ call void @llvm.foo(i32 0)
+ cleanupret from %cp unwind to caller
+exit:
+ ret void
+}
diff --git a/test/Transforms/SimplifyCFG/wineh-unreachable.ll b/test/Transforms/SimplifyCFG/wineh-unreachable.ll
index 670119467dae..c5d6490ba5d5 100644
--- a/test/Transforms/SimplifyCFG/wineh-unreachable.ll
+++ b/test/Transforms/SimplifyCFG/wineh-unreachable.ll
@@ -81,3 +81,87 @@ catch.body:
exit:
unreachable
}
+
+; CHECK-LABEL: define void @test6()
+define void @test6() personality i8* bitcast (void ()* @Personality to i8*) {
+entry:
+ invoke void @f()
+ to label %exit unwind label %catch.pad
+
+catch.pad:
+ %cs1 = catchswitch within none [label %catch.body, label %catch.body] unwind to caller
+ ; CHECK: catchswitch within none [label %catch.body] unwind to caller
+
+catch.body:
+ %catch = catchpad within %cs1 [i8* null, i32 0, i8* null]
+ catchret from %catch to label %exit
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: define void @test7()
+define void @test7() personality i8* bitcast (void ()* @Personality to i8*) {
+entry:
+ invoke void @f()
+ to label %exit unwind label %catch.pad
+
+catch.pad:
+ %cs1 = catchswitch within none [label %catch.body, label %catch.body2] unwind to caller
+ ; CHECK: catchswitch within none [label %catch.body] unwind to caller
+
+catch.body:
+ %catch = catchpad within %cs1 [i8* null, i32 0, i8* null]
+ catchret from %catch to label %exit
+
+catch.body2:
+ %catch2 = catchpad within %cs1 [i8* null, i32 0, i8* null]
+ catchret from %catch2 to label %exit
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: define void @test8()
+define void @test8() personality i8* bitcast (void ()* @Personality to i8*) {
+entry:
+ invoke void @f()
+ to label %exit unwind label %catch.pad
+
+catch.pad:
+ %cs1 = catchswitch within none [label %catch.body, label %catch.body2] unwind to caller
+ ; CHECK: catchswitch within none [label %catch.body] unwind to caller
+
+catch.body2:
+ %catch2 = catchpad within %cs1 [i8* null, i32 0, i8* null]
+ catchret from %catch2 to label %exit
+
+catch.body:
+ %catch = catchpad within %cs1 [i8* null, i32 0, i8* null]
+ catchret from %catch to label %exit
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: define void @test9()
+define void @test9() personality i8* bitcast (void ()* @Personality to i8*) {
+entry:
+ invoke void @f()
+ to label %exit unwind label %catch.pad
+
+catch.pad:
+ %cs1 = catchswitch within none [label %catch.body, label %catch.body2] unwind to caller
+ ; CHECK: catchswitch within none [label %catch.body, label %catch.body2] unwind to caller
+
+catch.body:
+ %catch = catchpad within %cs1 [i8* null, i32 0, i8* null]
+ catchret from %catch to label %exit
+
+catch.body2:
+ %catch2 = catchpad within %cs1 [i8* null, i32 64, i8* null]
+ catchret from %catch2 to label %exit
+
+exit:
+ ret void
+}
diff --git a/test/Verifier/invalid-eh.ll b/test/Verifier/invalid-eh.ll
index 906b24a15c30..21e88d4dcb3d 100644
--- a/test/Verifier/invalid-eh.ll
+++ b/test/Verifier/invalid-eh.ll
@@ -2,6 +2,12 @@
; RUN: sed -e s/.T2:// %s | not llvm-as -disable-output 2>&1 | FileCheck --check-prefix=CHECK2 %s
; RUN: sed -e s/.T3:// %s | not llvm-as -disable-output 2>&1 | FileCheck --check-prefix=CHECK3 %s
; RUN: sed -e s/.T4:// %s | not llvm-as -disable-output 2>&1 | FileCheck --check-prefix=CHECK4 %s
+; RUN: sed -e s/.T5:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK5 %s
+; RUN: sed -e s/.T6:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK6 %s
+; RUN: sed -e s/.T7:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK7 %s
+; RUN: sed -e s/.T8:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK8 %s
+
+declare void @g()
;T1: define void @f() {
;T1: entry:
@@ -36,3 +42,57 @@
;T4: cleanupret from %x unwind to caller
;T4: ; CHECK4: CleanupReturnInst needs to be provided a CleanupPad
;T4: }
+
+;T5: define void @f() personality void ()* @g {
+;T5: entry:
+;T5: ret void
+;T5: switch:
+;T5: %cs = catchswitch within none [label %catch] unwind to caller
+;T5: catch:
+;T5: catchpad within %cs []
+;T5: unreachable
+;T5: bogus:
+;T5: cleanuppad within %cs []
+;T5: ; CHECK5: CleanupPadInst has an invalid parent
+;T5: unreachable
+;T5: }
+
+;T6: define void @f() personality void ()* @g {
+;T6: entry:
+;T6: ret void
+;T6: switch1:
+;T6: %cs1 = catchswitch within none [label %catch1] unwind label %catch2
+;T6: ; CHECK6: Block containg CatchPadInst must be jumped to only by its catchswitch
+;T6: catch1:
+;T6: catchpad within %cs1 []
+;T6: unreachable
+;T6: switch2:
+;T6: %cs2 = catchswitch within none [label %catch2] unwind to caller
+;T6: catch2:
+;T6: catchpad within %cs2 []
+;T6: unreachable
+;T6: }
+
+;T7: define void @f() personality void ()* @g {
+;T7: entry:
+;T7: ret void
+;T7: switch1:
+;T7: %cs1 = catchswitch within none [label %catch1] unwind to caller
+;T7: catch1:
+;T7: catchpad within %cs1 []
+;T7: unreachable
+;T7: switch2:
+;T7: %cs2 = catchswitch within %cs1 [label %catch2] unwind to caller
+;T7: ; CHECK7: CatchSwitchInst has an invalid parent
+;T7: catch2:
+;T7: catchpad within %cs2 []
+;T7: unreachable
+;T7: }
+
+;T8: define void @f() personality void ()* @g {
+;T8: entry:
+;T8: ret void
+;T8: switch1:
+;T8: %cs1 = catchswitch within none [ label %switch1 ] unwind to caller
+;T8: ; CHECK8: CatchSwitchInst handlers must be catchpads
+;T8: }
diff --git a/test/tools/llvm-pdbdump/class-layout.test b/test/tools/llvm-pdbdump/class-layout.test
index c46e62b21961..a92145e59e7d 100644
--- a/test/tools/llvm-pdbdump/class-layout.test
+++ b/test/tools/llvm-pdbdump/class-layout.test
@@ -1,57 +1,57 @@
-; RUN: llvm-pdbdump -all %p/Inputs/ClassLayoutTest.pdb > %t
-; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBALS_TEST
-; RUN: FileCheck -input-file=%t %s -check-prefix=MEMBERS_TEST
-; RUN: FileCheck -input-file=%t %s -check-prefix=BASE_CLASS_A
-; RUN: FileCheck -input-file=%t %s -check-prefix=BASE_CLASS_B
-; RUN: FileCheck -input-file=%t %s -check-prefix=BASE_CLASS_C
-; RUN: FileCheck -input-file=%t %s -check-prefix=BASE_CLASS_D
-; RUN: FileCheck -input-file=%t %s -check-prefix=UDT_KIND_TEST
-; RUN: FileCheck -input-file=%t %s -check-prefix=BITFIELD_TEST
-
-; GLOBALS_TEST: ---GLOBALS---
-; GLOBALS_TEST-DAG: int GlobalsTest::IntVar
-; GLOBALS_TEST-DAG: double GlobalsTest::DoubleVar
-; GLOBALS_TEST-DAG: GlobalsTest::Enum GlobalsTest::EnumVar
-
-; MEMBERS_TEST: ---TYPES---
-; MEMBERS_TEST: class MembersTest::A {
-; MEMBERS_TEST-DAG: typedef int NestedTypedef
-; MEMBERS_TEST-DAG: enum NestedEnum
-; MEMBERS_TEST: public:
-; MEMBERS_TEST-NEXT: void MemberFunc()
-; MEMBERS_TEST-NEXT: private:
-; MEMBERS_TEST-DAG: int IntMemberVar
-; MEMBERS_TEST-DAG: double DoubleMemberVar
-; MEMBERS_TEST: }
-
-; BASE_CLASS_A: ---TYPES---
-; BASE_CLASS_A: class BaseClassTest::A {}
-
-; BASE_CLASS_B: ---TYPES---
-; BASE_CLASS_B: class BaseClassTest::B
-; BASE_CLASS_B-NEXT: : public virtual BaseClassTest::A {
-
-; BASE_CLASS_C: ---TYPES---
-; BASE_CLASS_C: class BaseClassTest::C
-; BASE_CLASS_C-NEXT: : public virtual BaseClassTest::A {
-
-; BASE_CLASS_D: ---TYPES---
-; BASE_CLASS_D: class BaseClassTest::D
-; BASE_CLASS_D-DAG: protected BaseClassTest::B
-; BASE_CLASS_D-DAG: private BaseClassTest::C
-; BASE_CLASS_D-DAG: protected virtual BaseClassTest::A
-
-; UDT_KIND_TEST: ---TYPES---
-; UDT_KIND_TEST-DAG: union UdtKindTest::C {}
-; UDT_KIND_TEST-DAG: class UdtKindTest::B {}
-; UDT_KIND_TEST-DAG: struct UdtKindTest::A {}
-
-; BITFIELD_TEST: ---TYPES---
-; BITFIELD_TEST: struct BitFieldTest::A {
-; BITFIELD_TEST-NEXT: public:
-; BITFIELD_TEST-NEXT: +0x00 int Bits1 : 1
-; BITFIELD_TEST-NEXT: +0x00 int Bits2 : 2
-; BITFIELD_TEST-NEXT: +0x00 int Bits3 : 3
-; BITFIELD_TEST-NEXT: +0x00 int Bits4 : 4
-; BITFIELD_TEST-NEXT: +0x00 int Bits22 : 22
-; BITFIELD_TEST-NEXT: +0x04 int Offset0x04
+; RUN: llvm-pdbdump -all %p/Inputs/ClassLayoutTest.pdb > %t
+; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBALS_TEST
+; RUN: FileCheck -input-file=%t %s -check-prefix=MEMBERS_TEST
+; RUN: FileCheck -input-file=%t %s -check-prefix=BASE_CLASS_A
+; RUN: FileCheck -input-file=%t %s -check-prefix=BASE_CLASS_B
+; RUN: FileCheck -input-file=%t %s -check-prefix=BASE_CLASS_C
+; RUN: FileCheck -input-file=%t %s -check-prefix=BASE_CLASS_D
+; RUN: FileCheck -input-file=%t %s -check-prefix=UDT_KIND_TEST
+; RUN: FileCheck -input-file=%t %s -check-prefix=BITFIELD_TEST
+
+; GLOBALS_TEST: ---GLOBALS---
+; GLOBALS_TEST-DAG: int GlobalsTest::IntVar
+; GLOBALS_TEST-DAG: double GlobalsTest::DoubleVar
+; GLOBALS_TEST-DAG: GlobalsTest::Enum GlobalsTest::EnumVar
+
+; MEMBERS_TEST: ---TYPES---
+; MEMBERS_TEST: class MembersTest::A {
+; MEMBERS_TEST-DAG: typedef int NestedTypedef
+; MEMBERS_TEST-DAG: enum NestedEnum
+; MEMBERS_TEST: public:
+; MEMBERS_TEST-NEXT: void MemberFunc()
+; MEMBERS_TEST-NEXT: private:
+; MEMBERS_TEST-DAG: int IntMemberVar
+; MEMBERS_TEST-DAG: double DoubleMemberVar
+; MEMBERS_TEST: }
+
+; BASE_CLASS_A: ---TYPES---
+; BASE_CLASS_A: class BaseClassTest::A {}
+
+; BASE_CLASS_B: ---TYPES---
+; BASE_CLASS_B: class BaseClassTest::B
+; BASE_CLASS_B-NEXT: : public virtual BaseClassTest::A {
+
+; BASE_CLASS_C: ---TYPES---
+; BASE_CLASS_C: class BaseClassTest::C
+; BASE_CLASS_C-NEXT: : public virtual BaseClassTest::A {
+
+; BASE_CLASS_D: ---TYPES---
+; BASE_CLASS_D: class BaseClassTest::D
+; BASE_CLASS_D-DAG: protected BaseClassTest::B
+; BASE_CLASS_D-DAG: private BaseClassTest::C
+; BASE_CLASS_D-DAG: protected virtual BaseClassTest::A
+
+; UDT_KIND_TEST: ---TYPES---
+; UDT_KIND_TEST-DAG: union UdtKindTest::C {}
+; UDT_KIND_TEST-DAG: class UdtKindTest::B {}
+; UDT_KIND_TEST-DAG: struct UdtKindTest::A {}
+
+; BITFIELD_TEST: ---TYPES---
+; BITFIELD_TEST: struct BitFieldTest::A {
+; BITFIELD_TEST-NEXT: public:
+; BITFIELD_TEST-NEXT: +0x00 int Bits1 : 1
+; BITFIELD_TEST-NEXT: +0x00 int Bits2 : 2
+; BITFIELD_TEST-NEXT: +0x00 int Bits3 : 3
+; BITFIELD_TEST-NEXT: +0x00 int Bits4 : 4
+; BITFIELD_TEST-NEXT: +0x00 int Bits22 : 22
+; BITFIELD_TEST-NEXT: +0x04 int Offset0x04
diff --git a/test/tools/llvm-pdbdump/enum-layout.test b/test/tools/llvm-pdbdump/enum-layout.test
index c6145533899f..f6ebb20b8c56 100644
--- a/test/tools/llvm-pdbdump/enum-layout.test
+++ b/test/tools/llvm-pdbdump/enum-layout.test
@@ -1,20 +1,20 @@
-; RUN: llvm-pdbdump -types %p/Inputs/ClassLayoutTest.pdb > %t
-; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBAL_ENUM
-; RUN: FileCheck -input-file=%t %s -check-prefix=MEMBER_ENUM
-
-; GLOBAL_ENUM: ---TYPES---
-; GLOBAL_ENUM: Enums:
-; GLOBAL_ENUM: enum GlobalsTest::Enum {
-; GLOBAL_ENUM-NEXT: Val1 = 0
-; GLOBAL_ENUM-NEXT: }
-
-; MEMBER_ENUM: ---TYPES---
-; MEMBER_ENUM: Classes:
-; MEMBER_ENUM: struct __vc_attributes::threadingAttribute {
-; MEMBER_ENUM-NEXT: enum threading_e {
-; MEMBER_ENUM-NEXT: apartment = 1
-; MEMBER_ENUM-NEXT: single = 2
-; MEMBER_ENUM-NEXT: free = 3
-; MEMBER_ENUM-NEXT: neutral = 4
-; MEMBER_ENUM-NEXT: both = 5
-; MEMBER_ENUM-NEXT: }
+; RUN: llvm-pdbdump -types %p/Inputs/ClassLayoutTest.pdb > %t
+; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBAL_ENUM
+; RUN: FileCheck -input-file=%t %s -check-prefix=MEMBER_ENUM
+
+; GLOBAL_ENUM: ---TYPES---
+; GLOBAL_ENUM: Enums:
+; GLOBAL_ENUM: enum GlobalsTest::Enum {
+; GLOBAL_ENUM-NEXT: Val1 = 0
+; GLOBAL_ENUM-NEXT: }
+
+; MEMBER_ENUM: ---TYPES---
+; MEMBER_ENUM: Classes:
+; MEMBER_ENUM: struct __vc_attributes::threadingAttribute {
+; MEMBER_ENUM-NEXT: enum threading_e {
+; MEMBER_ENUM-NEXT: apartment = 1
+; MEMBER_ENUM-NEXT: single = 2
+; MEMBER_ENUM-NEXT: free = 3
+; MEMBER_ENUM-NEXT: neutral = 4
+; MEMBER_ENUM-NEXT: both = 5
+; MEMBER_ENUM-NEXT: }
diff --git a/test/tools/llvm-pdbdump/load-address.test b/test/tools/llvm-pdbdump/load-address.test
index 7a5a4dbff673..c559b5c7dcd6 100644
--- a/test/tools/llvm-pdbdump/load-address.test
+++ b/test/tools/llvm-pdbdump/load-address.test
@@ -1,10 +1,10 @@
-; RUN: llvm-pdbdump -externals %p/Inputs/LoadAddressTest.pdb \
-; RUN: | FileCheck --check-prefix=RVA %s
-; RUN: llvm-pdbdump -externals -load-address=0x40000000 \
-; RUN: %p/Inputs/LoadAddressTest.pdb | FileCheck --check-prefix=VA %s
-
-; RVA: ---EXTERNALS---
-; RVA: [0x00001010] _main
-
-; VA: ---EXTERNALS---
-; VA: [0x40001010] _main
+; RUN: llvm-pdbdump -externals %p/Inputs/LoadAddressTest.pdb \
+; RUN: | FileCheck --check-prefix=RVA %s
+; RUN: llvm-pdbdump -externals -load-address=0x40000000 \
+; RUN: %p/Inputs/LoadAddressTest.pdb | FileCheck --check-prefix=VA %s
+
+; RVA: ---EXTERNALS---
+; RVA: [0x00001010] _main
+
+; VA: ---EXTERNALS---
+; VA: [0x40001010] _main
diff --git a/test/tools/llvm-symbolizer/pdb/lit.local.cfg b/test/tools/llvm-symbolizer/pdb/lit.local.cfg
index 8c58f1666400..28a895f51148 100644
--- a/test/tools/llvm-symbolizer/pdb/lit.local.cfg
+++ b/test/tools/llvm-symbolizer/pdb/lit.local.cfg
@@ -1 +1 @@
-config.unsupported = not config.have_dia_sdk
+config.unsupported = not config.have_dia_sdk