65 files changed, 1943 insertions, 642 deletions
diff --git a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index f2d3f3f0ce63..b673399e428f 100644
--- a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -78,10 +78,10 @@ define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x
 ; SKX-LABEL: test_gather_2f64
 ; SKX: Found an estimated cost of 7 {{.*}}.gather
 
-%res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
 
 define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0)  {
 
@@ -94,7 +94,7 @@ define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %
 ; SKX-LABEL: test_gather_4i32
 ; SKX: Found an estimated cost of 6 {{.*}}.gather
 
-%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
   ret <4 x i32> %res
 }
 
@@ -109,10 +109,10 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0)
 ; SKX-LABEL: test_gather_4i32_const_mask
 ; SKX: Found an estimated cost of 6 {{.*}}.gather
 
-%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
   ret <4 x i32> %res
 }
-declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)
 
 define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {
 
@@ -128,7 +128,7 @@ define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind)
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
 
@@ -146,7 +146,7 @@ define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
   ret <16 x float>%res
 }
 
@@ -164,7 +164,7 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i3
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
   ret <16 x float>%res
 }
 
@@ -185,7 +185,7 @@ define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
 
@@ -204,7 +204,7 @@ define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i3
 
   %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
   %imask = bitcast i16 %mask to <16 x i1>
-  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
   ret void
 }
 
@@ -218,11 +218,11 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
 ; SKX-LABEL: test_scatter_8i32
 ; SKX: Found an estimated cost of 10 {{.*}}.scatter
 
-  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
   ret void
 }
 
-declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)
 
 define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; AVX2-LABEL: test_scatter_4i32
@@ -234,7 +234,7 @@ define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; SKX-LABEL: test_scatter_4i32
 ; SKX: Found an estimated cost of 6 {{.*}}.scatter
 
-  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
   ret void
 }
 
@@ -252,7 +252,7 @@ define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask)
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
   %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
 
-  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
   ret <4 x float>%res
 }
 
@@ -270,14 +270,14 @@ define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
   %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
 
-  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
   ret <4 x float>%res
 }
 
-declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> )
-declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
-declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
-declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> )
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
+declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
 
 declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
diff --git a/test/Analysis/CostModel/X86/vector_gep.ll b/test/Analysis/CostModel/X86/vector_gep.ll
index e49f25871d66..17f70dfc7a7c 100644
--- a/test/Analysis/CostModel/X86/vector_gep.ll
+++ b/test/Analysis/CostModel/X86/vector_gep.ll
@@ -3,7 +3,7 @@
 %struct.S = type { [1000 x i32] }
 
 
-declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
 
 define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){
   %temp = insertelement <4 x i64> undef, i64 %base, i32 0
@@ -12,6 +12,6 @@ define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){
   %B = getelementptr inbounds %struct.S, <4 x %struct.S*> %s, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
 ;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds [1000 x i32]
   %arrayidx = getelementptr inbounds [1000 x i32], <4 x [1000 x i32]*> %B, <4 x i64> zeroinitializer, <4 x i64> %vector
-  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
   ret <4 x i32> %res
 }
diff --git a/test/Assembler/auto_upgrade_intrinsics.ll b/test/Assembler/auto_upgrade_intrinsics.ll
index d00fe5882bcd..87ad371deaa5 100644
--- a/test/Assembler/auto_upgrade_intrinsics.ll
+++ b/test/Assembler/auto_upgrade_intrinsics.ll
@@ -85,6 +85,23 @@ define void @tests.masked.store(<2 x double>* %ptr, <2 x i1> %mask, <2 x double>
   ret void
 }
 
+declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
+
+define <2 x double> @tests.masked.gather(<2 x double*> %ptr, <2 x i1> %mask, <2 x double> %passthru)  {
+; CHECK-LABEL: @tests.masked.gather(
+; CHECK: @llvm.masked.gather.v2f64.v2p0f64
+  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptr, i32 1, <2 x i1> %mask, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask)
+
+define void @tests.masked.scatter(<2 x double*> %ptr, <2 x i1> %mask, <2 x double> %val)  {
+; CHECK-LABEL: @tests.masked.scatter(
+; CHECK: @llvm.masked.scatter.v2f64.v2p0f64
+  call void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptr, i32 3, <2 x i1> %mask)
+  ret void
+}
 
 declare {}* @llvm.invariant.start(i64, i8* nocapture) nounwind readonly
 declare void @llvm.invariant.end({}*, i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll b/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
new file mode 100644
index 000000000000..a95e9f828b61
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
@@ -0,0 +1,74 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
+
+; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+; IR: alloca [5 x i32]
+; ASM-LABEL: {{^}}promote_alloca_shaders:
+; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only)
+
+define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %tmp2 = load i32, i32* %arrayidx4, align 4
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %tmp3 = load i32, i32* %arrayidx5
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx6
+  ret void
+}
+
+; OPT-LABEL: @promote_to_vector_call_c(
+; OPT-NOT: alloca
+; OPT: extractelement <2 x i32> %{{[0-9]+}}, i32 %in
+; ASM-NOT: LDSByteSize
+define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %tmp = alloca [2 x i32]
+  %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+  %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+  store i32 0, i32* %tmp1
+  store i32 1, i32* %tmp2
+  %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+  %tmp4 = load i32, i32* %tmp3
+  %tmp5 = load volatile i32, i32 addrspace(1)* undef
+  %tmp6 = add i32 %tmp4, %tmp5
+  store i32 %tmp6, i32 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @no_promote_to_lds_c(
+; OPT: alloca
+; ASM-NOT: LDSByteSize
+define void @no_promote_to_lds(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-shaders.ll b/test/CodeGen/AMDGPU/promote-alloca-shaders.ll
deleted file mode 100644
index d40fca9f4fd5..000000000000
--- a/test/CodeGen/AMDGPU/promote-alloca-shaders.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
-
-; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
-; IR: alloca [5 x i32]
-; ASM-LABEL: {{^}}promote_alloca_shaders:
-; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only)
-
-define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
-entry:
-  %stack = alloca [5 x i32], align 4
-  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
-  store i32 4, i32* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
-  store i32 5, i32* %arrayidx3, align 4
-  %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
-  %tmp2 = load i32, i32* %arrayidx4, align 4
-  store i32 %tmp2, i32 addrspace(1)* %out, align 4
-  %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
-  %tmp3 = load i32, i32* %arrayidx5
-  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %tmp3, i32 addrspace(1)* %arrayidx6
-  ret void
-}
-
-attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index 699ef6e92a4f..bef7bbe01bff 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -199,7 +199,8 @@
 ; RUN: llc < %s -mtriple=armv6-none-linux-gnueabi -mcpu=arm1136j-s -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv6-none-linux-gnueabi -mcpu=arm1136j-s | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; ARMv6k
-; RUN: llc < %s -mtriple=armv6k-none-netbsd-gnueabi -mcpu=arm1176j-s | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv6k-none-netbsd-gnueabi -mcpu=arm1176j-s 2> %t | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: FileCheck %s < %t --allow-empty --check-prefix=CPU-SUPPORTED
 ; RUN: llc < %s -mtriple=armv6k-none-linux-gnueabi -mcpu=arm1176j-s -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv6k-none-linux-gnueabi -mcpu=arm1176j-s | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; ARMv6m
@@ -222,6 +223,8 @@
 ; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m33 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
 
+; CPU-SUPPORTED-NOT: is not a recognized processor for this target
+
 ; XSCALE:      .eabi_attribute 6, 5
 ; XSCALE:      .eabi_attribute 8, 1
 ; XSCALE:      .eabi_attribute 9, 1
diff --git a/test/CodeGen/ARM/load-arm.ll b/test/CodeGen/ARM/load-arm.ll
new file mode 100644
index 000000000000..3807424ece81
--- /dev/null
+++ b/test/CodeGen/ARM/load-arm.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7 %s -o - | FileCheck %s
+
+; We ended up feeding a deleted node back to TableGen when we converted "Off *
+; 410" into "(Off * 205) << 1", where the multiplication already existed in the
+; DAG.
+
+; CHECK-LABEL: addrmode_cse_mutation:
+; CHECK: {{mul|muls}}    [[OFFSET:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: {{ldrb|ldrb.w}} {{r[0-9]+}}, [r0, [[OFFSET]], lsl #3]
+define i32 @addrmode_cse_mutation(i8* %base, i32 %count) {
+  %offset = mul i32 %count, 277288
+  %ptr = getelementptr i8, i8* %base, i32 %offset
+  %val = load volatile i8, i8* %ptr
+  %res = mul i32 %count, 34661
+  ret i32 %res
+}
+
+; CHECK-LABEL: addrmode_cse_multi_use:
+; CHECK-NOT: {{ldrb|ldrb.w}} {{r[0-9]+}}, [{{r[0-9]+}}, {{r[0-9]+}}, lsl #3]
+define i32 @addrmode_cse_multi_use(i8* %base, i32 %count) {
+  %offset = mul i32 %count, 277288
+  %ptr = getelementptr i8, i8* %base, i32 %offset
+  %val = load volatile i8, i8* %ptr
+  %res = mul i32 %count, 34661
+  %res.1 = add i32 %res, %offset
+  ret i32 %res.1
+}
diff --git a/test/CodeGen/AVR/brind.ll b/test/CodeGen/AVR/brind.ll
index f92038d10829..ec8262e84a95 100644
--- a/test/CodeGen/AVR/brind.ll
+++ b/test/CodeGen/AVR/brind.ll
@@ -4,8 +4,6 @@
 
 define i8 @brind(i8 %p) {
 ; CHECK-LABEL: brind:
-; CHECK: ld r30
-; CHECK: ldd r31
 ; CHECK: ijmp
 entry:
   %idxprom = sext i8 %p to i16
diff --git a/test/CodeGen/AVR/dynalloca.ll b/test/CodeGen/AVR/dynalloca.ll
index 13f503015f9f..6aa776e2de6f 100644
--- a/test/CodeGen/AVR/dynalloca.ll
+++ b/test/CodeGen/AVR/dynalloca.ll
@@ -69,9 +69,9 @@ define void @dynalloca2(i16 %x) {
 ; SP restore
 ; CHECK: in r0, 63
 ; CHECK-NEXT: cli
-; CHECK-NEXT: out 62, r29
+; CHECK-NEXT: out 62, r7
 ; CHECK-NEXT: out 63, r0
-; CHECK-NEXT: out 61, r28
+; CHECK-NEXT: out 61, r6
   %vla = alloca i16, i16 %x
   call void @foo2(i16* %vla, i64 0, i64 0, i64 0)
   ret void
diff --git a/test/CodeGen/AVR/inline-asm/inline-asm.ll b/test/CodeGen/AVR/inline-asm/inline-asm.ll
index 88d0c3af2e88..26f90806781e 100644
--- a/test/CodeGen/AVR/inline-asm/inline-asm.ll
+++ b/test/CodeGen/AVR/inline-asm/inline-asm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=avr -mattr=movw -no-integrated-as | FileCheck %s
+; XFAIL: *
 
 ; CHECK-LABEL: no_operands:
 define void @no_operands() {
diff --git a/test/CodeGen/BPF/reloc.ll b/test/CodeGen/BPF/reloc.ll
new file mode 100644
index 000000000000..75dbebf311e3
--- /dev/null
+++ b/test/CodeGen/BPF/reloc.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=bpfel -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+
+%struct.bpf_context = type { i64, i64, i64, i64, i64, i64, i64 }
+%struct.sk_buff = type { i64, i64, i64, i64, i64, i64, i64 }
+%struct.net_device = type { i64, i64, i64, i64, i64, i64, i64 }
+
+@bpf_prog1.devname = private unnamed_addr constant [3 x i8] c"lo\00", align 1
+@bpf_prog1.fmt = private unnamed_addr constant [15 x i8] c"skb %x dev %x\0A\00", align 1
+
+; Function Attrs: norecurse
+define i32 @bpf_prog1(%struct.bpf_context* nocapture %ctx) #0 section "events/net/netif_receive_skb" {
+  %devname = alloca [3 x i8], align 1
+  %fmt = alloca [15 x i8], align 1
+  %1 = getelementptr inbounds [3 x i8], [3 x i8]* %devname, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @bpf_prog1.devname, i64 0, i64 0), i64 3, i32 1, i1 false)
+  %2 = getelementptr inbounds %struct.bpf_context, %struct.bpf_context* %ctx, i64 0, i32 0
+  %3 = load i64, i64* %2, align 8
+  %4 = inttoptr i64 %3 to %struct.sk_buff*
+  %5 = getelementptr inbounds %struct.sk_buff, %struct.sk_buff* %4, i64 0, i32 2
+  %6 = bitcast i64* %5 to i8*
+  %7 = call i8* inttoptr (i64 4 to i8* (i8*)*)(i8* %6) #1
+  %8 = call i32 inttoptr (i64 9 to i32 (i8*, i8*, i32)*)(i8* %7, i8* %1, i32 2) #1
+  %9 = icmp eq i32 %8, 0
+  br i1 %9, label %10, label %13
+
+; <label>:10                                      ; preds = %0
+  %11 = getelementptr inbounds [15 x i8], [15 x i8]* %fmt, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %11, i8* getelementptr inbounds ([15 x i8], [15 x i8]* @bpf_prog1.fmt, i64 0, i64 0), i64 15, i32 1, i1 false)
+  %12 = call i32 (i8*, i32, ...) inttoptr (i64 11 to i32 (i8*, i32, ...)*)(i8* %11, i32 15, %struct.sk_buff* %4, i8* %7) #1
+  br label %13
+
+; <label>:13                                      ; preds = %10, %0
+  ret i32 0
+
+; CHECK-RELOC: file format ELF64-BPF
+; CHECK-RELOC: RELOCATION RECORDS FOR [.rel.eh_frame]:
+; CHECK-RELOC: R_BPF_64_64 events/net/netif_receive_skb
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #1
+
+attributes #0 = { norecurse }
diff --git a/test/CodeGen/Hexagon/adjust-latency-stackST.ll b/test/CodeGen/Hexagon/adjust-latency-stackST.ll
new file mode 100644
index 000000000000..915db91635f1
--- /dev/null
+++ b/test/CodeGen/Hexagon/adjust-latency-stackST.ll
@@ -0,0 +1,81 @@
+; RUN: llc -march=hexagon -disable-post-ra < %s | FileCheck %s
+
+; Make sure that if there's only one store to the stack, it gets packetized
+; with allocframe as there's a latency of 2 cycles between allocframe and
+; the following store if not in the same packet.
+
+; CHECK: {
+; CHECK: memd(r29
+; CHECK-NOT: {
+; CHECK: allocframe
+; CHECK: }
+; CHECK: = memw(gp+#G)
+
+%struct.0 = type { %struct.0*, i32, %struct.2 }
+%struct.1 = type { i32, i32, [31 x i8] }
+%struct.2 = type { %struct.1 }
+
+@G = common global %struct.0* null, align 4
+
+define i32 @test(%struct.0* nocapture %a0) #0 {
+b1:
+  %v2 = alloca %struct.0*, align 4
+  %v3 = bitcast %struct.0** %v2 to i8*
+  %v4 = getelementptr inbounds %struct.0, %struct.0* %a0, i32 0, i32 0
+  %v5 = load %struct.0*, %struct.0** %v4, align 4
+  store %struct.0* %v5, %struct.0** %v2, align 4
+  %v6 = bitcast %struct.0* %v5 to i8*
+  %v7 = load i8*, i8** bitcast (%struct.0** @G to i8**), align 4
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %v6, i8* %v7, i32 48, i32 4, i1 false)
+  %v8 = getelementptr inbounds %struct.0, %struct.0* %a0, i32 0, i32 2, i32 0, i32 1
+  store i32 5, i32* %v8, align 4
+  %v9 = getelementptr inbounds %struct.0, %struct.0* %v5, i32 0, i32 2, i32 0, i32 1
+  store i32 5, i32* %v9, align 4
+  %v10 = bitcast %struct.0* %a0 to i32*
+  %v11 = load i32, i32* %v10, align 4
+  %v12 = bitcast %struct.0* %v5 to i32*
+  store i32 %v11, i32* %v12, align 4
+  %v13 = call i32 bitcast (i32 (...)* @f0 to i32 (%struct.0**)*)(%struct.0** nonnull %v2)
+  %v14 = load %struct.0*, %struct.0** %v2, align 4
+  %v15 = getelementptr inbounds %struct.0, %struct.0* %v14, i32 0, i32 1
+  %v16 = load i32, i32* %v15, align 4
+  %v17 = icmp eq i32 %v16, 0
+  br i1 %v17, label %b18, label %b32
+
+b18:                                              ; preds = %b1
+  %v19 = bitcast %struct.0** %v2 to i32**
+  %v20 = getelementptr inbounds %struct.0, %struct.0* %v14, i32 0, i32 2, i32 0, i32 1
+  store i32 6, i32* %v20, align 4
+  %v21 = getelementptr inbounds %struct.0, %struct.0* %a0, i32 0, i32 2, i32 0, i32 0
+  %v22 = load i32, i32* %v21, align 4
+  %v23 = getelementptr inbounds %struct.0, %struct.0* %v14, i32 0, i32 2, i32 0, i32 0
+  %v24 = call i32 bitcast (i32 (...)* @f1 to i32 (i32, i32*)*)(i32 %v22, i32* %v23)
+  %v25 = load i32*, i32** bitcast (%struct.0** @G to i32**), align 4
+  %v26 = load i32, i32* %v25, align 4
+  %v27 = load i32*, i32** %v19, align 4
+  store i32 %v26, i32* %v27, align 4
+  %v28 = load %struct.0*, %struct.0** %v2, align 4
+  %v29 = getelementptr inbounds %struct.0, %struct.0* %v28, i32 0, i32 2, i32 0, i32 1
+  %v30 = load i32, i32* %v29, align 4
+  %v31 = call i32 bitcast (i32 (...)* @f2 to i32 (i32, i32, i32*)*)(i32 %v30, i32 10, i32* %v29)
+  br label %b36
+
+b32:                                              ; preds = %b1
+  %v33 = bitcast %struct.0* %a0 to i8**
+  %v34 = load i8*, i8** %v33, align 4
+  %v35 = bitcast %struct.0* %a0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %v35, i8* %v34, i32 48, i32 4, i1 false)
+  br label %b36
+
+b36:                                              ; preds = %b32, %b18
+  ret i32 undef
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1) #1
+
+declare i32 @f0(...) #0
+declare i32 @f1(...) #0
+declare i32 @f2(...) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/Hexagon/multi-cycle.ll b/test/CodeGen/Hexagon/multi-cycle.ll
new file mode 100644
index 000000000000..fc021821af38
--- /dev/null
+++ b/test/CodeGen/Hexagon/multi-cycle.ll
@@ -0,0 +1,103 @@
+; RUN: llc -march=hexagon -O2 < %s | FileCheck %s
+
+; CHECK: v{{[0-9]+}}.h{{ *}}={{ *}}vadd(v{{[0-9]+}}.h,v{{[0-9]+}}.h)
+; CHECK: }
+; CHECK: {
+; CHECK: v{{[0-9]+}}{{ *}}={{ *}}valign(v{{[0-9]+}},v{{[0-9]+}},r{{[0-9]+}})
+; CHECK: }
+; CHECK: {
+; CHECK: v{{[0-9]+}}{{ *}}={{ *}}valign(v{{[0-9]+}},v{{[0-9]+}},r{{[0-9]+}})
+
+target triple = "hexagon"
+
+@ZERO = global <16 x i32> zeroinitializer, align 64
+
+define void @fred(i16* nocapture readonly %a0, i32 %a1, i32 %a2, i16* nocapture %a3) #0 {
+b4:
+  %v5 = bitcast i16* %a0 to <16 x i32>*
+  %v6 = getelementptr inbounds i16, i16* %a0, i32 %a1
+  %v7 = bitcast i16* %v6 to <16 x i32>*
+  %v8 = mul nsw i32 %a1, 2
+  %v9 = getelementptr inbounds i16, i16* %a0, i32 %v8
+  %v10 = bitcast i16* %v9 to <16 x i32>*
+  %v11 = load <16 x i32>, <16 x i32>* %v5, align 64, !tbaa !1
+  %v12 = load <16 x i32>, <16 x i32>* %v7, align 64, !tbaa !1
+  %v13 = load <16 x i32>, <16 x i32>* %v10, align 64, !tbaa !1
+  %v14 = load <16 x i32>, <16 x i32>* @ZERO, align 64, !tbaa !1
+  %v15 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %v14, <16 x i32> %v14)
+  %v16 = sdiv i32 %a2, 32
+  %v17 = icmp sgt i32 %a2, 31
+  br i1 %v17, label %b18, label %b66
+
+b18:                                              ; preds = %b4
+  %v19 = add i32 %v8, 32
+  %v20 = add i32 %a1, 32
+  %v21 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v12, <16 x i32> %v12)
+  %v22 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v11, <16 x i32> %v13)
+  %v23 = getelementptr inbounds i16, i16* %a0, i32 %v19
+  %v24 = getelementptr inbounds i16, i16* %a0, i32 %v20
+  %v25 = getelementptr inbounds i16, i16* %a0, i32 32
+  %v26 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %v11, <16 x i32> %v13)
+  %v27 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v22, <16 x i32> %v21)
+  %v28 = bitcast i16* %v23 to <16 x i32>*
+  %v29 = bitcast i16* %v24 to <16 x i32>*
+  %v30 = bitcast i16* %v25 to <16 x i32>*
+  %v31 = bitcast i16* %a3 to <16 x i32>*
+  br label %b32
+
+b32:                                              ; preds = %b32, %b18
+  %v33 = phi i32 [ 0, %b18 ], [ %v63, %b32 ]
+  %v34 = phi <16 x i32>* [ %v31, %b18 ], [ %v62, %b32 ]
+  %v35 = phi <16 x i32>* [ %v28, %b18 ], [ %v46, %b32 ]
+  %v36 = phi <16 x i32>* [ %v29, %b18 ], [ %v44, %b32 ]
+  %v37 = phi <16 x i32>* [ %v30, %b18 ], [ %v42, %b32 ]
+  %v38 = phi <16 x i32> [ %v15, %b18 ], [ %v39, %b32 ]
+  %v39 = phi <16 x i32> [ %v26, %b18 ], [ %v56, %b32 ]
+  %v40 = phi <16 x i32> [ %v27, %b18 ], [ %v51, %b32 ]
+  %v41 = phi <16 x i32> [ %v15, %b18 ], [ %v40, %b32 ]
+  %v42 = getelementptr inbounds <16 x i32>, <16 x i32>* %v37, i32 1
+  %v43 = load <16 x i32>, <16 x i32>* %v37, align 64, !tbaa !1
+  %v44 = getelementptr inbounds <16 x i32>, <16 x i32>* %v36, i32 1
+  %v45 = load <16 x i32>, <16 x i32>* %v36, align 64, !tbaa !1
+  %v46 = getelementptr inbounds <16 x i32>, <16 x i32>* %v35, i32 1
+  %v47 = load <16 x i32>, <16 x i32>* %v35, align 64, !tbaa !1
+  %v48 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v43, <16 x i32> %v47)
+  %v49 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v45, <16 x i32> %v45)
+  %v50 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v40, <16 x i32> %v41, i32 62)
+  %v51 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v48, <16 x i32> %v49)
+  %v52 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v51, <16 x i32> %v40, i32 2)
+  %v53 = tail call <16 x i32> @llvm.hexagon.V6.vabsdiffh(<16 x i32> %v50, <16 x i32> %v52)
+  %v54 = getelementptr inbounds <16 x i32>, <16 x i32>* %v34, i32 1
+  store <16 x i32> %v53, <16 x i32>* %v34, align 64, !tbaa !1
+  %v55 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v39, <16 x i32> %v38, i32 62)
+  %v56 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %v43, <16 x i32> %v47)
+  %v57 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v56, <16 x i32> %v39, i32 2)
+  %v58 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v39, <16 x i32> %v39)
+  %v59 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v58, <16 x i32> %v55)
+  %v60 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %v59, <16 x i32> %v57)
+  %v61 = tail call <16 x i32> @llvm.hexagon.V6.vabsh(<16 x i32> %v60)
+  %v62 = getelementptr inbounds <16 x i32>, <16 x i32>* %v34, i32 2
+  store <16 x i32> %v61, <16 x i32>* %v54, align 64, !tbaa !1
+  %v63 = add nsw i32 %v33, 1
+  %v64 = icmp slt i32 %v63, %v16
+  br i1 %v64, label %b32, label %b65
+
+b65:                                              ; preds = %b32
+  br label %b66
+
+b66:                                              ; preds = %b65, %b4
+  ret void
+}
+
+declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32>, <16 x i32>, i32) #1
+declare <16 x i32> @llvm.hexagon.V6.vabsdiffh(<16 x i32>, <16 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.vabsh(<16 x i32>) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx" }
+attributes #1 = { nounwind readnone }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/plt-rel.ll b/test/CodeGen/Hexagon/plt-rel.ll
new file mode 100644
index 000000000000..1d38cf32b886
--- /dev/null
+++ b/test/CodeGen/Hexagon/plt-rel.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon -relocation-model=pic -mattr=+long-calls < %s | FileCheck --check-prefix=CHECK-LONG %s
+; RUN: llc -march=hexagon -relocation-model=pic < %s | FileCheck %s
+
+; CHECK-LONG: call ##_ZL13g_usr1_called@GDPLT
+; CHECK-LONG-NOT: call _ZL13g_usr1_called@GDPLT
+; CHECK: call _ZL13g_usr1_called@GDPLT
+; CHECK-NOT: call ##_ZL13g_usr1_called@GDPLT
+
+
+target triple = "hexagon"
+
+@_ZL13g_usr1_called = internal thread_local global i32 0, align 4
+
+; Function Attrs: norecurse nounwind
+define void @_Z14SigUsr1Handleri(i32) local_unnamed_addr #0 {
+entry:
+  store volatile i32 1, i32* @_ZL13g_usr1_called, align 4
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define zeroext i1 @_Z27CheckForMonitorCancellationv() local_unnamed_addr #0 {
+entry:
+  %0 = load volatile i32, i32* @_ZL13g_usr1_called, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:                                          ; preds = %entry
+  store volatile i32 0, i32* @_ZL13g_usr1_called, align 4
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %.sink = phi i1 [ true, %if.then ], [ false, %entry ]
+  ret i1 %.sink
+}
+
+attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx" }
diff --git a/test/CodeGen/PowerPC/shift_mask.ll b/test/CodeGen/PowerPC/shift_mask.ll
index 91226a336721..e9ca9b0bdf02 100644
--- a/test/CodeGen/PowerPC/shift_mask.ll
+++ b/test/CodeGen/PowerPC/shift_mask.ll
@@ -49,8 +49,6 @@ define i64 @test003(i64 %a, i64 %b) {
 define <16 x i8> @test010(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test010:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltisb 4, 7
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vslb 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -61,8 +59,6 @@ define <16 x i8> @test010(<16 x i8> %a, <16 x i8> %b) {
 define <8 x i16> @test011(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test011:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltish 4, 15
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vslh 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
@@ -73,10 +69,6 @@ define <8 x i16> @test011(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i32> @test012(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test012:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltisw 4, -16
-; CHECK-NEXT:    vspltisw 5, 15
-; CHECK-NEXT:    vsubuwm 4, 5, 4
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vslw 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
@@ -87,11 +79,6 @@ define <4 x i32> @test012(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @test013(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test013:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addis 3, 2, .LCPI7_0@toc@ha
-; CHECK-NEXT:    addi 3, 3, .LCPI7_0@toc@l
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    xxswapd 36, 0
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsld 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <2 x i64> %b, <i64 63, i64 63>
@@ -148,8 +135,6 @@ define i64 @test103(i64 %a, i64 %b) {
 define <16 x i8> @test110(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test110:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltisb 4, 7
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsrb 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -160,8 +145,6 @@ define <16 x i8> @test110(<16 x i8> %a, <16 x i8> %b) {
 define <8 x i16> @test111(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test111:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltish 4, 15
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsrh 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
@@ -172,10 +155,6 @@ define <8 x i16> @test111(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i32> @test112(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test112:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltisw 4, -16
-; CHECK-NEXT:    vspltisw 5, 15
-; CHECK-NEXT:    vsubuwm 4, 5, 4
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsrw 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
@@ -186,11 +165,6 @@ define <4 x i32> @test112(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @test113(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test113:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addis 3, 2, .LCPI15_0@toc@ha
-; CHECK-NEXT:    addi 3, 3, .LCPI15_0@toc@l
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    xxswapd 36, 0
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsrd 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <2 x i64> %b, <i64 63, i64 63>
@@ -247,8 +221,6 @@ define i64 @test203(i64 %a, i64 %b) {
 define <16 x i8> @test210(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test210:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltisb 4, 7
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsrab 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -259,8 +231,6 @@ define <16 x i8> @test210(<16 x i8> %a, <16 x i8> %b) {
 define <8 x i16> @test211(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test211:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltish 4, 15
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsrah 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
@@ -271,10 +241,6 @@ define <8 x i16> @test211(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i32> @test212(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test212:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vspltisw 4, -16
-; CHECK-NEXT:    vspltisw 5, 15
-; CHECK-NEXT:    vsubuwm 4, 5, 4
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsraw 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
@@ -285,11 +251,6 @@ define <4 x i32> @test212(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @test213(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test213:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    addis 3, 2, .LCPI23_0@toc@ha
-; CHECK-NEXT:    addi 3, 3, .LCPI23_0@toc@l
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    xxswapd 36, 0
-; CHECK-NEXT:    xxland 35, 35, 36
 ; CHECK-NEXT:    vsrad 2, 2, 3
 ; CHECK-NEXT:    blr
   %rem = and <2 x i64> %b, <i64 63, i64 63>
diff --git a/test/CodeGen/X86/addcarry.ll b/test/CodeGen/X86/addcarry.ll
index 6fc07cd84dea..5e95cd832789 100644
--- a/test/CodeGen/X86/addcarry.ll
+++ b/test/CodeGen/X86/addcarry.ll
@@ -190,9 +190,9 @@ entry:
 define i64 @shiftadd(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: shiftadd:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    leaq (%rdx,%rcx), %rax
 ; CHECK-NEXT:    addq %rsi, %rdi
-; CHECK-NEXT:    adcq $0, %rax
+; CHECK-NEXT:    adcq %rcx, %rdx
+; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = zext i64 %a to i128
diff --git a/test/CodeGen/X86/lwp-intrinsics-x86_64.ll b/test/CodeGen/X86/lwp-intrinsics-x86_64.ll
new file mode 100644
index 000000000000..9ee95267fc33
--- /dev/null
+++ b/test/CodeGen/X86/lwp-intrinsics-x86_64.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+lwp | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver1 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver3 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s --check-prefix=X64
+
+define i8 @test_lwpins64_rri(i64 %a0, i32 %a1) nounwind {
+; X64-LABEL: test_lwpins64_rri:
+; X64:       # BB#0:
+; X64-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2309737967)
+  ret i8 %1
+}
+
+define i8 @test_lwpins64_rmi(i64 %a0, i32 *%p1) nounwind {
+; X64-LABEL: test_lwpins64_rmi:
+; X64:       # BB#0:
+; X64-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+  %a1 = load i32, i32 *%p1
+  %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 1985229328)
+  ret i8 %1
+}
+
+define void @test_lwpval64_rri(i64 %a0, i32 %a1) nounwind {
+; X64-LABEL: test_lwpval64_rri:
+; X64:       # BB#0:
+; X64-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; X64-NEXT:    retq
+  tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 4275878552)
+  ret void
+}
+
+define void @test_lwpval64_rmi(i64 %a0, i32 *%p1) nounwind {
+; X64-LABEL: test_lwpval64_rmi:
+; X64:       # BB#0:
+; X64-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; X64-NEXT:    retq
+  %a1 = load i32, i32 *%p1
+  tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 305419896)
+  ret void
+}
+
+declare i8 @llvm.x86.lwpins64(i64, i32, i32) nounwind
+declare void @llvm.x86.lwpval64(i64, i32, i32) nounwind
diff --git a/test/CodeGen/X86/lwp-intrinsics.ll b/test/CodeGen/X86/lwp-intrinsics.ll
new file mode 100644
index 000000000000..c949bc806083
--- /dev/null
+++ b/test/CodeGen/X86/lwp-intrinsics.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+lwp | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver1 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver3 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver4 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+lwp | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver1 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver3 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s --check-prefix=X64
+
+define void @test_llwpcb(i8 *%a0) nounwind {
+; X86-LABEL: test_llwpcb:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    llwpcb %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_llwpcb:
+; X64:       # BB#0:
+; X64-NEXT:    llwpcb %rdi
+; X64-NEXT:    retq
+  tail call void @llvm.x86.llwpcb(i8 *%a0)
+  ret void
+}
+
+define i8* @test_slwpcb(i8 *%a0) nounwind {
+; X86-LABEL: test_slwpcb:
+; X86:       # BB#0:
+; X86-NEXT:    slwpcb %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_slwpcb:
+; X64:       # BB#0:
+; X64-NEXT:    slwpcb %rax
+; X64-NEXT:    retq
+  %1 = tail call i8* @llvm.x86.slwpcb()
+  ret i8 *%1
+}
+
+define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
+; X86-LABEL: test_lwpins32_rri:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_lwpins32_rri:
+; X64:       # BB#0:
+; X64-NEXT:    addl %esi, %esi
+; X64-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+  %1 = add i32 %a1, %a1
+  %2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %1, i32 2309737967)
+  ret i8 %2
+}
+
+define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
+; X86-LABEL: test_lwpins32_rmi:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    lwpins $1985229328, (%eax), %ecx # imm = 0x76543210
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_lwpins32_rmi:
+; X64:       # BB#0:
+; X64-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+  %a1 = load i32, i32 *%p1
+  %1 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 1985229328)
+  ret i8 %1
+}
+
+define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
+; X86-LABEL: test_lwpval32_rri:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_lwpval32_rri:
+; X64:       # BB#0:
+; X64-NEXT:    addl %esi, %esi
+; X64-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; X64-NEXT:    retq
+  %1 = add i32 %a1, %a1
+  tail call void @llvm.x86.lwpval32(i32 %a0, i32 %1, i32 4275878552)
+  ret void
+}
+
+define void @test_lwpval32_rmi(i32 %a0, i32 *%p1) nounwind {
+; X86-LABEL: test_lwpval32_rmi:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    lwpval $305419896, (%eax), %ecx # imm = 0x12345678
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_lwpval32_rmi:
+; X64:       # BB#0:
+; X64-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; X64-NEXT:    retq
+  %a1 = load i32, i32 *%p1
+  tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 305419896)
+  ret void
+}
+
+declare void @llvm.x86.llwpcb(i8*) nounwind
+declare i8* @llvm.x86.slwpcb() nounwind
+declare i8 @llvm.x86.lwpins32(i32, i32, i32) nounwind
+declare void @llvm.x86.lwpval32(i32, i32, i32) nounwind
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 1a15cab97e2e..29a662fb217e 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -54,13 +54,13 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) {
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
 
-declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
-declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
-declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
 
 
 ; SCALAR-LABEL: test2
@@ -111,7 +111,7 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
   %imask = bitcast i16 %mask to <16 x i1>
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
   ret <16 x float> %res
 }
 
@@ -152,7 +152,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
   %imask = bitcast i16 %mask to <16 x i1>
-  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
+  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
   ret <16 x i32> %res
 }
 
@@ -205,8 +205,8 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
 
   %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
   %imask = bitcast i16 %mask to <16 x i1>
-  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
-  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
+  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
+  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
   %res = add <16 x i32> %gt1, %gt2
   ret <16 x i32> %res
 }
@@ -270,13 +270,13 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
 
   %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
   %imask = bitcast i16 %mask to <16 x i1>
-  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
-  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
   ret void
 }
 
-declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
-declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
+declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
 
 
 ; SCALAR-LABEL: test6
@@ -326,9 +326,9 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
 ; SKX_32-NEXT:    vmovdqa %ymm2, %ymm0
 ; SKX_32-NEXT:    retl
 
-  %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
 
-  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x i32>%a
 }
 
@@ -384,8 +384,8 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
 
   %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
   %imask = bitcast i8 %mask to <8 x i1>
-  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
-  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
+  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
+  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
   %res = add <8 x i32> %gt1, %gt2
   ret <8 x i32> %res
 }
@@ -444,8 +444,8 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
 ; SKX_32-NEXT:    retl
 
   %imask = bitcast i16 %mask to <16 x i1>
-  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
-  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
+  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
+  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
   %res = add <16 x i32> %gt1, %gt2
   ret <16 x i32> %res
 }
@@ -522,7 +522,7 @@ entry:
   %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
 
   %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
-  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %res = call <8 x i32 >  @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
   ret <8 x i32> %res
 }
 
@@ -591,7 +591,7 @@ entry:
   %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
 
   %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
-  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %res = call <8 x i32 >  @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
   ret <8 x i32> %res
 }
 
@@ -632,7 +632,7 @@ define <16 x float> @test11(float* %base, i32 %ind) {
 
   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
 
@@ -671,7 +671,7 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) {
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
 
@@ -710,7 +710,7 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
 
@@ -772,13 +772,13 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 
   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
 
-declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
-declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
-declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
 
 ; Gather smaller than existing instruction
 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
@@ -831,7 +831,7 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
 
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
   %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
-  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
   ret <4 x float>%res
 }
 
@@ -890,7 +890,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
 
   %sext_ind = sext <4 x i32> %ind to <4 x i64>
   %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
-  %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
   ret <4 x double>%res
 }
 
@@ -942,15 +942,15 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
 
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
-  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
   ret <2 x double>%res
 }
 
-declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
-declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
-declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
-declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
-declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
 
 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ;
@@ -995,7 +995,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; SKX_32-NEXT:    vptestmd %xmm2, %xmm2, %k1
 ; SKX_32-NEXT:    vpscatterdd %xmm0, (,%xmm1) {%k1}
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
   ret void
 }
 
@@ -1049,7 +1049,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
   %gep = getelementptr double, double* %ptr, <4 x i64> %ind
-  call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
+  call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
   ret void
 }
 
@@ -1103,7 +1103,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
 ; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
 ; SKX_32-NEXT:    vscatterdps %xmm0, (,%xmm1) {%k1}
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
+  call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
   ret void
 }
 
@@ -1157,12 +1157,12 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
   ret void
 }
 
 ; The result type requires widening
-declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
 
 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
 ;
@@ -1222,12 +1222,12 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
-  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
   ret <2 x float>%res
 }
 
-declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
-declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
 
 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
 ;
@@ -1276,7 +1276,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
-  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
+  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
   ret <2 x i32>%res
 }
 
@@ -1320,7 +1320,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
-  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
   ret <2 x i32>%res
 }
 
@@ -1371,7 +1371,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
-  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
+  %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
   ret <2 x i64>%res
 }
 
@@ -1418,7 +1418,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
-  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
+  %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
   ret <2 x i64>%res
 }
 
@@ -1466,7 +1466,7 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
-  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
   ret <2 x float>%res
 }
 
@@ -1515,7 +1515,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
   ret void
 }
 
@@ -1568,23 +1568,23 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
 
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
   ret <16 x float>%res
 }
 
 ; Check non-power-of-2 case. It should be scalarized.
-declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
+declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
 ; ALL-LABEL: test30:
 ; ALL-NOT:       gather
 
   %sext_ind = sext <3 x i32> %ind to <3 x i64>
   %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
-  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
+  %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
   ret <3 x i32>%res
 }
 
-declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
+declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
 
 ; KNL-LABEL: test31
 ; KNL: vpgatherqq
@@ -1626,7 +1626,7 @@ define <16 x float*> @test31(<16 x float**> %ptrs) {
 ; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; SKX_32-NEXT:    retl
 
-  %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
+  %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
   ret <16 x float*>%res
 }
 
@@ -1672,7 +1672,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
 ; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
 ; SKX_32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; SKX_32-NEXT:    retl
-  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
+  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
   ret <16 x i32> %res
 }
 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
@@ -1749,10 +1749,10 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    retl
-  %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
+  %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
   ret <16 x i64> %res
 }
-declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
 ; KNL_64-LABEL: test_gather_16f32:
 ; KNL_64:       # BB#0:
@@ -1795,7 +1795,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
 ; SKX_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
 ; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
 ; SKX_32-NEXT:    retl
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
   ret <16 x float> %res
 }
 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
@@ -1872,10 +1872,10 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    retl
-  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
+  %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
   ret <16 x double> %res
 }
-declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
+declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
 ; KNL_64-LABEL: test_scatter_16i32:
 ; KNL_64:       # BB#0:
@@ -1918,7 +1918,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
   ret void
 }
 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
@@ -1993,10 +1993,10 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
+  call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
   ret void
 }
-declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
 ; KNL_64-LABEL: test_scatter_16f32:
 ; KNL_64:       # BB#0:
@@ -2039,10 +2039,10 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ; SKX_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
+  call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
   ret void
 }
-declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
 ; KNL_64-LABEL: test_scatter_16f64:
 ; KNL_64:       # BB#0:
@@ -2115,10 +2115,10 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
+  call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
   ret void
 }
-declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
 
 define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
 ; KNL_64-LABEL: test_pr28312:
@@ -2193,11 +2193,11 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    retl
-  %g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
-  %g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
-  %g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
+  %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
+  %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
+  %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
   %a = add <4 x i64> %g1, %g2
   %b = add <4 x i64> %a, %g3
   ret <4 x i64> %b
 }
-declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
diff --git a/test/CodeGen/X86/stack-folding-lwp.ll b/test/CodeGen/X86/stack-folding-lwp.ll
new file mode 100644
index 000000000000..edf2798ff846
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-lwp.ll
@@ -0,0 +1,49 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+lwp < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define i8 @stack_fold_lwpins_u32(i32 %a0, i32 %a1) {
+; CHECK-LABEL: stack_fold_lwpins_u32
+; CHECK:       # BB#0:
+; CHECK:       lwpins $2814, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 2814)
+  ret i8 %2
+}
+declare i8 @llvm.x86.lwpins32(i32, i32, i32)
+
+define i8 @stack_fold_lwpins_u64(i64 %a0, i32 %a1) {
+; CHECK-LABEL: stack_fold_lwpins_u64
+; CHECK:       # BB#0:
+; CHECK:       lwpins $2814, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2814)
+  ret i8 %2
+}
+declare i8 @llvm.x86.lwpins64(i64, i32, i32)
+
+define void @stack_fold_lwpval_u32(i32 %a0, i32 %a1) {
+; CHECK-LABEL: stack_fold_lwpval_u32
+; CHECK:       # BB#0:
+; CHECK:       lwpval $2814, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 2814)
+  ret void
+}
+declare void @llvm.x86.lwpval32(i32, i32, i32)
+
+define void @stack_fold_lwpval_u64(i64 %a0, i32 %a1) {
+; CHECK-LABEL: stack_fold_lwpval_u64
+; CHECK:       # BB#0:
+; CHECK:       lwpval $2814, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 2814)
+  ret void
+}
+declare void @llvm.x86.lwpval64(i64, i32, i32)
diff --git a/test/CodeGen/X86/version_directive.ll b/test/CodeGen/X86/version_directive.ll
index 8e4e6dc70e61..ac5eda71dbc6 100644
--- a/test/CodeGen/X86/version_directive.ll
+++ b/test/CodeGen/X86/version_directive.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple x86_64-apple-darwin15.0.0 -o - /dev/null | FileCheck %s
 ; RUN: llc -mtriple x86_64-apple-macosx10.11.0 -o - /dev/null | FileCheck %s
+; RUN: llc -mtriple x86_64-apple-macos10.11.0 -o - /dev/null | FileCheck %s
 
 ; CHECK: .macosx_version_min 10, 11
diff --git a/test/CodeGen/X86/x86-32-intrcc.ll b/test/CodeGen/X86/x86-32-intrcc.ll
index 9794f2cb3e46..ac0e7e11e0e8 100644
--- a/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/test/CodeGen/X86/x86-32-intrcc.ll
@@ -57,23 +57,23 @@ define x86_intrcc void @test_isr_ecode(%struct.interrupt_frame* %frame, i32 %eco
 define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i32 %ecode) {
   call void asm sideeffect "", "~{eax},~{ebx},~{ebp}"()
   ; CHECK-LABEL: test_isr_clobbers
-  ; CHECK-SSE-NEXT: pushl %ebp
-  ; CHECK-SSE-NEXT: pushl %ebx
-  ; CHECK-SSE-NEXT; pushl %eax
-  ; CHECK-SSE-NEXT: popl %eax
-  ; CHECK-SSE-NEXT: popl %ebx
-  ; CHECK-SSE-NEXT: popl %ebp
-  ; CHECK-SSE-NEXT: addl $4, %esp
-  ; CHECK-SSE-NEXT: iretl
+  ; CHECK: pushl %ebp
+  ; CHECK: pushl %ebx
+  ; CHECK: pushl %eax
+  ; CHECK: popl %eax
+  ; CHECK: popl %ebx
+  ; CHECK: popl %ebp
+  ; CHECK: addl $4, %esp
+  ; CHECK: iretl
   ; CHECK0-LABEL: test_isr_clobbers
-  ; CHECK0-SSE-NEXT: pushl %ebp
-  ; CHECK0-SSE-NEXT: pushl %ebx
-  ; CHECK0-SSE-NEXT; pushl %eax
-  ; CHECK0-SSE-NEXT: popl %eax
-  ; CHECK0-SSE-NEXT: popl %ebx
-  ; CHECK0-SSE-NEXT: popl %ebp
-  ; CHECK0-SSE-NEXT: addl $4, %esp
-  ; CHECK0-SSE-NEXT: iretl
+  ; CHECK0: pushl %ebp
+  ; CHECK0: pushl %ebx
+  ; CHECK0: pushl %eax
+  ; CHECK0: popl %eax
+  ; CHECK0: popl %ebx
+  ; CHECK0: popl %ebp
+  ; CHECK0: addl $4, %esp
+  ; CHECK0: iretl
   ret void
 }
 
diff --git a/test/CodeGen/X86/x86-64-intrcc.ll b/test/CodeGen/X86/x86-64-intrcc.ll
index c8bc9e716ce5..75ca1af79b31 100644
--- a/test/CodeGen/X86/x86-64-intrcc.ll
+++ b/test/CodeGen/X86/x86-64-intrcc.ll
@@ -59,32 +59,33 @@ define x86_intrcc void @test_isr_ecode(%struct.interrupt_frame* %frame, i64 %eco
 define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i64 %ecode) {
   call void asm sideeffect "", "~{rax},~{rbx},~{rbp},~{r11},~{xmm0}"()
   ; CHECK-LABEL: test_isr_clobbers
-  ; CHECK-SSE-NEXT: pushq %rax
-  ; CHECK-SSE-NEXT: pushq %rax
-  ; CHECK-SSE-NEXT; pushq %r11
-  ; CHECK-SSE-NEXT: pushq %rbp
-  ; CHECK-SSE-NEXT: pushq %rbx
-  ; CHECK-SSE-NEXT: movaps %xmm0
-  ; CHECK-SSE-NEXT: movaps %xmm0
-  ; CHECK-SSE-NEXT: popq %rbx
-  ; CHECK-SSE-NEXT: popq %rbp
-  ; CHECK-SSE-NEXT: popq %r11
-  ; CHECK-SSE-NEXT: popq %rax
-  ; CHECK-SSE-NEXT: addq $8, %rsp
-  ; CHECK-SSE-NEXT: iretq
+
+  ; CHECK: pushq %rax
+  ; CHECK: pushq %rbp
+  ; CHECK: pushq %r11
+  ; CHECK: pushq %rbx
+  ; CHECK: movaps %xmm0
+  ; CHECK: movaps {{.*}}, %xmm0
+  ; CHECK: popq %rbx
+  ; CHECK: popq %r11
+  ; CHECK: popq %rbp
+  ; CHECK: popq %rax
+  ; CHECK: addq $16, %rsp
+  ; CHECK: iretq
   ; CHECK0-LABEL: test_isr_clobbers
-  ; CHECK0-SSE-NEXT: pushq %rax
-  ; CHECK0-SSE-NEXT; pushq %r11
-  ; CHECK0-SSE-NEXT: pushq %rbp
-  ; CHECK0-SSE-NEXT: pushq %rbx
-  ; CHECK0-SSE-NEXT: movaps %xmm0
-  ; CHECK0-SSE-NEXT: movaps %xmm0
-  ; CHECK0-SSE-NEXT: popq %rbx
-  ; CHECK0-SSE-NEXT: popq %rbp
-  ; CHECK0-SSE-NEXT: popq %r11
-  ; CHECK0-SSE-NEXT: popq %rax
-  ; CHECK0-SSE-NEXT: addq $16, %rsp
-  ; CHECK0-SSE-NEXT: iretq
+
+  ; CHECK0: pushq %rax
+  ; CHECK0: pushq %rbp
+  ; CHECK0: pushq %r11
+  ; CHECK0: pushq %rbx
+  ; CHECK0: movaps %xmm0
+  ; CHECK0: movaps {{.*}}, %xmm0
+  ; CHECK0: popq %rbx
+  ; CHECK0: popq %r11
+  ; CHECK0: popq %rbp
+  ; CHECK0: popq %rax
+  ; CHECK0: addq $16, %rsp
+  ; CHECK0: iretq
   ret void
 }
 
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
new file mode 100644
index 000000000000..7e370c25e31b
--- /dev/null
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py for function "bar"
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; be preserved except for registers used for passing/returning arguments.
+;; In the following function registers %RDI, %RSI and %XMM0 are used to store
+;; arguments %a0, %a1 and %b0 accordingally. The value is returned in %RAX.
+;; The above registers should not be preserved, however other registers
+;; (that are modified by the function) should be preserved (%RDX and %XMM1).
+define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
+; CHECK-LABEL: bar:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushq %rdx
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT:  .Lcfi1:
+; CHECK-NEXT:    .cfi_offset %rdx, -16
+; CHECK-NEXT:  .Lcfi2:
+; CHECK-NEXT:    .cfi_offset %xmm1, -32
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    popq %rdx
+; CHECK-NEXT:    retq
+  call void asm sideeffect "", "~{rax},~{rdx},~{xmm1},~{rdi},~{rsi},~{xmm0}"()
+  ret i32 4
+}
+
+;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
+;; doesn't need to preserve registers except for the arguments passed 
+;; to "bar" (%ESI, %EDI and %XMM0).
+define x86_64_sysvcc float @foo(i32 %a0, i32 %a1, float %b0) {
+; CHECK-LABEL: foo
+; CHECK:       movaps  %xmm0, %xmm1
+; CHECK-NEXT:  movl  %esi, %ecx
+; CHECK-NEXT:  movl  %edi, %edx
+; CHECK-NEXT:  callq bar
+; CHECK-NEXT:  addl  %edx, %eax
+; CHECK-NEXT:  addl  %ecx, %eax
+; CHECK-NEXT:  xorps %xmm0, %xmm0
+; CHECK-NEXT:  cvtsi2ssl %eax, %xmm0
+; CHECK-NEXT:  addss %xmm0, %xmm1
+; CHECK:       retq
+	%call = call i32 @bar(i32 %a0, i32 %a1, float %b0) #0
+	%c0   = add i32 %a0, %call
+	%c1   = add i32 %c0, %a1
+	%c2 = sitofp i32 %c1 to float
+	%c3 = fadd float %c2, %b0
+	ret float %c3
+}
+
+attributes #0 = { "no_caller_saved_registers" }
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers.ll b/test/CodeGen/X86/x86-no_caller_saved_registers.ll
new file mode 100644
index 000000000000..9c62e3ee6ba7
--- /dev/null
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; be preserved except for registers used for passing/returning arguments.
+;; The test checks that function "bar" preserves xmm0 register.
+;; It also checks that caller function "foo" does not store registers for callee 
+;; "bar". For example, there is no store/load/access to xmm registers.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 {
+; CHECK-LABEL: bar
+; CHECK:       mov{{.*}}  %xmm0
+; CHECK:       mov{{.*}} {{.*}}, %xmm0
+; CHECK:       ret
+  call void asm sideeffect "", "~{xmm0}"()
+  ret i32 1
+}
+
+define x86_intrcc void @foo(i8* nocapture readnone %c) {
+; CHECK-LABEL: foo
+; CHECK-NOT: xmm
+entry:
+  tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0
+  ret void
+}
+
+attributes #0 = { "no_caller_saved_registers" }
diff --git a/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64
index 447813419e3e..21c1eacd0714 100644
--- a/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64
+++ b/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64
diff --git a/test/DebugInfo/Inputs/dwarfdump-header.s b/test/DebugInfo/Inputs/dwarfdump-header.s
index ce51e987f38a..c5cf48597765 100644
--- a/test/DebugInfo/Inputs/dwarfdump-header.s
+++ b/test/DebugInfo/Inputs/dwarfdump-header.s
@@ -1,5 +1,6 @@
-# Test object to verify dwarfdump handles v4 and v5 CU/TU headers.
+# Test object to verify dwarfdump handles v4 and v5 CU/TU/line headers.
 # We have a representative set of units: v4 CU, v5 CU, v4 TU, v5 split TU.
+# We have v4 and v5 line-table headers.
 #
 # To generate the test object:
 # llvm-mc -triple x86_64-unknown-linux dwarfdump-header.s -filetype=obj \
@@ -28,6 +29,8 @@ dwo_TU_5:
         .byte 0x0e  # DW_FORM_strp
         .byte 0x03  # DW_AT_name
         .byte 0x0e  # DW_FORM_strp
+        .byte 0x10  # DW_AT_stmt_list
+        .byte 0x17  # DW_FORM_sec_offset
         .byte 0x00  # EOM(1)
         .byte 0x00  # EOM(2)
         .byte 0x02  # Abbrev code
@@ -81,10 +84,11 @@ CU_4_version:
         .short 4               # DWARF version number
         .long .debug_abbrev    # Offset Into Abbrev. Section
         .byte 8                # Address Size (in bytes)
-# The compile-unit DIE, which has just DW_AT_producer and DW_AT_name.
+# The compile-unit DIE, with DW_AT_producer, DW_AT_name, DW_AT_stmt_list.
         .byte 1
         .long str_producer
         .long str_CU_4
+        .long LH_4_start
         .byte 0 # NULL
 CU_4_end:
 
@@ -95,10 +99,11 @@ CU_5_version:
         .byte 1                # DWARF Unit Type
         .byte 8                # Address Size (in bytes)
         .long .debug_abbrev    # Offset Into Abbrev. Section
-# The compile-unit DIE, which has just DW_AT_producer and DW_AT_name.
+# The compile-unit DIE, with DW_AT_producer, DW_AT_name, DW_AT_stmt_list.
         .byte 1
         .long str_producer
         .long str_CU_5
+        .long LH_5_start
         .byte 0 # NULL
 CU_5_end:
 
@@ -147,3 +152,106 @@ TU_split_5_type:
         .byte 0 # NULL
         .byte 0 # NULL
 TU_split_5_end:
+
+        .section .debug_line,"",@progbits
+# DWARF v4 line-table header.
+LH_4_start:
+        .long   LH_4_end-LH_4_version   # Length of Unit
+LH_4_version:
+        .short  4               # DWARF version number
+        .long   LH_4_header_end-LH_4_params     # Length of Prologue
+LH_4_params:
+        .byte   1               # Minimum Instruction Length
+        .byte   1               # Maximum Operations per Instruction
+        .byte   1               # Default is_stmt
+        .byte   -5              # Line Base
+        .byte   14              # Line Range
+        .byte   13              # Opcode Base
+        .byte   0               # Standard Opcode Lengths
+        .byte   1
+        .byte   1
+        .byte   1
+        .byte   1
+        .byte   0
+        .byte   0
+        .byte   0
+        .byte   1
+        .byte   0
+        .byte   0
+        .byte   1
+        # Directory table
+        .asciz  "Directory4a"
+        .asciz  "Directory4b"
+        .byte   0
+        # File table
+        .asciz  "File4a"        # File name 1
+        .byte   1               # Directory index 1
+        .byte   0x41            # Timestamp 1
+        .byte   0x42            # File Size 1
+        .asciz  "File4b"        # File name 2
+        .byte   0               # Directory index 2
+        .byte   0x43            # Timestamp 2
+        .byte   0x44            # File Size 2
+        .byte   0               # End of list
+LH_4_header_end:
+        # Line number program, which is empty.
+LH_4_end:
+
+# DWARF v5 line-table header.
+LH_5_start:
+        .long   LH_5_end-LH_5_version   # Length of Unit
+LH_5_version:
+        .short  5               # DWARF version number
+        .byte   8               # Address Size
+        .byte   0               # Segment Selector Size
+        .long   LH_5_header_end-LH_5_params     # Length of Prologue
+LH_5_params:
+        .byte   1               # Minimum Instruction Length
+        .byte   1               # Maximum Operations per Instruction
+        .byte   1               # Default is_stmt
+        .byte   -5              # Line Base
+        .byte   14              # Line Range
+        .byte   13              # Opcode Base
+        .byte   0               # Standard Opcode Lengths
+        .byte   1
+        .byte   1
+        .byte   1
+        .byte   1
+        .byte   0
+        .byte   0
+        .byte   0
+        .byte   1
+        .byte   0
+        .byte   0
+        .byte   1
+        # Directory table format
+        .byte   1               # One element per directory entry
+        .byte   1               # DW_LNCT_path
+        .byte   0x08            # DW_FORM_string
+        # Directory table entries
+        .byte   2               # Two directories
+        .asciz "Directory5a"
+        .asciz "Directory5b"
+        # File table format
+        .byte   4               # Four elements per file entry
+        .byte   1               # DW_LNCT_path
+        .byte   0x08            # DW_FORM_string
+        .byte   2               # DW_LNCT_directory_index
+        .byte   0x0b            # DW_FORM_data1
+        .byte   3               # DW_LNCT_timestamp
+        .byte   0x0f            # DW_FORM_udata
+        .byte   4               # DW_LNCT_size
+        .byte   0x0f            # DW_FORM_udata
+        # File table entries
+        .byte   2               # Two files
+        .asciz "File5a"
+        .byte   1
+        .byte   0x51
+        .byte   0x52
+        .asciz "File5b"
+        .byte   2
+        .byte   0x53
+        .byte   0x54
+LH_5_header_end:
+        # Line number program, which is empty.
+LH_5_end:
diff --git a/test/DebugInfo/dwarfdump-header.test b/test/DebugInfo/dwarfdump-header.test
index 3947c8b438d2..222e506dac37 100644
--- a/test/DebugInfo/dwarfdump-header.test
+++ b/test/DebugInfo/dwarfdump-header.test
@@ -7,13 +7,13 @@ CHECK-LABEL: .debug_info contents:
 
 The v4 CU header.
 
-CHECK: 0x00000000: Compile Unit: length = 0x00000011 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000015)
+CHECK: 0x00000000: Compile Unit: length = 0x00000015 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000019)
 CHECK: 0x0000000b: DW_TAG_compile_unit
 
 The v5 normal CU header.
 
-CHECK: 0x00000015: Compile Unit: length = 0x00000012 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000002b)
-CHECK: 0x00000021: DW_TAG_compile_unit
+CHECK: 0x00000019: Compile Unit: length = 0x00000016 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000033)
+CHECK: 0x00000025: DW_TAG_compile_unit
 
 CHECK-LABEL: .debug_types contents:
 
@@ -27,3 +27,33 @@ CHECK: .debug_types.dwo contents:
 
 CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
 CHECK: 0x00000018: DW_TAG_type_unit
+
+CHECK-LABEL: .debug_line contents:
+
+The v4 line table header.
+
+CHECK: Line table prologue:
+CHECK: version: 4
+CHECK-NOT: address_size
+CHECK-NOT: seg_select_size
+CHECK: max_ops_per_inst: 1
+CHECK: include_directories[  1] = 'Directory4a'
+CHECK: include_directories[  2] = 'Directory4b'
+CHECK-NOT: include_directories
+CHECK: file_names[  1]    1 0x00000041 0x00000042 File4a{{$}}
+CHECK: file_names[  2]    0 0x00000043 0x00000044 File4b{{$}}
+CHECK-NOT: file_names
+
+The v5 line table header.
+
+CHECK: Line table prologue:
+CHECK: version: 5
+CHECK: address_size: 8
+CHECK: seg_select_size: 0
+CHECK: max_ops_per_inst: 1
+CHECK: include_directories[  1] = 'Directory5a'
+CHECK: include_directories[  2] = 'Directory5b'
+CHECK-NOT: include_directories
+CHECK: file_names[  1]    1 0x00000051 0x00000052 File5a{{$}}
+CHECK: file_names[  2]    2 0x00000053 0x00000054 File5b{{$}}
+CHECK-NOT: file_names
diff --git a/test/Feature/intrinsics.ll b/test/Feature/intrinsics.ll
index 278cb9564e62..bbf30d3cc231 100644
--- a/test/Feature/intrinsics.ll
+++ b/test/Feature/intrinsics.ll
@@ -69,5 +69,5 @@ define void @trap() {
   ret void
 }
 
-; CHECK: attributes #0 = { nounwind readnone }
+; CHECK: attributes #0 = { nounwind readnone speculatable }
 ; CHECK: attributes #1 = { noreturn nounwind }
diff --git a/test/MC/AArch64/arm32-large-relocs.s b/test/MC/AArch64/arm32-large-relocs.s
deleted file mode 100644
index 1ac86c0871a6..000000000000
--- a/test/MC/AArch64/arm32-large-relocs.s
+++ /dev/null
@@ -1,31 +0,0 @@
-// RUN: llvm-mc -target-abi=ilp32 -triple=arm64-linux-gnu -show-encoding -o - \
-// RUN:   %s \
-// RUN:   | FileCheck %s
-// RUN: llvm-mc -target-abi=ilp32 -triple=arm64-linux-gnu -show-encoding \
-// RUN:   -filetype=obj -o - %s \
-// RUN:   | llvm-objdump -r - \
-// RUN:   | FileCheck --check-prefix=CHECK-OBJ %s
-
-        movz x2, #:abs_g0:sym
-        movk w3, #:abs_g0_nc:sym
-        movz x13, #:abs_g0_s:sym
-        movn x17, #:abs_g0_s:sym
-// CHECK:   movz x2, #:abs_g0:sym // encoding: [0bAAA00010,A,0b100AAAAA,0xd2]
-// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0:sym, kind: fixup_aarch64_movw
-// CHECK:   movk w3, #:abs_g0_nc:sym // encoding: [0bAAA00011,A,0b100AAAAA,0x72]
-// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0_nc:sym, kind: fixup_aarch64_movw
-// CHECK:   movz x13, #:abs_g0_s:sym // encoding: [0bAAA01101,A,0b100AAAAA,0xd2]
-// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0_s:sym, kind: fixup_aarch64_movw
-// CHECK:   movn x17, #:abs_g0_s:sym // encoding: [0bAAA10001,A,0b100AAAAA,0x92]
-// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0_s:sym, kind: fixup_aarch64_movw
-
-// CHECK-OBJ: 0 R_AARCH64_P32_MOVW_UABS_G0 sym
-// CHECK-OBJ: 4 R_AARCH64_P32_MOVW_UABS_G0_NC sym
-// CHECK-OBJ: 8 R_AARCH64_P32_MOVW_SABS_G0 sym
-// CHECK-OBJ: c R_AARCH64_P32_MOVW_SABS_G0 sym
-
-        movz x4, #:abs_g1:sym
-// CHECK:   movz x4, #:abs_g1:sym    // encoding: [0bAAA00100,A,0b101AAAAA,0xd2]
-// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g1:sym, kind: fixup_aarch64_movw
-
-// CHECK-OBJ: 10 R_AARCH64_P32_MOVW_UABS_G1 sym
diff --git a/test/MC/AArch64/arm32-tls-relocs.s b/test/MC/AArch64/arm32-tls-relocs.s
deleted file mode 100644
index 390da05529fa..000000000000
--- a/test/MC/AArch64/arm32-tls-relocs.s
+++ /dev/null
@@ -1,290 +0,0 @@
-// RUN: llvm-mc -target-abi=ilp32 -triple=arm64-none-linux-gnu \
-// RUN:   -show-encoding < %s | FileCheck --check-prefix=CHECK-ILP32 %s
-// RUN: llvm-mc -target-abi=ilp32 -triple=arm64-none-linux-gnu \
-// RUN:   -filetype=obj < %s -o - | \
-// RUN:   llvm-readobj -r -t | FileCheck --check-prefix=CHECK-ELF-ILP32 %s
-
-////////////////////////////////////////////////////////////////////////////////
-// TLS initial-exec forms
-////////////////////////////////////////////////////////////////////////////////
-
-        adrp x11, :gottprel:var
-        ldr w10, [x0, #:gottprel_lo12:var]
-        ldr w9, :gottprel:var
-// CHECK-ILP32: adrp x11, :gottprel:var      // encoding: [0x0b'A',A,A,0x90'A']
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :gottprel:var, kind: fixup_aarch64_pcrel_adrp_imm21
-// CHECK-ILP32: ldr  w10, [x0, :gottprel_lo12:var] // encoding: [0x0a,0bAAAAAA00,0b01AAAAAA,0xb9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :gottprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale4
-// CHECK-ILP32: ldr     w9, :gottprel:var       // encoding: [0bAAA01001,A,A,0x18]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :gottprel:var, kind: fixup_aarch64_ldr_pcrel_imm19
-
-// CHECK-ELF-ILP32:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSIE_ADR_GOTTPREL_PAGE21 [[VARSYM:[^ ]+]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSIE_LD_GOTTPREL_PREL19 [[VARSYM]]
-
-
-////////////////////////////////////////////////////////////////////////////////
-// TLS local-exec forms
-////////////////////////////////////////////////////////////////////////////////
-
-        movz x5, #:tprel_g1:var
-        movn x6, #:tprel_g1:var
-        movz w7, #:tprel_g1:var
-// CHECK-ILP32: movz    x5, #:tprel_g1:var      // encoding: [0bAAA00101,A,0b101AAAAA,0x92]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_g1:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movn    x6, #:tprel_g1:var      // encoding: [0bAAA00110,A,0b101AAAAA,0x92]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_g1:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movz    w7, #:tprel_g1:var      // encoding: [0bAAA00111,A,0b101AAAAA,0x12]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_g1:var, kind: fixup_aarch64_movw
-
-// CHECK-ELF-ILP32: {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_MOVW_TPREL_G1 [[VARSYM]]
-// CHECK-ELF-ILP32: {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_MOVW_TPREL_G1 [[VARSYM]]
-// CHECK-ELF-ILP32: {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_MOVW_TPREL_G1 [[VARSYM]]
-
-
-        movz x11, #:tprel_g0:var
-        movn x12, #:tprel_g0:var
-        movz w13, #:tprel_g0:var
-// CHECK-ILP32: movz    x11, #:tprel_g0:var     // encoding: [0bAAA01011,A,0b100AAAAA,0x92]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_g0:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movn    x12, #:tprel_g0:var     // encoding: [0bAAA01100,A,0b100AAAAA,0x92]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_g0:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movz    w13, #:tprel_g0:var     // encoding: [0bAAA01101,A,0b100AAAAA,0x12]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_g0:var, kind: fixup_aarch64_movw
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_MOVW_TPREL_G0 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_MOVW_TPREL_G0 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_MOVW_TPREL_G0 [[VARSYM]]
-
-
-        movk w15, #:tprel_g0_nc:var
-        movk w16, #:tprel_g0_nc:var
-// CHECK-ILP32: movk    w15, #:tprel_g0_nc:var  // encoding: [0bAAA01111,A,0b100AAAAA,0x72]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_g0_nc:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movk    w16, #:tprel_g0_nc:var  // encoding: [0bAAA10000,A,0b100AAAAA,0x72]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_g0_nc:var, kind: fixup_aarch64_movw
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_MOVW_TPREL_G0_NC [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_MOVW_TPREL_G0_NC [[VARSYM]]
-
-
-        add x21, x22, #:tprel_lo12:var
-// CHECK-ILP32: add     x21, x22, :tprel_lo12:var // encoding: [0xd5,0bAAAAAA10,0b00AAAAAA,0x91]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_aarch64_add_imm12
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_ADD_TPREL_LO12 [[VARSYM]]
-
-
-        add x25, x26, #:tprel_lo12_nc:var
-// CHECK-ILP32: add     x25, x26, :tprel_lo12_nc:var // encoding: [0x59,0bAAAAAA11,0b00AAAAAA,0x91]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_aarch64_add_imm12
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_ADD_TPREL_LO12_NC [[VARSYM]]
-
-
-        ldrb w29, [x30, #:tprel_lo12:var]
-        ldrsb x29, [x28, #:tprel_lo12_nc:var]
-// CHECK-ILP32: ldrb    w29, [x30, :tprel_lo12:var] // encoding: [0xdd,0bAAAAAA11,0b01AAAAAA,0x39]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale1
-// CHECK-ILP32: ldrsb   x29, [x28, :tprel_lo12_nc:var] // encoding: [0x9d,0bAAAAAA11,0b10AAAAAA,0x39]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale1
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST8_TPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST8_TPREL_LO12_NC [[VARSYM]]
-
-
-        strh w27, [x26, #:tprel_lo12:var]
-        ldrsh x25, [x24, #:tprel_lo12_nc:var]
-// CHECK-ILP32: strh    w27, [x26, :tprel_lo12:var] // encoding: [0x5b,0bAAAAAA11,0b00AAAAAA,0x79]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale2
-// CHECK-ILP32: ldrsh   x25, [x24, :tprel_lo12_nc:var] // encoding: [0x19,0bAAAAAA11,0b10AAAAAA,0x79]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale2
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST16_TPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST16_TPREL_LO12_NC [[VARSYM]]
-
-
-        ldr w23, [x22, #:tprel_lo12:var]
-        ldrsw x21, [x20, #:tprel_lo12_nc:var]
-// CHECK-ILP32: ldr     w23, [x22, :tprel_lo12:var] // encoding: [0xd7,0bAAAAAA10,0b01AAAAAA,0xb9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale4
-// CHECK-ILP32: ldrsw   x21, [x20, :tprel_lo12_nc:var] // encoding: [0x95,0bAAAAAA10,0b10AAAAAA,0xb9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale4
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST32_TPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST32_TPREL_LO12_NC [[VARSYM]]
-
-        ldr x19, [x18, #:tprel_lo12:var]
-        str x17, [x16, #:tprel_lo12_nc:var]
-// CHECK-ILP32: ldr     x19, [x18, :tprel_lo12:var] // encoding: [0x53,0bAAAAAA10,0b01AAAAAA,0xf9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale8
-// CHECK-ILP32: str     x17, [x16, :tprel_lo12_nc:var] // encoding: [0x11,0bAAAAAA10,0b00AAAAAA,0xf9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale8
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST64_TPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST64_TPREL_LO12_NC [[VARSYM]]
-
-
-   ldr q24, [x23, :tprel_lo12:var]
-   str q22, [x21, :tprel_lo12_nc:var]
-// CHECK-ILP32: ldr     q24, [x23, :tprel_lo12:var] // encoding: [0xf8,0bAAAAAA10,0b11AAAAAA,0x3d]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale16
-// CHECK-ILP32: str     q22, [x21, :tprel_lo12_nc:var] // encoding: [0xb6,0bAAAAAA10,0b10AAAAAA,0x3d]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale16
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST128_TPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLE_LDST128_TPREL_LO12_NC [[VARSYM]]
-
-////////////////////////////////////////////////////////////////////////////////
-// TLS local-dynamic forms
-////////////////////////////////////////////////////////////////////////////////
-
-        movz x5, #:dtprel_g1:var
-        movn x6, #:dtprel_g1:var
-        movz w7, #:dtprel_g1:var
-// CHECK-ILP32: movz    x5, #:dtprel_g1:var      // encoding: [0bAAA00101,A,0b101AAAAA,0x92]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_g1:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movn    x6, #:dtprel_g1:var      // encoding: [0bAAA00110,A,0b101AAAAA,0x92]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_g1:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movz    w7, #:dtprel_g1:var      // encoding: [0bAAA00111,A,0b101AAAAA,0x12]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_g1:var, kind: fixup_aarch64_movw
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_MOVW_DTPREL_G1 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_MOVW_DTPREL_G1 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_MOVW_DTPREL_G1 [[VARSYM]]
-
-
-        movz x11, #:dtprel_g0:var
-        movn x12, #:dtprel_g0:var
-        movz w13, #:dtprel_g0:var
-// CHECK-ILP32: movz    x11, #:dtprel_g0:var     // encoding: [0bAAA01011,A,0b100AAAAA,0x92]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_g0:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movn    x12, #:dtprel_g0:var     // encoding: [0bAAA01100,A,0b100AAAAA,0x92]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_g0:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movz    w13, #:dtprel_g0:var     // encoding: [0bAAA01101,A,0b100AAAAA,0x12]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_g0:var, kind: fixup_aarch64_movw
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_MOVW_DTPREL_G0 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_MOVW_DTPREL_G0 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_MOVW_DTPREL_G0 [[VARSYM]]
-
-
-        movk x15, #:dtprel_g0_nc:var
-        movk w16, #:dtprel_g0_nc:var
-// CHECK-ILP32: movk    x15, #:dtprel_g0_nc:var  // encoding: [0bAAA01111,A,0b100AAAAA,0xf2]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_g0_nc:var, kind: fixup_aarch64_movw
-// CHECK-ILP32: movk    w16, #:dtprel_g0_nc:var  // encoding: [0bAAA10000,A,0b100AAAAA,0x72]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_g0_nc:var, kind: fixup_aarch64_movw
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_MOVW_DTPREL_G0_NC [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_MOVW_DTPREL_G0_NC [[VARSYM]]
-
-
-        add x21, x22, #:dtprel_lo12:var
-// CHECK-ILP32: add     x21, x22, :dtprel_lo12:var // encoding: [0xd5,0bAAAAAA10,0b00AAAAAA,0x91]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_aarch64_add_imm12
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_ADD_DTPREL_LO12 [[VARSYM]]
-
-
-        add x25, x26, #:dtprel_lo12_nc:var
-// CHECK-ILP32: add     x25, x26, :dtprel_lo12_nc:var // encoding: [0x59,0bAAAAAA11,0b00AAAAAA,0x91]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_aarch64_add_imm12
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_ADD_DTPREL_LO12_NC [[VARSYM]]
-
-
-	add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12
-	add x0, x0, #:tprel_hi12:var_tlsle, lsl #12
-
-// CHECK-ELF-ILP32: R_AARCH64_P32_TLSLD_ADD_DTPREL_HI12 var_tlsld
-// CHECK-ELF-ILP32: R_AARCH64_P32_TLSLE_ADD_TPREL_HI12 var_tlsle
-
-
-        ldrb w29, [x30, #:dtprel_lo12:var]
-        ldrsb x29, [x28, #:dtprel_lo12_nc:var]
-// CHECK-ILP32: ldrb    w29, [x30, :dtprel_lo12:var] // encoding: [0xdd,0bAAAAAA11,0b01AAAAAA,0x39]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale1
-// CHECK-ILP32: ldrsb   x29, [x28, :dtprel_lo12_nc:var] // encoding: [0x9d,0bAAAAAA11,0b10AAAAAA,0x39]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale1
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST8_DTPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST8_DTPREL_LO12_NC [[VARSYM]]
-
-
-        strh w27, [x26, #:dtprel_lo12:var]
-        ldrsh x25, [x24, #:dtprel_lo12_nc:var]
-// CHECK-ILP32: strh    w27, [x26, :dtprel_lo12:var] // encoding: [0x5b,0bAAAAAA11,0b00AAAAAA,0x79]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale2
-// CHECK-ILP32: ldrsh   x25, [x24, :dtprel_lo12_nc:var] // encoding: [0x19,0bAAAAAA11,0b10AAAAAA,0x79]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale2
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST16_DTPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST16_DTPREL_LO12_NC [[VARSYM]]
-
-
-        ldr w23, [x22, #:dtprel_lo12:var]
-        ldrsw x21, [x20, #:dtprel_lo12_nc:var]
-// CHECK-ILP32: ldr     w23, [x22, :dtprel_lo12:var] // encoding: [0xd7,0bAAAAAA10,0b01AAAAAA,0xb9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale4
-// CHECK-ILP32: ldrsw   x21, [x20, :dtprel_lo12_nc:var] // encoding: [0x95,0bAAAAAA10,0b10AAAAAA,0xb9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale4
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST32_DTPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST32_DTPREL_LO12_NC [[VARSYM]]
-
-        ldr x19, [x18, #:dtprel_lo12:var]
-        str x17, [x16, #:dtprel_lo12_nc:var]
-// CHECK-ILP32: ldr     x19, [x18, :dtprel_lo12:var] // encoding: [0x53,0bAAAAAA10,0b01AAAAAA,0xf9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale8
-// CHECK-ILP32: str     x17, [x16, :dtprel_lo12_nc:var] // encoding: [0x11,0bAAAAAA10,0b00AAAAAA,0xf9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale8
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST64_DTPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST64_DTPREL_LO12_NC [[VARSYM]]
-
-        ldr q24, [x23, #:dtprel_lo12:var]
-        str q22, [x21, #:dtprel_lo12_nc:var]
-// CHECK-ILP32: ldr     q24, [x23, :dtprel_lo12:var] // encoding: [0xf8,0bAAAAAA10,0b11AAAAAA,0x3d]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_aarch64_ldst_imm12_scale16
-// CHECK-ILP32: str     q22, [x21, :dtprel_lo12_nc:var] // encoding: [0xb6,0bAAAAAA10,0b10AAAAAA,0x3d]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_aarch64_ldst_imm12_scale16
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST128_DTPREL_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSLD_LDST128_DTPREL_LO12_NC [[VARSYM]]
-
-////////////////////////////////////////////////////////////////////////////////
-// TLS descriptor forms
-////////////////////////////////////////////////////////////////////////////////
-
-        adrp x8, :tlsdesc:var
-        ldr w7, [x6, #:tlsdesc_lo12:var]
-        add x5, x4, #:tlsdesc_lo12:var
-        .tlsdesccall var
-        blr x3
-
-// CHECK-ILP32: adrp    x8, :tlsdesc:var        // encoding: [0x08'A',A,A,0x90'A']
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tlsdesc:var, kind: fixup_aarch64_pcrel_adrp_imm21
-// CHECK-ILP32: ldr     w7, [x6, :tlsdesc_lo12:var] // encoding: [0xc7,0bAAAAAA00,0b01AAAAAA,0xb9]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tlsdesc_lo12:var, kind: fixup_aarch64_ldst_imm12_scale4
-// CHECK-ILP32: add     x5, x4, :tlsdesc_lo12:var // encoding: [0x85,0bAAAAAA00,0b00AAAAAA,0x91]
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: :tlsdesc_lo12:var, kind: fixup_aarch64_add_imm12
-// CHECK-ILP32: .tlsdesccall var                // encoding: []
-// CHECK-ILP32-NEXT:                                 //   fixup A - offset: 0, value: var, kind: fixup_aarch64_tlsdesc_call
-// CHECK-ILP32: blr     x3                      // encoding: [0x60,0x00,0x3f,0xd6]
-
-
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSDESC_ADR_PAGE21 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSDESC_LD32_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSDESC_ADD_LO12 [[VARSYM]]
-// CHECK-ELF-ILP32-NEXT:     {{0x[0-9A-F]+}} R_AARCH64_P32_TLSDESC_CALL [[VARSYM]]
-
-        // Make sure symbol 5 has type STT_TLS:
-
-// CHECK-ELF-ILP32:      Symbols [
-// CHECK-ELF-ILP32:        Symbol {
-// CHECK-ELF-ILP32:          Name: var
-// CHECK-ELF-ILP32-NEXT:     Value:
-// CHECK-ELF-ILP32-NEXT:     Size:
-// CHECK-ELF-ILP32-NEXT:     Binding: Global
-// CHECK-ELF-ILP32-NEXT:     Type: TLS
diff --git a/test/MC/AArch64/elf-reloc-pcreladdressing-ilp32.s b/test/MC/AArch64/elf-reloc-pcreladdressing-ilp32.s
deleted file mode 100644
index c08192e7e0d6..000000000000
--- a/test/MC/AArch64/elf-reloc-pcreladdressing-ilp32.s
+++ /dev/null
@@ -1,17 +0,0 @@
-// RUN: llvm-mc -target-abi=ilp32 -triple=aarch64-none-linux-gnu \
-// RUN:   -filetype=obj %s -o - | \
-// RUN:   llvm-readobj -r | FileCheck -check-prefix=OBJ-ILP32 %s
-        adr x2, some_label
-        adrp x5, some_label
-
-        adrp x5, :got:some_label
-        ldr w0, [x5, #:got_lo12:some_label]
-
-// OBJ-ILP32:      Relocations [
-// OBJ-ILP32-NEXT:   Section {{.*}} .rela.text {
-// OBJ-ILP32-NEXT:     0x0 R_AARCH64_P32_ADR_PREL_LO21    some_label 0x0
-// OBJ-ILP32-NEXT:     0x4 R_AARCH64_P32_ADR_PREL_PG_HI21 some_label 0x0
-// OBJ-ILP32-NEXT:     0x8 R_AARCH64_P32_ADR_GOT_PAGE     some_label 0x0
-// OBJ-ILP32-NEXT:     0xC R_AARCH64_P32_LD32_GOT_LO12_NC some_label 0x0
-// OBJ-ILP32-NEXT:   }
-// OBJ-ILP32-NEXT: ]
diff --git a/test/MC/AArch64/lp64-diagnostics.s b/test/MC/AArch64/lp64-diagnostics.s
deleted file mode 100644
index 942923ffccb1..000000000000
--- a/test/MC/AArch64/lp64-diagnostics.s
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2> %t2 -filetype=obj \
-// RUN:   >/dev/null
-// RUN: FileCheck --check-prefix=CHECK-ERROR %s < %t2
-
-   ldr w24, [x23, :tlsdesc_lo12:sym]
-   ldr s22, [x21, :tlsdesc_lo12:sym]
-
-// CHECK-ERROR: error: LP64 4 byte TLSDESC load/store relocation not supported (ILP32 eqv: TLSDESC_LD64_LO12)
-// CHECK-ERROR:   ldr w24, [x23, :tlsdesc_lo12:sym]
-// CHECK-ERROR:   ^
-// CHECK-ERROR: error: LP64 4 byte TLSDESC load/store relocation not supported (ILP32 eqv: TLSDESC_LD64_LO12)
-// CHECK-ERROR:   ldr s22, [x21, :tlsdesc_lo12:sym]
-// CHECK-ERROR:   ^
diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index 9dd49e51d91b..1b865d37bf0b 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt
@@ -773,3 +773,21 @@
 
 #CHECK: getsec
 0x0f 0x37
+
+#CHECK: llwpcb %ecx
+0x8f 0xe9 0x78 0x12 0xc1
+
+#CHECK: slwpcb %ecx
+0x8f 0xe9 0x78 0x12 0xc9
+
+# CHECK: lwpins $305419896, %ebx, %eax
+0x8f 0xea 0x78 0x12 0xc3 0x78 0x56 0x34 0x12
+
+# CHECK: lwpins $591751049, (%esp), %edx
+0x8f 0xea 0x68 0x12 0x04 0x24 0x89 0x67 0x45 0x23
+
+# CHECK: lwpval $1737075661, %ebx, %eax
+0x8f 0xea 0x78 0x12 0xcb 0xcd 0xab 0x89 0x67
+
+# CHECK: lwpval $2309737967, (%esp), %edx
+0x8f 0xea 0x68 0x12 0x0c 0x24 0xef 0xcd 0xab 0x89
diff --git a/test/MC/Disassembler/X86/x86-64.txt b/test/MC/Disassembler/X86/x86-64.txt
index 1511347306a7..659ad9051fd5 100644
--- a/test/MC/Disassembler/X86/x86-64.txt
+++ b/test/MC/Disassembler/X86/x86-64.txt
@@ -456,3 +456,27 @@
 
 # CHECK: callq -32769
 0xe8 0xff 0x7f 0xff 0xff
+
+# CHECK: llwpcb %rax
+0x8f 0xe9 0xf8 0x12 0xc0
+
+# CHECK: slwpcb %rax
+0x8f 0xe9 0xf8 0x12 0xc8
+
+# CHECK: lwpins $305419896, %ebx, %rax
+0x8f 0xea 0xf8 0x12 0xc3 0x78 0x56 0x34 0x12
+
+# CHECK: lwpins $591751049, (%rsp), %rdx
+0x8f 0xea 0xe8 0x12 0x04 0x24 0x89 0x67 0x45 0x23
+
+# CHECK: lwpins $591751049, (%esp), %edx
+0x67 0x8f 0xea 0x68 0x12 0x04 0x24 0x89 0x67 0x45 0x23
+
+# CHECK: lwpval $1737075661, %ebx, %rax
+0x8f 0xea 0xf8 0x12 0xcb 0xcd 0xab 0x89 0x67
+
+# CHECK: lwpval $2309737967, (%rsp), %rdx
+0x8f 0xea 0xe8 0x12 0x0c 0x24 0xef 0xcd 0xab 0x89
+
+# CHECK: lwpval $2309737967, (%esp), %edx
+0x67 0x8f 0xea 0x68 0x12 0x0c 0x24 0xef 0xcd 0xab 0x89
diff --git a/test/MC/Hexagon/PacketRules/endloop_branches.s b/test/MC/Hexagon/PacketRules/endloop_branches.s
new file mode 100644
index 000000000000..fbaa246c0684
--- /dev/null
+++ b/test/MC/Hexagon/PacketRules/endloop_branches.s
@@ -0,0 +1,12 @@
+# RUN: not llvm-mc -triple=hexagon -filetype=asm %s 2>&1 | FileCheck %s
+
+# Check that a branch in an end-loop packet is caught.
+
+{ jump unknown
+}:endloop0
+# CHECK: 5:3: error: packet marked with `:endloop0' cannot contain instructions that modify register
+
+{ jump unknown
+}:endloop1
+
+# CHECK: 9:3: error: packet marked with `:endloop1' cannot contain instructions that modify register
diff --git a/test/MC/Hexagon/PacketRules/restrict_ax.s b/test/MC/Hexagon/PacketRules/restrict_ax.s
new file mode 100644
index 000000000000..b8f7a1f782c3
--- /dev/null
+++ b/test/MC/Hexagon/PacketRules/restrict_ax.s
@@ -0,0 +1,4 @@
+{ r0=memw_locked(r0)
+  r1=-mpyi(r0,#0) }
+# RUN: not llvm-mc -arch=hexagon -filetype=asm %s 2>%t; FileCheck %s --check-prefix=CHECK00 <%t
+# CHECK00: 1:3: error: Instruction can only be in a packet with ALU or non-FPU XTYPE instructions
diff --git a/test/MC/Hexagon/dealloc-return-jump.s b/test/MC/Hexagon/dealloc-return-jump.s
new file mode 100644
index 000000000000..0d480bef85d2
--- /dev/null
+++ b/test/MC/Hexagon/dealloc-return-jump.s
@@ -0,0 +1,7 @@
+# RUN: not llvm-mc -arch=hexagon -mcpu=hexagonv62 -filetype=obj -o - %s
+# Check that a duplex involving dealloc_return is correctly checked
+# dealloc_return cannot be involved in a double jump packet
+
+{ r0=add(r0,#-1)
+  p0=cmp.eq(r0,r0); if (p0.new) jump:nt 0
+  if (p0) dealloc_return }
diff --git a/test/MC/Hexagon/endloop.s b/test/MC/Hexagon/endloop.s
deleted file mode 100644
index d537eb00ed05..000000000000
--- a/test/MC/Hexagon/endloop.s
+++ /dev/null
@@ -1,19 +0,0 @@
-# RUN: not llvm-mc -triple=hexagon -filetype=asm %s 2>&1 | FileCheck %s
-
-# Check that a branch in an end-loop packet is caught.
-
-1:
-{
-	r0 = #1
-	p0 = cmp.eq (r1, r2)
-	if (p0) jump 1b
-}:endloop0
-
-2:
-{
-        r0 = #1
-        p0 = cmp.eq (r1, r2)
-        if (p0) jump 2b
-}:endloop1
-
-# CHECK: rror: packet marked with `:endloop{{.}}' cannot contain instructions that modify register
diff --git a/test/MC/Hexagon/iconst.s b/test/MC/Hexagon/iconst.s
index 917cc64ba953..156d1abe8cb4 100644
--- a/test/MC/Hexagon/iconst.s
+++ b/test/MC/Hexagon/iconst.s
@@ -2,5 +2,5 @@
 
 a:
 # CHECK: r0 = add(r0,#0)
-# CHECK: R_HEX_23_REG
+# CHECK: R_HEX_27_REG
 r0 = iconst(#a)
diff --git a/test/MC/Hexagon/plt-rel.s b/test/MC/Hexagon/plt-rel.s
new file mode 100644
index 000000000000..cba3d7e9ef2e
--- /dev/null
+++ b/test/MC/Hexagon/plt-rel.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -d -r - | FileCheck %s
+
+call foo@GDPLT
+# CHECK: R_HEX_GD_PLT_B22_PCREL
+call ##foo@GDPLT
+# CHECK:  R_HEX_GD_PLT_B32_PCREL_X
+# CHECK-NEXT: R_HEX_GD_PLT_B22_PCREL_X
+
+call foo@LDPLT
+# CHECK:  R_HEX_LD_PLT_B22_PCREL
+call ##foo@LDPLT
+# CHECK:  R_HEX_LD_PLT_B32_PCREL_X
+# CHECK-NEXT:  R_HEX_LD_PLT_B22_PCREL_X
diff --git a/test/MC/Hexagon/solo-axok.s b/test/MC/Hexagon/solo-axok.s
new file mode 100644
index 000000000000..2df5796e628d
--- /dev/null
+++ b/test/MC/Hexagon/solo-axok.s
@@ -0,0 +1,9 @@
+# RUN: not llvm-mc -arch=hexagon -filetype=asm -mcpu=hexagonv55 %s 2>%t; FileCheck %s < %t
+#
+{
+  sp=asrh(r6)
+  l2fetch(fp,r23:22)
+  p2=r7
+  p1=dfclass(r31:30,#6)
+}
+# CHECK: rror: Instruction can only
diff --git a/test/MC/X86/lwp-x86_64.s b/test/MC/X86/lwp-x86_64.s
new file mode 100644
index 000000000000..92f15967461e
--- /dev/null
+++ b/test/MC/X86/lwp-x86_64.s
@@ -0,0 +1,25 @@
+# RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s --check-prefix=CHECK
+
+llwpcb %rcx
+# CHECK: llwpcb %rcx
+# CHECK: encoding: [0x8f,0xe9,0xf8,0x12,0xc1]
+
+slwpcb %rax
+# CHECK: slwpcb %rax
+# CHECK: encoding: [0x8f,0xe9,0xf8,0x12,0xc8]
+
+lwpins $305419896, %ebx, %rax
+# CHECK: lwpins $305419896, %ebx, %rax
+# CHECK: encoding: [0x8f,0xea,0xf8,0x12,0xc3,0x78,0x56,0x34,0x12]
+
+lwpins $591751049, (%rsp), %rdx
+# CHECK: lwpins $591751049, (%rsp), %rdx
+# CHECK: encoding: [0x8f,0xea,0xe8,0x12,0x04,0x24,0x89,0x67,0x45,0x23]
+
+lwpval $1737075661, %ebx, %rax
+# CHECK: lwpval $1737075661, %ebx, %rax
+# CHECK: encoding: [0x8f,0xea,0xf8,0x12,0xcb,0xcd,0xab,0x89,0x67]
+
+lwpval $2309737967, (%rsp), %rdx
+# CHECK: lwpval $2309737967, (%rsp), %rdx
+# CHECK: encoding: [0x8f,0xea,0xe8,0x12,0x0c,0x24,0xef,0xcd,0xab,0x89]
diff --git a/test/MC/X86/lwp.s b/test/MC/X86/lwp.s
new file mode 100644
index 000000000000..43d6f2cd7e3b
--- /dev/null
+++ b/test/MC/X86/lwp.s
@@ -0,0 +1,32 @@
+# RUN: llvm-mc -triple i686-unknown-unknown --show-encoding %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X86
+# RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X64
+
+llwpcb %ecx
+# CHECK: llwpcb %ecx
+# CHECK-X86: encoding: [0x8f,0xe9,0x78,0x12,0xc1]
+# CHECK-X64: encoding: [0x8f,0xe9,0x78,0x12,0xc1]
+
+slwpcb %eax
+# CHECK: slwpcb %eax
+# CHECK-X86: encoding: [0x8f,0xe9,0x78,0x12,0xc8]
+# CHECK-X64: encoding: [0x8f,0xe9,0x78,0x12,0xc8]
+
+lwpins $305419896, %ebx, %eax
+# CHECK: lwpins $305419896, %ebx, %eax
+# CHECK-X86: encoding: [0x8f,0xea,0x78,0x12,0xc3,0x78,0x56,0x34,0x12]
+# CHECK-X64: encoding: [0x8f,0xea,0x78,0x12,0xc3,0x78,0x56,0x34,0x12]
+
+lwpins $591751049, (%esp), %edx
+# CHECK: lwpins $591751049, (%esp), %edx
+# CHECK-X86: encoding: [0x8f,0xea,0x68,0x12,0x04,0x24,0x89,0x67,0x45,0x23]
+# CHECK-X64: encoding: [0x67,0x8f,0xea,0x68,0x12,0x04,0x24,0x89,0x67,0x45,0x23]
+
+lwpval $1737075661, %ebx, %eax
+# CHECK: lwpval $1737075661, %ebx, %eax
+# CHECK-X86: encoding: [0x8f,0xea,0x78,0x12,0xcb,0xcd,0xab,0x89,0x67]
+# CHECK-X64: encoding: [0x8f,0xea,0x78,0x12,0xcb,0xcd,0xab,0x89,0x67]
+
+lwpval $2309737967, (%esp), %edx
+# CHECK: lwpval $2309737967, (%esp), %edx
+# CHECK-X86: encoding: [0x8f,0xea,0x68,0x12,0x0c,0x24,0xef,0xcd,0xab,0x89]
+# CHECK-X64: encoding: [0x67,0x8f,0xea,0x68,0x12,0x0c,0x24,0xef,0xcd,0xab,0x89]
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
index 6b50e2b4f2fc..b7f87fe1db0e 100644
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -503,4 +503,4 @@ define i64 @testcttzneg(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
 ; CHECK: declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) #0
 ; CHECK: declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) #0
 ; CHECK: declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) #0
-; CHECK: attributes #0 = { nounwind readnone }
+; CHECK: attributes #0 = { nounwind readnone speculatable }
diff --git a/test/Transforms/CodeExtractor/PartialInlineOptRemark.ll b/test/Transforms/CodeExtractor/PartialInlineOptRemark.ll
index c8808182f717..b2442b8b173c 100644
--- a/test/Transforms/CodeExtractor/PartialInlineOptRemark.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineOptRemark.ll
@@ -64,6 +64,22 @@ bb2:                                              ; preds = %bb1, %bb
   ret i32 %tmp3, !dbg !19
 }
 
+define i32 @bar_cold(i32 %arg) local_unnamed_addr #3 !dbg !5 {
+bb:
+  %tmp = icmp slt i32 %arg, 0, !dbg !7
+  br i1 %tmp, label %bb1, label %bb2, !dbg !8
+
+bb1:                                              ; preds = %bb
+  tail call void (...) @foo() #0, !dbg !9
+  tail call void (...) @foo() #0, !dbg !10
+  tail call void (...) @foo() #0, !dbg !11
+  br label %bb2, !dbg !18
+
+bb2:                                              ; preds = %bb1, %bb
+  %tmp3 = phi i32 [ 0, %bb1 ], [ 1, %bb ]
+  ret i32 %tmp3, !dbg !19
+}
+
 ; Function Attrs: nounwind
 declare void @foo(...) local_unnamed_addr #0
 
@@ -73,16 +89,19 @@ bb:
 ; CHECK:remark{{.*}}bar partially inlined into dummy_caller
 ; CHECK-NOT:remark{{.*}}bar_noinline partially inlined into dummy_caller
 ; CHECK-NOT:remark{{.*}}bar_alwaysinline partially inlined into dummy_caller
+; CHECK-NOT:remark{{.*}}bar_cold partially inlined into dummy_caller
 ; LIMIT-NOT:remark{{.*}}bar partially inlined into dummy_caller
   %tmp = tail call i32 @bar(i32 %arg), !dbg !21
   %tmp2 = tail call i32 @bar_noinline(i32 %arg), !dbg !21
   %tmp3 = tail call i32 @bar_alwaysinline(i32 %arg), !dbg !21
+  %tmp4 = tail call i32 @bar_cold(i32 %arg), !dbg !21
   ret i32 %tmp, !dbg !22
 }
 
 attributes #0 = { nounwind }
 attributes #1 = { noinline nounwind }
 attributes #2 = { alwaysinline nounwind }
+attributes #3 = { cold nounwind }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3}
diff --git a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
index dfa999e1b34f..aec00e81bb48 100644
--- a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
+++ b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
@@ -39,7 +39,7 @@ bb2:                                              ; preds = %bb1, %bb
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 ; CHECK: attributes #0 = { nounwind ssp }
-; CHECK: attributes #1 = { nounwind readnone }
+; CHECK: attributes #1 = { nounwind readnone speculatable }
 ; CHECK: attributes #2 = { noinline nounwind ssp }
 ; CHECK: attributes [[NUW]] = { nounwind }
 
diff --git a/test/Transforms/FunctionAttrs/readattrs.ll b/test/Transforms/FunctionAttrs/readattrs.ll
index 988557e27152..3728a7179724 100644
--- a/test/Transforms/FunctionAttrs/readattrs.ll
+++ b/test/Transforms/FunctionAttrs/readattrs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -functionattrs -S | FileCheck %s
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes='cgscc(function-attrs)' -S | FileCheck %s
 @x = global i32 0
@@ -68,22 +69,22 @@ entry:
 }
 
 ; CHECK: declare void @llvm.masked.scatter
-declare void @llvm.masked.scatter.v4i32(<4 x i32>%val, <4 x i32*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>%val, <4 x i32*>, i32, <4 x i1>)
 
 ; CHECK-NOT: readnone
 ; CHECK-NOT: readonly
 ; CHECK: define void @test9
 define void @test9(<4 x i32*> %ptrs, <4 x i32>%val) {
-  call void @llvm.masked.scatter.v4i32(<4 x i32>%val, <4 x i32*> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>)
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>%val, <4 x i32*> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>)
   ret void
 }
 
 ; CHECK: declare <4 x i32> @llvm.masked.gather
-declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
 ; CHECK: readonly
 ; CHECK: define <4 x i32> @test10
 define <4 x i32> @test10(<4 x i32*> %ptrs) {
-  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>, <4 x i32>undef)
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>, <4 x i32>undef)
   ret <4 x i32> %res
 }
 
diff --git a/test/Transforms/FunctionImport/unnamed-globals.ll b/test/Transforms/FunctionImport/unnamed-globals.ll
deleted file mode 100644
index 167fad28f439..000000000000
--- a/test/Transforms/FunctionImport/unnamed-globals.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; Make sure we don't crash when referencing an unnamed global.
-; RUN: opt %s -module-summary-analysis -S
-
-@0 = external global [1 x { i64 }]
-
-define internal void @tinkywinky() {
-  call void @patatino(i64 ptrtoint ([1 x { i64 }]* @0 to i64), i64 4)
-  ret void
-}
-declare void @patatino(i64, i64)
diff --git a/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll b/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll
index 3f8fdcc8eafb..5b10a1bfc79e 100644
--- a/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll
+++ b/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
 
-declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
-declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
 
 ; This test ensures that masked scatter and gather operations, which take vectors of pointers,
 ; do not have pointer aliasing ignored when being processed.
@@ -20,18 +20,18 @@ entry:
   %tmp.i = insertelement <2 x i32*> undef, i32* %tmp.0, i32 0
   %tmp = insertelement <2 x i32*> %tmp.i, i32* %tmp.1, i32 1
   ; Read from in1 and in2
-  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
-  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
   ; Store in1 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
   ; Read in1 from the allocas
   ; This gather should alias the scatter we just saw
-  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
   ; Store in2 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
   ; Read in2 from the allocas
   ; This gather should alias the scatter we just saw, and not be eliminated
-  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
   ; Store in2 to out for good measure
   %tmp.v.1.0 = extractelement <2 x i32> %tmp.v.1, i32 0
   %tmp.v.1.1 = extractelement <2 x i32> %tmp.v.1, i32 1
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index b9e208440581..66ab7f48aeff 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -284,7 +284,7 @@ define i32 @cttz(i32 %a) {
 define i1 @cttz_knownbits(i32 %arg) {
 ; CHECK-LABEL: @cttz_knownbits(
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[ARG:%.*]], 4
-; CHECK-NEXT:    [[CNT:%.*]] = call i32 @llvm.cttz.i32(i32 [[OR]], i1 true) #0
+; CHECK-NEXT:    [[CNT:%.*]] = call i32 @llvm.cttz.i32(i32 [[OR]], i1 true)
 ; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[CNT]], 4
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
@@ -307,7 +307,7 @@ define i8 @ctlz(i8 %a) {
 define i1 @ctlz_knownbits(i8 %arg) {
 ; CHECK-LABEL: @ctlz_knownbits(
 ; CHECK-NEXT:    [[OR:%.*]] = or i8 [[ARG:%.*]], 32
-; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctlz.i8(i8 [[OR]], i1 true) #0
+; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctlz.i8(i8 [[OR]], i1 true)
 ; CHECK-NEXT:    [[RES:%.*]] = icmp eq i8 [[CNT]], 4
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
diff --git a/test/Transforms/InstCombine/masked_intrinsics.ll b/test/Transforms/InstCombine/masked_intrinsics.ll
index ce79ce56b5cb..d5403d17ddc2 100644
--- a/test/Transforms/InstCombine/masked_intrinsics.ll
+++ b/test/Transforms/InstCombine/masked_intrinsics.ll
@@ -2,8 +2,8 @@
 
 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
 declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask)
-declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru)
-declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru)
+declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask)
 
 define <2 x double> @load_zeromask(<2 x double>* %ptr, <2 x double> %passthru)  {
   %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru)
@@ -49,7 +49,7 @@ define void @store_onemask(<2 x double>* %ptr, <2 x double> %val)  {
 }
 
 define <2 x double> @gather_zeromask(<2 x double*> %ptrs, <2 x double> %passthru)  {
-  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 5, <2 x i1> zeroinitializer, <2 x double> %passthru)
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 5, <2 x i1> zeroinitializer, <2 x double> %passthru)
   ret <2 x double> %res
 
 ; CHECK-LABEL: @gather_zeromask(
@@ -57,7 +57,7 @@ define <2 x double> @gather_zeromask(<2 x double*> %ptrs, <2 x double> %passthru
 }
 
 define void @scatter_zeromask(<2 x double*> %ptrs, <2 x double> %val)  {
-  call void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32 6, <2 x i1> zeroinitializer)
+  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32 6, <2 x i1> zeroinitializer)
   ret void
 
 ; CHECK-LABEL: @scatter_zeromask(
diff --git a/test/Transforms/InstCombine/pow-sqrt.ll b/test/Transforms/InstCombine/pow-sqrt.ll
index 52175f1b1247..82db192ed801 100644
--- a/test/Transforms/InstCombine/pow-sqrt.ll
+++ b/test/Transforms/InstCombine/pow-sqrt.ll
@@ -6,7 +6,7 @@ define double @pow_half(double %x) {
 }
 
 ; CHECK-LABEL: define double @pow_half(
-; CHECK-NEXT:  %sqrt = call fast double @sqrt(double %x)
+; CHECK-NEXT:  %sqrt = call fast double @sqrt(double %x) #1
 ; CHECK-NEXT:  ret double %sqrt
 
 define double @pow_neghalf(double %x) {
@@ -15,8 +15,11 @@ define double @pow_neghalf(double %x) {
 }
 
 ; CHECK-LABEL: define double @pow_neghalf(
-; CHECK-NEXT: %sqrt = call fast double @sqrt(double %x) #0
+; CHECK-NEXT: %sqrt = call fast double @sqrt(double %x) #1
 ; CHECK-NEXT: %sqrtrecip = fdiv fast double 1.000000e+00, %sqrt
 ; CHECK-NEXT: ret double %sqrtrecip
 
-declare double @llvm.pow.f64(double, double)
+declare double @llvm.pow.f64(double, double) #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind readnone }
diff --git a/test/Transforms/InstCombine/sub-xor.ll b/test/Transforms/InstCombine/sub-xor.ll
index 812305d8e489..adcca8480594 100644
--- a/test/Transforms/InstCombine/sub-xor.ll
+++ b/test/Transforms/InstCombine/sub-xor.ll
@@ -27,7 +27,7 @@ declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 
 define i32 @test2(i32 %x) nounwind {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[COUNT:%.*]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true) #0
+; CHECK-NEXT:    [[COUNT:%.*]] = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
 ; CHECK-NEXT:    [[SUB:%.*]] = xor i32 [[COUNT]], 31
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
diff --git a/test/Transforms/LoopDeletion/unreachable-loops.ll b/test/Transforms/LoopDeletion/unreachable-loops.ll
new file mode 100644
index 000000000000..147a85670121
--- /dev/null
+++ b/test/Transforms/LoopDeletion/unreachable-loops.ll
@@ -0,0 +1,336 @@
+; RUN: opt < %s -loop-deletion -verify-dom-info -S | FileCheck %s
+
+; Checking that we can delete loops that are never executed.
+; We do not change the constant conditional branch statement (where the not-taken target
+; is the loop) to an unconditional one.
+
+; delete the infinite loop because it is never executed.
+define void @test1(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: test1
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br i1 true, label %return, label %bb.preheader
+; CHECK-NOT: bb:
+entry:
+  br i1 true, label %return, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 true, label %bb, label %return
+
+return:
+  ret void
+}
+
+; FIXME: We can delete this infinite loop. Currently we do not,
+; because the infinite loop has no exit block.
+define void @test2(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: test2
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br i1 true, label %return, label %bb.preheader
+; CHECK-LABEL: bb:
+; CHECK: br label %bb
+entry:
+  br i1 true, label %return, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br label %bb
+
+return:
+  ret void
+}
+
+; There are multiple exiting blocks and a single exit block. 
+; Since it is a never executed loop, we do not care about the values
+; from different exiting paths and we can
+; delete the loop.
+define i64 @test3(i64 %n, i64 %m, i64 %maybe_zero) nounwind {
+
+; CHECK-NOT: bb:
+; CHECK-NOT: bb2:
+; CHECK-NOT: bb3:
+; CHECK-LABEL: return.loopexit:
+; CHECK-NEXT: %x.lcssa.ph = phi i64 [ undef, %bb.preheader ]
+; CHECK-NEXT: br label %return
+; CHECK-LABEL: return:
+; CHECK-NEXT: %x.lcssa = phi i64 [ 20, %entry ], [ %x.lcssa.ph, %return.loopexit ]
+; CHECK-NEXT: ret i64 %x.lcssa
+entry:
+  br i1 false, label %bb, label %return
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb3 ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  br i1 %t1, label %bb2, label %return
+
+bb2:
+  %t2 = icmp slt i64 %x.0, %m
+  %unused1 = udiv i64 42, %maybe_zero
+  br i1 %t2, label %bb3, label %return
+
+bb3:
+  %t3 = icmp slt i64 %x.0, %m
+  %unused2 = sdiv i64 42, %maybe_zero
+  br i1 %t3, label %bb, label %return
+
+return:
+; the only valid value fo x.lcssa is 20.
+  %x.lcssa = phi i64 [ 12, %bb ], [ 14, %bb2 ], [ 16, %bb3 ], [20, %entry ]
+  ret i64 %x.lcssa
+}
+
+; Cannot delete the loop, since it may be executed at runtime.
+define void @test4(i64 %n, i64 %m, i1 %cond) {
+; CHECK-LABEL: test4
+; CHECK-LABEL: bb:
+entry:
+  br i1 %cond, label %looppred1, label %looppred2
+
+looppred1:
+  br i1 true, label %return, label %bb
+
+looppred2:
+  br i1 false, label %return, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %looppred1 ], [ 1, %looppred2 ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 true, label %bb, label %return
+
+return:
+  ret void
+}
+
+; multiple constant conditional branches with loop not-taken in all cases.
+define void @test5(i64 %n, i64 %m, i1 %cond) nounwind {
+; CHECK-LABEL: test5
+; CHECK-LABEL: looppred1:
+; CHECK-NEXT: br i1 true, label %return, label %bb.preheader
+; CHECK-LABEL: looppred2:
+; CHECK-NEXT: br i1 true, label %return, label %bb.preheader
+; CHECK-NOT: bb:
+entry:
+  br i1 %cond, label %looppred1, label %looppred2
+
+looppred1:
+  br i1 true, label %return, label %bb
+
+looppred2:
+  br i1 true, label %return, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %looppred1 ], [ 1, %looppred2 ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 true, label %bb, label %return
+
+return:
+  ret void
+}
+
+; Don't delete this infinite loop because the loop 
+; is executable at runtime.
+define void @test6(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: test6
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br i1 true, label %bb.preheader, label %bb.preheader
+; CHECK: bb:
+entry:
+  br i1 true, label %bb, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ 0, %entry ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 true, label %bb, label %return
+
+return:
+  ret void
+}
+
+declare i64 @foo(i64)
+; The loop L2 is never executed and is a subloop, with an 
+; exit block that branches back to parent loop.
+; Here we can delete loop L2, while L1 still exists.
+define i64 @test7(i64 %n) {
+; CHECK-LABEL: test7
+; CHECK-LABEL: L1:
+; CHECK: br i1 true, label %L1Latch, label %L2.preheader
+; CHECK-LABEL: L2.preheader:
+; CHECK-NEXT: br label %L1Latch.loopexit
+; CHECK-LABEL: L1Latch.loopexit:
+; CHECK: br label %L1Latch
+; CHECK-LABEL: L1Latch:
+; CHECK-NEXT: %y = phi i64 [ %y.next, %L1 ], [ %y.L2.lcssa, %L1Latch.loopexit ]
+; CHECK: br i1 %cond2, label %exit, label %L1
+entry: 
+  br label %L1
+
+L1:
+  %y.next = phi i64 [ 0, %entry ], [ %y.add, %L1Latch ]
+  br i1 true, label %L1Latch, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L2 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L2, label %L1Latch
+
+L1Latch:
+ %y = phi i64 [ %y.next, %L1 ], [ %y.L2, %L2 ]
+ %y.add = add i64 %y, %n
+ %cond2 = icmp eq i64 %y.add, 42
+ br i1 %cond2, label %exit, label %L1
+
+exit:
+ ret i64 %y.add
+}
+
+
+; Show recursive deletion of loops. Since we start with subloops and progress outward 
+; to parent loop, we first delete the loop L2. Now loop L1 becomes a non-loop since it's backedge
+; from L2's preheader to L1's exit block is never taken. So, L1 gets deleted as well.
+define void @test8(i64 %n) {
+; CHECK-LABEL: test8
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br label %exit
+; CHECK-LABEL: exit:
+; CHECK-NEXT: ret void
+entry: 
+  br label %L1
+
+L1:
+  br i1 true, label %exit, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L2 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L2, label %L1
+
+exit:
+ ret void
+}
+
+
+; Delete a loop (L2) which has subloop (L3).
+; Here we delete loop L2, but leave L3 as is.
+; FIXME: Can delete L3 as well, by iteratively going backward through the single
+; predecessor of L3 until we reach L1's block that guarantees L3 is never
+; executed.
+define void @test9(i64 %n) {
+; CHECK-LABEL: test9
+; CHECK-LABEL: L2.preheader:
+; CHECK-NEXT: br label %L3.preheader
+; CHECK-NOT: L2:
+; CHECK-LABEL: L3.preheader:
+; CHECK-NEXT: %y.L2.lcssa = phi i64 [ undef, %L2.preheader ]
+; CHECK-NEXT: br label %L3
+; CHECK-LABEL: L3:
+; CHECK: br i1 %cond2, label %L3, label %L1.loopexit
+entry: 
+  br label %L1
+
+L1:
+  br i1 true, label %exit, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L2 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L2, label %L3
+
+L3: 
+  %cond2 = icmp slt i64 %y.L2, %n
+  br i1 %cond2, label %L3, label %L1
+
+exit:
+ ret void
+}
+
+; We cannot delete L3 because of call within it.
+; Since L3 is not deleted, and entirely contained within L2, L2 is also not
+; deleted.
+; FIXME: We can delete unexecutable loops having
+; subloops contained entirely within them.
+define void @test10(i64 %n) {
+; CHECK-LABEL: test10
+; CHECK: L2:
+; CHECK: L3:
+entry: 
+  br label %L1
+
+L1:
+  br i1 true, label %exit, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L3 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L1, label %L3
+
+L3:
+  %y.L3 = phi i64 [ %y.L2, %L2 ], [ %y.L3.next, %L3 ]
+  %y.L3.next = add i64 %y.L3, 1
+  %dummy = call i64 @foo(i64 %y.L3.next)
+  %cond2 = icmp slt i64 %y.L3, %n
+  br i1 %cond2, label %L3, label %L2
+
+exit:
+ ret void
+}
+
+; same as test10, but L3 does not contain call.
+; So, in the first iteration, all statements of L3 are made invariant, and L3 is
+; deleted.
+; In the next iteration, since L2 is never executed and has no subloops, we delete
+; L2 as well. Finally, the outermost loop L1 is deleted.
+define void @test11(i64 %n) {
+; CHECK-LABEL: test11
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br label %exit
+; CHECK-LABEL: exit:
+; CHECK-NEXT: ret void
+entry: 
+  br label %L1
+
+L1:
+  br i1 true, label %exit, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L3 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L1, label %L3
+
+L3: 
+  %y.L3 = phi i64 [ %y.L2, %L2 ], [ %y.L3.next, %L3 ]
+  %y.L3.next = add i64 %y.L3, 1
+  %cond2 = icmp slt i64 %y.L3, %n
+  br i1 %cond2, label %L3, label %L2
+
+exit:
+ ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index 82f2e064a581..e18159f24624 100644
--- a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -36,7 +36,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
 ; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
diff --git a/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 2ce357540d0b..8ef59613e646 100644
--- a/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -17,9 +17,9 @@ target triple = "x86_64-pc_linux"
 ;}
 
 ;AVX512-LABEL: @foo1
-;AVX512: llvm.masked.load.v16i32
-;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.store.v16f32
+;AVX512: llvm.masked.load.v16i32.p0v16i32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.store.v16f32.p0v16f32
 ;AVX512: ret void
 
 ; Function Attrs: nounwind uwtable
@@ -96,8 +96,8 @@ for.end:                                          ; preds = %for.cond
 
 ;AVX512-LABEL: @foo2
 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.scatter.v16f32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
 ;AVX512: ret void
 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
 entry:
@@ -171,10 +171,10 @@ for.end:                                          ; preds = %for.cond
 
 ;AVX512-LABEL: @foo3
 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
 ;AVX512: fadd <16 x float>
 ;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.scatter.v16f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
 ;AVX512: ret void
 
 %struct.Out = type { float, float }
@@ -233,4 +233,194 @@ for.inc:                                          ; preds = %if.end
 for.end:                                          ; preds = %for.cond
   ret void
 }
-declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+
+; The same as @foo2 but scatter/gather argument is a vecotr of ptrs with addresspace 1
+
+;AVX512-LABEL: @foo2_addrspace
+;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p1f32
+;AVX512: llvm.masked.scatter.v16f32.v16p1f32
+;AVX512: ret void
+define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(1)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Same as foo2_addrspace but here only the input has the non-default address space.
+
+;AVX512-LABEL: @foo2_addrspace2
+;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p1f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
+;AVX512: ret void
+define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(0)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+  store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(1)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4
+  store float %add, float addrspace(0)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Same as foo2_addrspace but here only the output has the non-default address space.
+
+;AVX512-LABEL: @foo2_addrspace3
+;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.scatter.v16f32.v16p1f32
+;AVX512: ret void
+
+define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+entry:
+  %in.addr = alloca %struct.In addrspace(0)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(0)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index bda4b2454ee2..aff372b562fb 100755
--- a/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -23,11 +23,11 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
 ; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
diff --git a/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll b/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll
index a3511c3ae968..b3087c1577ca 100644
--- a/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll
+++ b/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll
@@ -1,8 +1,8 @@
 ; XFAIL: *
 ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
 
-declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
-declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
 
 ; This test ensures that masked scatter and gather operations, which take vectors of pointers,
 ; do not have pointer aliasing ignored when being processed.
@@ -21,18 +21,18 @@ entry:
   %tmp.i = insertelement <2 x i32*> undef, i32* %tmp.0, i32 0
   %tmp = insertelement <2 x i32*> %tmp.i, i32* %tmp.1, i32 1
   ; Read from in1 and in2
-  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
-  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
   ; Store in1 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
   ; Read in1 from the allocas
   ; This gather should alias the scatter we just saw
-  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
   ; Store in2 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
   ; Read in2 from the allocas
   ; This gather should alias the scatter we just saw, and not be eliminated
-  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
   ; Store in2 to out for good measure
   %tmp.v.1.0 = extractelement <2 x i32> %tmp.v.1, i32 0
   %tmp.v.1.1 = extractelement <2 x i32> %tmp.v.1, i32 1
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index c10c3b1381b7..ad44f9d6fd39 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -3049,6 +3049,6 @@ define void @test67(i8* %x) {
 !4 = !DIFile(filename: "path/to/file", directory: "/path/to/dir")
 !5 = !{i32 2, !"Debug Info Version", i32 3}
 
-; CHECK: attributes #0 = { nounwind readnone }
+; CHECK: attributes #0 = { nounwind readnone speculatable }
 ; CHECK: attributes [[NUW]] = { nounwind }
 ; CHECK: ![[RELEASE]] = !{}
diff --git a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
index c856706d3f03..93a12a927d89 100644
--- a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
+++ b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
@@ -105,7 +105,7 @@ declare void @NSLog(i8*, ...)
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 ; CHECK: attributes #0 = { ssp uwtable }
-; CHECK: attributes #1 = { nounwind readnone }
+; CHECK: attributes #1 = { nounwind readnone speculatable }
 ; CHECK: attributes #2 = { nonlazybind }
 ; CHECK: attributes #3 = { noinline ssp uwtable }
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/SLPVectorizer/X86/call.ll b/test/Transforms/SLPVectorizer/X86/call.ll
index 923cbe74a567..03b1e837a0ca 100644
--- a/test/Transforms/SLPVectorizer/X86/call.ll
+++ b/test/Transforms/SLPVectorizer/X86/call.ll
@@ -147,5 +147,5 @@ entry:
 ; CHECK: declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) [[ATTR0]]
 ; CHECK: declare <2 x double> @llvm.exp2.v2f64(<2 x double>) [[ATTR0]]
 
-; CHECK: attributes [[ATTR0]] = { nounwind readnone }
+; CHECK: attributes [[ATTR0]] = { nounwind readnone speculatable }
 
diff --git a/test/Transforms/SpeculativeExecution/spec-other.ll b/test/Transforms/SpeculativeExecution/spec-other.ll
new file mode 100644
index 000000000000..65e14b69e9e6
--- /dev/null
+++ b/test/Transforms/SpeculativeExecution/spec-other.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -S -speculative-execution \
+; RUN:   -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
+; RUN:   | FileCheck %s
+
+; CHECK-LABEL: @ifThen_extractvalue(
+; CHECK: extractvalue
+; CHECK: br i1 true
+define void @ifThen_extractvalue() {
+  br i1 true, label %a, label %b
+
+a:
+  %x = extractvalue { i32, i32 } undef, 0
+  br label %b
+
+b:
+  ret void
+}
+
+; CHECK-LABEL: @ifThen_insertvalue(
+; CHECK: insertvalue
+; CHECK: br i1 true
+define void @ifThen_insertvalue() {
+  br i1 true, label %a, label %b
+
+a:
+  %x = insertvalue { i32, i32 } undef, i32 undef, 0
+  br label %b
+
+b:
+  ret void
+}
+
diff --git a/test/Transforms/SpeculativeExecution/spec-vector.ll b/test/Transforms/SpeculativeExecution/spec-vector.ll
new file mode 100644
index 000000000000..9c64f1fb1005
--- /dev/null
+++ b/test/Transforms/SpeculativeExecution/spec-vector.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -S -speculative-execution \
+; RUN:   -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
+; RUN:   | FileCheck %s
+
+; CHECK-LABEL: @ifThen_extractelement_constindex(
+; CHECK: extractelement
+; CHECK: br i1 true
+define void @ifThen_extractelement_constindex() {
+  br i1 true, label %a, label %b
+
+a:
+  %x = extractelement <4 x i32> undef, i32 0
+  br label %b
+
+b:
+  ret void
+}
+
+; CHECK-LABEL: @ifThen_extractelement_varindex(
+; CHECK: extractelement
+; CHECK: br i1 true
+define void @ifThen_extractelement_varindex(i32 %idx) {
+  br i1 true, label %a, label %b
+
+a:
+  %x = extractelement <4 x i32> undef, i32 %idx
+  br label %b
+
+b:
+  ret void
+}
+
+; CHECK-LABEL: @ifThen_insertelement_constindex(
+; CHECK: insertelement
+; CHECK: br i1 true
+define void @ifThen_insertelement_constindex() {
+  br i1 true, label %a, label %b
+
+a:
+  %x = insertelement <4 x i32> undef, i32 undef, i32 0
+  br label %b
+
+b:
+  ret void
+}
+
+; CHECK-LABEL: @ifThen_insertelement_varindex(
+; CHECK: insertelement
+; CHECK: br i1 true
+define void @ifThen_insertelement_varindex(i32 %idx) {
+  br i1 true, label %a, label %b
+
+a:
+  %x = insertelement <4 x i32> undef, i32 undef, i32 %idx
+  br label %b
+
+b:
+  ret void
+}
+
+; CHECK-LABEL: @ifThen_shufflevector(
+; CHECK: shufflevector
+; CHECK: br i1 true
+define void @ifThen_shufflevector() {
+  br i1 true, label %a, label %b
+
+a:
+  %x = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> undef
+  br label %b
+
+b:
+  ret void
+}
diff --git a/test/Verifier/scatter_gather.ll b/test/Verifier/scatter_gather.ll
new file mode 100644
index 000000000000..3b1b0ee19fd9
--- /dev/null
+++ b/test/Verifier/scatter_gather.ll
@@ -0,0 +1,122 @@
+; RUN: not opt -verify < %s 2>&1 | FileCheck %s
+
+; Mask is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define <16 x float> @gather2(<16 x float*> %ptrs, <16 x i1>* %mask, <16 x float> %passthru) {
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1>* %mask, <16 x float> %passthru)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>*, <16 x float>)
+
+; Mask length != return length
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather3(<8 x float*> %ptrs, <16 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <16 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <16 x i1>, <8 x float>)
+
+; Return type is not a vector
+; CHECK: Intrinsic has incorrect return type!
+define <8 x float>* @gather4(<8 x float*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float>* @llvm.masked.gather.p0v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float>* %res
+}
+declare <8 x float>* @llvm.masked.gather.p0v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
+
+; Value type is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather5(<8 x float*>* %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.p0v8p0f32(<8 x float*>* %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.p0v8p0f32(<8 x float*>*, i32, <8 x i1>, <8 x float>)
+
+; Value type is not a vector of pointers
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather6(<8 x float> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float>, i32, <8 x i1>, <8 x float>)
+
+; Value element type != vector of pointers element
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather7(<8 x double*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f64(<8 x double*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f64(<8 x double*>, i32, <8 x i1>, <8 x float>)
+
+; Value length!= vector of pointers length
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather8(<16 x float*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v16p0f32(<16 x float*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v16p0f32(<16 x float*>, i32, <8 x i1>, <8 x float>)
+
+; Passthru type doesn't match return type 
+; CHECK: Intrinsic has incorrect argument type!
+define <16 x i32> @gather9(<16 x i32*> %ptrs, <16 x i1> %mask, <8 x i32> %passthru) {
+  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <8 x i32> %passthru)
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <8 x i32>)
+
+; Mask is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter2(<16 x float> %value, <16 x float*> %ptrs, <16 x i1>* %mask) {
+  call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %value, <16 x float*> %ptrs, i32 4, <16 x i1>* %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>*)
+
+; Mask length != value length
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter3(<8 x float> %value, <8 x float*> %ptrs, <16 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %value, <8 x float*> %ptrs, i32 4, <16 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <16 x i1>)
+
+; Value type is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter4(<8 x float>* %value, <8 x float*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.p0v8f32.v8p0f32(<8 x float>* %value, <8 x float*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.p0v8f32.v8p0f32(<8 x float>*, <8 x float*>, i32, <8 x i1>)
+
+; ptrs is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter5(<8 x float> %value, <8 x float*>* %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.p0v8p0f32(<8 x float> %value, <8 x float*>* %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.p0v8p0f32(<8 x float>, <8 x float*>*, i32, <8 x i1>)
+
+; Value type is not a vector of pointers
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter6(<8 x float> %value, <8 x float> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8f32(<8 x float> %value, <8 x float> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8f32(<8 x float>, <8 x float>, i32, <8 x i1>)
+
+; Value element type != vector of pointers element
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter7(<8 x float> %value, <8 x double*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8p0f64(<8 x float> %value, <8 x double*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8p0f64(<8 x float>, <8 x double*>, i32, <8 x i1>)
+
+; Value length!= vector of pointers length
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter8(<8 x float> %value, <16 x float*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v16p0f32(<8 x float> %value, <16 x float*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v16p0f32(<8 x float>, <16 x float*>, i32, <8 x i1>)
+