vendor/llvm/llvm-release_32-r168974

author: Dimitry Andric <dim@FreeBSD.org> 2012-12-02 13:10:19 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2012-12-02 13:10:19 +0000
commit: 522600a229b950314b5f4af84eba4f3e8a0ffea1 (patch)
tree: 32b4679ab4b8f28e5228daafc65e9dc436935353 /test
parent: 902a7b529820e6a0aa85f98f21afaeb1805a22f8 (diff)
750 files changed, 37921 insertions, 1953 deletions
diff --git a/test/Analysis/BasicAA/noalias-geps.ll b/test/Analysis/BasicAA/noalias-geps.ll
new file mode 100644
index 0000000000000..a93d778da0741
--- /dev/null
+++ b/test/Analysis/BasicAA/noalias-geps.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+; Check that geps with equal base offsets of noalias base pointers stay noalias.
+define i32 @test(i32* %p, i16 %i) {
+  %pi = getelementptr i32* %p, i32 0
+  %pi.next = getelementptr i32* %p, i32 1
+  %b = icmp eq i16 %i, 0
+  br i1 %b, label %bb1, label %bb2
+
+bb1:
+  %f = getelementptr i32* %pi, i32 1
+  %g = getelementptr i32* %pi.next, i32 1
+  br label %bb3
+bb2:
+  %f2 = getelementptr i32* %pi, i32 1
+  %g2 = getelementptr i32* %pi.next, i32 1
+  br label %bb3
+
+bb3:
+  %ptr_phi = phi i32* [ %f, %bb1 ], [ %f2, %bb2 ]
+  %ptr_phi2 = phi i32* [ %g, %bb1 ], [ %g2, %bb2 ]
+; CHECK: NoAlias: i32* %f1, i32* %g1
+  %f1 = getelementptr i32* %ptr_phi , i32 1
+  %g1 = getelementptr i32* %ptr_phi2 , i32 1
+
+ret i32 0
+}
+
+; Check that geps with equal indices of noalias base pointers stay noalias.
+define i32 @test2([2 x i32]* %p, i32 %i) {
+  %pi = getelementptr [2 x i32]* %p, i32 0
+  %pi.next = getelementptr [2 x i32]* %p, i32 1
+  %b = icmp eq i32 %i, 0
+  br i1 %b, label %bb1, label %bb2
+
+bb1:
+  %f = getelementptr [2 x i32]* %pi, i32 1
+  %g = getelementptr [2 x i32]* %pi.next, i32 1
+  br label %bb3
+bb2:
+  %f2 = getelementptr [2 x i32]* %pi, i32 1
+  %g2 = getelementptr [2 x i32]* %pi.next, i32 1
+  br label %bb3
+bb3:
+  %ptr_phi = phi [2 x i32]* [ %f, %bb1 ], [ %f2, %bb2 ]
+  %ptr_phi2 = phi [2 x i32]* [ %g, %bb1 ], [ %g2, %bb2 ]
+; CHECK: NoAlias: i32* %f1, i32* %g1
+  %f1 = getelementptr [2 x i32]* %ptr_phi , i32 1, i32 %i
+  %g1 = getelementptr [2 x i32]* %ptr_phi2 , i32 1, i32 %i
+
+ret i32 0
+}
diff --git a/test/Analysis/BasicAA/nocapture.ll b/test/Analysis/BasicAA/nocapture.ll
index a8658ec801acd..ffc0a09a078dc 100644
--- a/test/Analysis/BasicAA/nocapture.ll
+++ b/test/Analysis/BasicAA/nocapture.ll
@@ -13,3 +13,24 @@ define i32 @test2() {
        ret i32 %c
 }
 
+declare void @test3(i32** %p, i32* %q) nounwind
+
+define i32 @test4(i32* noalias nocapture %p) nounwind {
+; CHECK: call void @test3
+; CHECK: store i32 0, i32* %p
+; CHECK: store i32 1, i32* %x
+; CHECK: %y = load i32* %p
+; CHECK: ret i32 %y
+entry:
+       %q = alloca i32*
+       ; Here test3 might store %p to %q. This doesn't violate %p's nocapture
+       ; attribute since the copy doesn't outlive the function.
+       call void @test3(i32** %q, i32* %p) nounwind
+       store i32 0, i32* %p
+       %x = load i32** %q
+       ; This store might write to %p and so we can't eliminate the subsequent
+       ; load
+       store i32 1, i32* %x
+       %y = load i32* %p
+       ret i32 %y
+}
diff --git a/test/Analysis/BasicAA/phi-speculation.ll b/test/Analysis/BasicAA/phi-speculation.ll
new file mode 100644
index 0000000000000..21c65929862f0
--- /dev/null
+++ b/test/Analysis/BasicAA/phi-speculation.ll
@@ -0,0 +1,33 @@
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; ptr_phi and ptr2_phi do not alias.
+; CHECK: NoAlias: i32* %ptr2_phi, i32* %ptr_phi
+
+define i32 @test_noalias(i32* %ptr2, i32 %count, i32* %coeff) {
+entry:
+  %ptr = getelementptr inbounds i32* %ptr2, i64 1
+  br label %while.body
+
+while.body:
+  %num = phi i32 [ %count, %entry ], [ %dec, %while.body ]
+  %ptr_phi = phi i32* [ %ptr, %entry ], [ %ptr_inc, %while.body ]
+  %ptr2_phi = phi i32* [ %ptr2, %entry ], [ %ptr2_inc, %while.body ]
+  %result.09 = phi i32 [ 0 , %entry ], [ %add, %while.body ]
+  %dec = add nsw i32 %num, -1
+  %0 = load i32* %ptr_phi, align 4
+  store i32 %0, i32* %ptr2_phi, align 4
+  %1 = load i32* %coeff, align 4
+  %2 = load i32* %ptr_phi, align 4
+  %mul = mul nsw i32 %1, %2
+  %add = add nsw i32 %mul, %result.09
+  %tobool = icmp eq i32 %dec, 0
+  %ptr_inc = getelementptr inbounds i32* %ptr_phi, i64 1
+  %ptr2_inc = getelementptr inbounds i32* %ptr2_phi, i64 1
+  br i1 %tobool, label %the_exit, label %while.body
+
+the_exit:
+  ret i32 %add
+}
diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll
index 74d06a18f7b9f..08adfa8a36fb0 100644
--- a/test/Analysis/BranchProbabilityInfo/basic.ll
+++ b/test/Analysis/BranchProbabilityInfo/basic.ll
@@ -88,3 +88,30 @@ exit:
 }
 
 !1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
+
+define i32 @test4(i32 %x) nounwind uwtable readnone ssp {
+; CHECK: Printing analysis {{.*}} for function 'test4'
+entry:
+  %conv = sext i32 %x to i64
+  switch i64 %conv, label %return [
+    i64 0, label %sw.bb
+    i64 1, label %sw.bb
+    i64 2, label %sw.bb
+    i64 5, label %sw.bb1
+  ], !prof !2
+; CHECK: edge entry -> return probability is 7 / 85
+; CHECK: edge entry -> sw.bb probability is 14 / 85
+; CHECK: edge entry -> sw.bb1 probability is 64 / 85
+
+sw.bb:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+!2 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
diff --git a/test/Analysis/CallGraph/do-nothing-intrinsic.ll b/test/Analysis/CallGraph/do-nothing-intrinsic.ll
new file mode 100644
index 0000000000000..f28ad10f57c8a
--- /dev/null
+++ b/test/Analysis/CallGraph/do-nothing-intrinsic.ll
@@ -0,0 +1,13 @@
+; RUN: opt < %s -basiccg
+; PR13903
+
+define void @main() {
+  invoke void @llvm.donothing()
+          to label %ret unwind label %unw
+unw:
+  %tmp = landingpad i8 personality i8 0 cleanup
+  br label %ret
+ret:
+  ret void
+}
+declare void @llvm.donothing() nounwind readnone
diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll
new file mode 100644
index 0000000000000..37cca8d540670
--- /dev/null
+++ b/test/Analysis/CostModel/X86/arith.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @add(i32 %arg) {
+  ;CHECK: cost of 1 {{.*}} add
+  %A = add <4 x i32> undef, undef
+  ;CHECK: cost of 4 {{.*}} add
+  %B = add <8 x i32> undef, undef
+  ;CHECK: cost of 1 {{.*}} add
+  %C = add <2 x i64> undef, undef
+  ;CHECK: cost of 4 {{.*}} add
+  %D = add <4 x i64> undef, undef
+  ;CHECK: cost of 8 {{.*}} add
+  %E = add <8 x i64> undef, undef
+  ;CHECK: cost of 1 {{.*}} ret
+  ret i32 undef
+}
+
+
+define i32 @xor(i32 %arg) {
+  ;CHECK: cost of 1 {{.*}} xor
+  %A = xor <4 x i32> undef, undef
+  ;CHECK: cost of 1 {{.*}} xor
+  %B = xor <8 x i32> undef, undef
+  ;CHECK: cost of 1 {{.*}} xor
+  %C = xor <2 x i64> undef, undef
+  ;CHECK: cost of 1 {{.*}} xor
+  %D = xor <4 x i64> undef, undef
+  ;CHECK: cost of 1 {{.*}} ret
+  ret i32 undef
+}
+
+
+define i32 @fmul(i32 %arg) {
+  ;CHECK: cost of 1 {{.*}} fmul
+  %A = fmul <4 x float> undef, undef
+  ;CHECK: cost of 1 {{.*}} fmul
+  %B = fmul <8 x float> undef, undef
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll
new file mode 100644
index 0000000000000..75c97a781e7fa
--- /dev/null
+++ b/test/Analysis/CostModel/X86/cast.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @add(i32 %arg) {
+
+  ; -- Same size registers --
+  ;CHECK: cost of 1 {{.*}} zext
+  %A = zext <4 x i1> undef to <4 x i32>
+  ;CHECK: cost of 2 {{.*}} sext
+  %B = sext <4 x i1> undef to <4 x i32>
+  ;CHECK: cost of 0 {{.*}} trunc
+  %C = trunc <4 x i32> undef to <4 x i1>
+
+  ; -- Different size registers --
+  ;CHECK-NOT: cost of 1 {{.*}} zext
+  %D = zext <8 x i1> undef to <8 x i32>
+  ;CHECK-NOT: cost of 2 {{.*}} sext
+  %E = sext <8 x i1> undef to <8 x i32>
+  ;CHECK-NOT: cost of 2 {{.*}} trunc
+  %F = trunc <8 x i32> undef to <8 x i1>
+
+  ; -- scalars --
+
+  ;CHECK: cost of 1 {{.*}} zext
+  %G = zext i1 undef to i32
+  ;CHECK: cost of 0 {{.*}} trunc
+  %H = trunc i32 undef to i1
+
+  ;CHECK: cost of 1 {{.*}} ret
+  ret i32 undef
+}
+
+define i32 @zext_sext(<8 x i1> %in) {
+  ;CHECK: cost of 6 {{.*}} zext
+  %Z = zext <8 x i1> %in to <8 x i32>
+  ;CHECK: cost of 9 {{.*}} sext
+  %S = sext <8 x i1> %in to <8 x i32>
+
+  ;CHECK: cost of 1 {{.*}} sext
+  %A = sext <8 x i16> undef to <8 x i32>
+  ;CHECK: cost of 1 {{.*}} zext
+  %B = zext <8 x i16> undef to <8 x i32>
+  ;CHECK: cost of 1 {{.*}} sext
+  %C = sext <4 x i32> undef to <4 x i64>
+
+  ;CHECK: cost of 1 {{.*}} zext
+  %D = zext <4 x i32> undef to <4 x i64>
+  ;CHECK: cost of 1 {{.*}} trunc
+
+  %E = trunc <4 x i64> undef to <4 x i32>
+  ;CHECK: cost of 1 {{.*}} trunc
+  %F = trunc <8 x i32> undef to <8 x i16>
+
+  ;CHECK: cost of 3 {{.*}} trunc
+  %G = trunc <8 x i64> undef to <8 x i32>
+
+  ret i32 undef
+}
+
+define i32 @masks(<8 x i1> %in) {
+  ;CHECK: cost of 6 {{.*}} zext
+  %Z = zext <8 x i1> %in to <8 x i32>
+  ;CHECK: cost of 9 {{.*}} sext
+  %S = sext <8 x i1> %in to <8 x i32>
+  ret i32 undef
+}
+
diff --git a/test/Analysis/CostModel/X86/cmp.ll b/test/Analysis/CostModel/X86/cmp.ll
new file mode 100644
index 0000000000000..f868bd18b54fc
--- /dev/null
+++ b/test/Analysis/CostModel/X86/cmp.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @cmp(i32 %arg) {
+  ;  -- floats --
+  ;CHECK: cost of 1 {{.*}} fcmp
+  %A = fcmp olt <2 x float> undef, undef
+  ;CHECK: cost of 1 {{.*}} fcmp
+  %B = fcmp olt <4 x float> undef, undef
+  ;CHECK: cost of 1 {{.*}} fcmp
+  %C = fcmp olt <8 x float> undef, undef
+  ;CHECK: cost of 1 {{.*}} fcmp
+  %D = fcmp olt <2 x double> undef, undef
+  ;CHECK: cost of 1 {{.*}} fcmp
+  %E = fcmp olt <4 x double> undef, undef
+
+  ;  -- integers --
+
+  ;CHECK: cost of 1 {{.*}} icmp
+  %F = icmp eq <16 x i8> undef, undef
+  ;CHECK: cost of 1 {{.*}} icmp
+  %G = icmp eq <8 x i16> undef, undef
+  ;CHECK: cost of 1 {{.*}} icmp
+  %H = icmp eq <4 x i32> undef, undef
+  ;CHECK: cost of 1 {{.*}} icmp
+  %I = icmp eq <2 x i64> undef, undef
+  ;CHECK: cost of 4 {{.*}} icmp
+  %J = icmp eq <4 x i64> undef, undef
+  ;CHECK: cost of 4 {{.*}} icmp
+  %K = icmp eq <8 x i32> undef, undef
+  ;CHECK: cost of 4 {{.*}} icmp
+  %L = icmp eq <16 x i16> undef, undef
+  ;CHECK: cost of 4 {{.*}} icmp
+  %M = icmp eq <32 x i8> undef, undef
+
+  ;CHECK: cost of 1 {{.*}} ret
+  ret i32 undef
+}
+
+
diff --git a/test/Analysis/CostModel/X86/i32.ll b/test/Analysis/CostModel/X86/i32.ll
new file mode 100644
index 0000000000000..4015e0b1eef4b
--- /dev/null
+++ b/test/Analysis/CostModel/X86/i32.ll
@@ -0,0 +1,9 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=i386 -mcpu=corei7-avx | FileCheck %s
+
+
+;CHECK: cost of 2 {{.*}} add
+;CHECK: cost of 1 {{.*}} ret
+define i32 @no_info(i32 %arg) {
+  %e = add i64 undef, undef
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/insert-extract-at-zero.ll b/test/Analysis/CostModel/X86/insert-extract-at-zero.ll
new file mode 100644
index 0000000000000..87bf7c488b918
--- /dev/null
+++ b/test/Analysis/CostModel/X86/insert-extract-at-zero.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @insert-extract-at-zero-idx(i32 %arg, float %fl) {
+  ;CHECK: cost of 0 {{.*}} extract
+  %A = extractelement <4 x float> undef, i32 0
+  ;CHECK: cost of 1 {{.*}} extract
+  %B = extractelement <4 x i32> undef, i32 0
+  ;CHECK: cost of 1 {{.*}} extract
+  %C = extractelement <4 x float> undef, i32 1
+
+  ;CHECK: cost of 0 {{.*}} extract
+  %D = extractelement <8 x float> undef, i32 0
+  ;CHECK: cost of 1 {{.*}} extract
+  %E = extractelement <8 x float> undef, i32 1
+
+  ;CHECK: cost of 1 {{.*}} extract
+  %F = extractelement <8 x float> undef, i32 %arg
+
+  ;CHECK: cost of 0 {{.*}} insert
+  %G = insertelement <4 x float> undef, float %fl, i32 0
+  ;CHECK: cost of 1 {{.*}} insert
+  %H = insertelement <4 x float> undef, float %fl, i32 1
+  ;CHECK: cost of 1 {{.*}} insert
+  %I = insertelement <4 x i32> undef, i32 %arg, i32 0
+
+  ;CHECK: cost of 0 {{.*}} insert
+  %J = insertelement <4 x double> undef, double undef, i32 0
+
+  ;CHECK: cost of 0 {{.*}} insert
+  %K = insertelement <8 x double> undef, double undef, i32 4
+  ;CHECK: cost of 0 {{.*}} insert
+  %L = insertelement <16 x double> undef, double undef, i32 8
+  ;CHECK: cost of 1 {{.*}} insert
+  %M = insertelement <16 x double> undef, double undef, i32 9
+  ret i32 0
+}
+
diff --git a/test/Analysis/CostModel/X86/lit.local.cfg b/test/Analysis/CostModel/X86/lit.local.cfg
new file mode 100644
index 0000000000000..a8ad0f1a28b23
--- /dev/null
+++ b/test/Analysis/CostModel/X86/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+    config.unsupported = True
+
diff --git a/test/Analysis/CostModel/X86/loop_v2.ll b/test/Analysis/CostModel/X86/loop_v2.ll
new file mode 100644
index 0000000000000..260a60676ab7c
--- /dev/null
+++ b/test/Analysis/CostModel/X86/loop_v2.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+define i32 @foo(i32* nocapture %A) nounwind uwtable readonly ssp {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
+  %0 = getelementptr inbounds i32* %A, i64 %index
+  %1 = bitcast i32* %0 to <2 x i32>*
+  %2 = load <2 x i32>* %1, align 4
+  %3 = sext <2 x i32> %2 to <2 x i64>
+  ;CHECK: cost of 1 {{.*}} extract
+  %4 = extractelement <2 x i64> %3, i32 0
+  %5 = getelementptr inbounds i32* %A, i64 %4
+  ;CHECK: cost of 1 {{.*}} extract
+  %6 = extractelement <2 x i64> %3, i32 1
+  %7 = getelementptr inbounds i32* %A, i64 %6
+  %8 = load i32* %5, align 4, !tbaa !0
+  ;CHECK: cost of 1 {{.*}} insert
+  %9 = insertelement <2 x i32> undef, i32 %8, i32 0
+  %10 = load i32* %7, align 4, !tbaa !0
+  ;CHECK: cost of 1 {{.*}} insert
+  %11 = insertelement <2 x i32> %9, i32 %10, i32 1
+  %12 = add nsw <2 x i32> %11, %vec.phi
+  %index.next = add i64 %index, 2
+  %13 = icmp eq i64 %index.next, 192
+  br i1 %13, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  %14 = extractelement <2 x i32> %12, i32 0
+  %15 = extractelement <2 x i32> %12, i32 1
+  %16 = add i32 %14, %15
+  ret i32 %16
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Analysis/CostModel/X86/tiny.ll b/test/Analysis/CostModel/X86/tiny.ll
new file mode 100644
index 0000000000000..cc7b443a7dfc8
--- /dev/null
+++ b/test/Analysis/CostModel/X86/tiny.ll
@@ -0,0 +1,11 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: cost of 1 {{.*}} add
+;CHECK: cost of 1 {{.*}} ret
+define i32 @no_info(i32 %arg) {
+  %e = add i32 %arg, %arg
+  ret i32 %e
+}
diff --git a/test/Analysis/CostModel/X86/vectorized-loop.ll b/test/Analysis/CostModel/X86/vectorized-loop.ll
new file mode 100644
index 0000000000000..7919a9ca9a64f
--- /dev/null
+++ b/test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @foo(i32* noalias nocapture %A, i32* noalias nocapture %B, i32 %start, i32 %end) nounwind uwtable ssp {
+entry:
+  ;CHECK: cost of 1 {{.*}} icmp
+  %cmp7 = icmp slt i32 %start, %end
+  br i1 %cmp7, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  ;CHECK: cost of 1 {{.*}} sext
+  %0 = sext i32 %start to i64
+  %1 = sub i32 %end, %start
+  %2 = zext i32 %1 to i64
+  %end.idx = add i64 %2, %0
+  ;CHECK: cost of 1 {{.*}} add
+  %n.vec = and i64 %2, 4294967288
+  %end.idx.rnd.down = add i64 %n.vec, %0
+  ;CHECK: cost of 1 {{.*}} icmp
+  %cmp.zero = icmp eq i64 %n.vec, 0
+  br i1 %cmp.zero, label %middle.block, label %vector.body
+
+vector.body:                                      ; preds = %for.body.lr.ph, %vector.body
+  %index = phi i64 [ %index.next, %vector.body ], [ %0, %for.body.lr.ph ]
+  %3 = add i64 %index, 2
+  %4 = getelementptr inbounds i32* %B, i64 %3
+  ;CHECK: cost of 0 {{.*}} bitcast
+  %5 = bitcast i32* %4 to <8 x i32>*
+  ;CHECK: cost of 1 {{.*}} load
+  %6 = load <8 x i32>* %5, align 4
+  ;CHECK: cost of 4 {{.*}} mul
+  %7 = mul nsw <8 x i32> %6, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  %8 = getelementptr inbounds i32* %A, i64 %index
+  %9 = bitcast i32* %8 to <8 x i32>*
+  %10 = load <8 x i32>* %9, align 4
+  ;CHECK: cost of 4 {{.*}} add
+  %11 = add nsw <8 x i32> %10, %7
+  ;CHECK: cost of 1 {{.*}} store
+  store <8 x i32> %11, <8 x i32>* %9, align 4
+  %index.next = add i64 %index, 8
+  %12 = icmp eq i64 %index.next, %end.idx.rnd.down
+  ;CHECK: cost of 1 {{.*}} br
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body, %for.body.lr.ph
+  %cmp.n = icmp eq i64 %end.idx, %end.idx.rnd.down
+  br i1 %cmp.n, label %for.end, label %for.body
+
+for.body:                                         ; preds = %middle.block, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %end.idx.rnd.down, %middle.block ]
+  %13 = add nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32* %B, i64 %13
+  ;CHECK: cost of 1 {{.*}} load
+  %14 = load i32* %arrayidx, align 4, !tbaa !0
+  ;CHECK: cost of 1 {{.*}} mul
+  %mul = mul nsw i32 %14, 5
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  ;CHECK: cost of 1 {{.*}} load
+  %15 = load i32* %arrayidx2, align 4, !tbaa !0
+  %add3 = add nsw i32 %15, %mul
+  store i32 %add3, i32* %arrayidx2, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  ;CHECK: cost of 0 {{.*}} trunc
+  %16 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %16, %end
+  ;CHECK: cost of 1 {{.*}} br
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %middle.block, %for.body, %entry
+  ;CHECK: cost of 1 {{.*}} ret
+  ret i32 undef
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Analysis/LoopDependenceAnalysis/lit.local.cfg b/test/Analysis/CostModel/lit.local.cfg
index 19eebc0ac7ac3..19eebc0ac7ac3 100644
--- a/test/Analysis/LoopDependenceAnalysis/lit.local.cfg
+++ b/test/Analysis/CostModel/lit.local.cfg
diff --git a/test/Analysis/CostModel/no_info.ll b/test/Analysis/CostModel/no_info.ll
new file mode 100644
index 0000000000000..d20d56b79a7f0
--- /dev/null
+++ b/test/Analysis/CostModel/no_info.ll
@@ -0,0 +1,15 @@
+; RUN: opt < %s -cost-model -analyze | FileCheck %s
+
+; The cost model does not have any target information so it can't make a decision.
+; Note that opt does not read the triple information from the module itself, only from the command line.
+
+; This info is ignored:
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: Unknown cost {{.*}} add
+;CHECK: Unknown cost {{.*}} ret
+define i32 @no_info(i32 %arg) {
+  %e = add i32 %arg, %arg
+  ret i32 %e
+}
diff --git a/test/Analysis/DependenceAnalysis/Banerjee.ll b/test/Analysis/DependenceAnalysis/Banerjee.ll
new file mode 100644
index 0000000000000..8865ee94016fa
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Banerjee.ll
@@ -0,0 +1,595 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'Banerjee.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long int i = 1; i <= 10; i++)
+;;    for (long int j = 1; j <= 10; j++) {
+;;      A[10*i + j] = ...
+;;      ... = A[10*i + j - 1];
+
+define void @banerjee0(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc7
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+  %i.03 = phi i64 [ 1, %entry ], [ %inc8, %for.inc7 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 1, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 10
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 10
+  %add5 = add nsw i64 %mul4, %j.02
+  %sub = add nsw i64 %add5, -1
+  %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub
+  %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [<= <>]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 11
+  br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 10
+  %inc8 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc8, 11
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
+
+
+;;  for (long int i = 1; i <= n; i++)
+;;    for (long int j = 1; j <= m; j++) {
+;;      A[10*i + j] = ...
+;;      ... = A[10*i + j - 1];
+
+define void @banerjee1(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp4 = icmp sgt i64 %n, 0
+  br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end9
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  %0 = add i64 %n, 1
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc7
+  %B.addr.06 = phi i64* [ %B.addr.1.lcssa, %for.inc7 ], [ %B, %for.cond1.preheader.preheader ]
+  %i.05 = phi i64 [ %inc8, %for.inc7 ], [ 1, %for.cond1.preheader.preheader ]
+  %1 = add i64 %m, 1
+  %cmp21 = icmp sgt i64 %m, 0
+  br i1 %cmp21, label %for.body3.preheader, label %for.inc7
+
+for.body3.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.preheader, %for.body3
+  %j.03 = phi i64 [ %inc, %for.body3 ], [ 1, %for.body3.preheader ]
+  %B.addr.12 = phi i64* [ %incdec.ptr, %for.body3 ], [ %B.addr.06, %for.body3.preheader ]
+  %mul = mul nsw i64 %i.05, 10
+  %add = add nsw i64 %mul, %j.03
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.05, 10
+  %add5 = add nsw i64 %mul4, %j.03
+  %sub = add nsw i64 %add5, -1
+  %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub
+  %2 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [* <>]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.12, i64 1
+  store i64 %2, i64* %B.addr.12, align 8
+  %inc = add nsw i64 %j.03, 1
+  %exitcond = icmp eq i64 %inc, %1
+  br i1 %exitcond, label %for.inc7.loopexit, label %for.body3
+
+for.inc7.loopexit:                                ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.06, i64 %m
+  br label %for.inc7
+
+for.inc7:                                         ; preds = %for.inc7.loopexit, %for.cond1.preheader
+  %B.addr.1.lcssa = phi i64* [ %B.addr.06, %for.cond1.preheader ], [ %scevgep, %for.inc7.loopexit ]
+  %inc8 = add nsw i64 %i.05, 1
+  %exitcond7 = icmp eq i64 %inc8, %0
+  br i1 %exitcond7, label %for.end9.loopexit, label %for.cond1.preheader
+
+for.end9.loopexit:                                ; preds = %for.inc7
+  br label %for.end9
+
+for.end9:                                         ; preds = %for.end9.loopexit, %entry
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 10; i++)
+;;    for (long int j = 0; j < 10; j++) {
+;;      A[10*i + j] = 0;
+;;      *B++ = A[10*i + j + 100];
+
+define void @banerjee2(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 10
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 10
+  %add5 = add nsw i64 %mul4, %j.02
+  %add6 = add nsw i64 %add5, 100
+  %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6
+  %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 10
+  br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 10
+  %inc9 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc9, 10
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 10; i++)
+;;    for (long int j = 0; j < 10; j++) {
+;;      A[10*i + j] = ...
+;;      ... = A[10*i + j + 99];
+
+define void @banerjee3(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 10
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 10
+  %add5 = add nsw i64 %mul4, %j.02
+  %add6 = add nsw i64 %add5, 99
+  %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6
+  %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [> >]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 10
+  br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 10
+  %inc9 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc9, 10
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 10; i++)
+;;    for (long int j = 0; j < 10; j++) {
+;;      A[10*i + j] = ...
+;;      ... = A[10*i + j - 100];
+
+define void @banerjee4(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc7
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 10
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 10
+  %add5 = add nsw i64 %mul4, %j.02
+  %sub = add nsw i64 %add5, -100
+  %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub
+  %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 10
+  br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 10
+  %inc8 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc8, 10
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 10; i++)
+;;    for (long int j = 0; j < 10; j++) {
+;;      A[10*i + j] = 0;
+;;      *B++ = A[10*i + j - 99]; }
+
+define void @banerjee5(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc7
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 10
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add  ; &A[10*i + j]
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 10
+  %add5 = add nsw i64 %mul4, %j.02
+  %sub = add nsw i64 %add5, -99
+  %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub  ; &A[10*i + j - 99]
+  %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [< <]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1  ; B + 1, fed back through the inner phi
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 10
+  br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 10  ; outer-loop B pointer advanced by 10 elements
+  %inc8 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc8, 10
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 10; i++)
+;;    for (long int j = 0; j < 10; j++) {
+;;      A[10*i + j] = 0;
+;;      *B++ = A[10*i + j + 9]; }
+
+define void @banerjee6(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 10
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add  ; &A[10*i + j]
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 10
+  %add5 = add nsw i64 %mul4, %j.02
+  %add6 = add nsw i64 %add5, 9
+  %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6  ; &A[10*i + j + 9]
+  %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [=> <>]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1  ; B + 1, fed back through the inner phi
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 10
+  br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 10  ; outer-loop B pointer advanced by 10 elements
+  %inc9 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc9, 10
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 10; i++)
+;;    for (long int j = 0; j < 10; j++) {
+;;      A[10*i + j] = 0;
+;;      *B++ = A[10*i + j + 10]; }
+
+define void @banerjee7(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 10
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add  ; &A[10*i + j]
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 10
+  %add5 = add nsw i64 %mul4, %j.02
+  %add6 = add nsw i64 %add5, 10
+  %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6  ; &A[10*i + j + 10]
+  %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [> <=]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1  ; B + 1, fed back through the inner phi
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 10
+  br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 10  ; outer-loop B pointer advanced by 10 elements
+  %inc9 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc9, 10
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 10; i++)
+;;    for (long int j = 0; j < 10; j++) {
+;;      A[10*i + j] = 0;
+;;      *B++ = A[10*i + j + 11]; }
+
+define void @banerjee8(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 10
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add  ; &A[10*i + j]
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 10
+  %add5 = add nsw i64 %mul4, %j.02
+  %add6 = add nsw i64 %add5, 11
+  %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6  ; &A[10*i + j + 11]
+  %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [> <>]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1  ; B + 1, fed back through the inner phi
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 10
+  br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 10  ; outer-loop B pointer advanced by 10 elements
+  %inc9 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc9, 10
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 20; i++)
+;;    for (long int j = 0; j < 20; j++) {
+;;      A[30*i + 500*j] = 0;
+;;      *B++ = A[i - 500*j + 11]; }
+
+define void @banerjee9(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 30
+  %mul4 = mul nsw i64 %j.02, 500
+  %add = add nsw i64 %mul, %mul4
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add  ; &A[30*i + 500*j]
+  store i64 0, i64* %arrayidx, align 8
+  %0 = mul i64 %j.02, -500
+  %sub = add i64 %i.03, %0
+  %add6 = add nsw i64 %sub, 11
+  %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6  ; &A[i - 500*j + 11]
+  %1 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [<= =|<]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1  ; B + 1, fed back through the inner phi
+  store i64 %1, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 20
+  br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 20  ; outer-loop B pointer advanced by 20 elements
+  %inc9 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc9, 20
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 20; i++)
+;;    for (long int j = 0; j < 20; j++) {
+;;      A[i + 500*j] = 0;
+;;      *B++ = A[i - 500*j + 11]; }
+
+define void @banerjee10(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc7
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %j.02, 500
+  %add = add nsw i64 %i.03, %mul
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add  ; &A[i + 500*j]
+  store i64 0, i64* %arrayidx, align 8
+  %0 = mul i64 %j.02, -500
+  %sub = add i64 %i.03, %0
+  %add5 = add nsw i64 %sub, 11
+  %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5  ; &A[i - 500*j + 11]
+  %1 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [<> =]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1  ; B + 1, fed back through the inner phi
+  store i64 %1, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 20
+  br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 20  ; outer-loop B pointer advanced by 20 elements
+  %inc8 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc8, 20
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 20; i++)
+;;    for (long int j = 0; j < 20; j++) {
+;;      A[300*i + j] = 0;
+;;      *B++ = A[250*i - j + 11]; }
+
+define void @banerjee11(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc7
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 300
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add  ; &A[300*i + j]
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 250
+  %sub = sub nsw i64 %mul4, %j.02
+  %add5 = add nsw i64 %sub, 11
+  %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5  ; &A[250*i - j + 11]
+  %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [<= <>]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1  ; B + 1, fed back through the inner phi
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 20
+  br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 20  ; outer-loop B pointer advanced by 20 elements
+  %inc8 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc8, 20
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 20; i++)
+;;    for (long int j = 0; j < 20; j++) {
+;;      A[100*i + j] = 0;
+;;      *B++ = A[100*i - j + 11]; }
+
+define void @banerjee12(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc7
+  %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %mul = mul nsw i64 %i.03, 100
+  %add = add nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i64* %A, i64 %add  ; &A[100*i + j]
+  store i64 0, i64* %arrayidx, align 8
+  %mul4 = mul nsw i64 %i.03, 100
+  %sub = sub nsw i64 %mul4, %j.02
+  %add5 = add nsw i64 %sub, 11
+  %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5  ; &A[100*i - j + 11]
+  %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [= <>]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1  ; B + 1, fed back through the inner phi
+  store i64 %0, i64* %B.addr.11, align 8
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 20
+  br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7:                                         ; preds = %for.body3
+  %scevgep = getelementptr i64* %B.addr.04, i64 20  ; outer-loop B pointer advanced by 20 elements
+  %inc8 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc8, 20
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Coupled.ll b/test/Analysis/DependenceAnalysis/Coupled.ll
new file mode 100644
index 0000000000000..60163fe7c2d0b
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Coupled.ll
@@ -0,0 +1,509 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'Coupled.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 0; i < 50; i++) {
+;;   A[i][i] = i;
+;;   *B++ = A[i + 10][i + 9]; }
+
+define void @couple0([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  store i32 %conv, i32* %arrayidx1, align 4
+  %add = add nsw i64 %i.02, 9
+  %add2 = add nsw i64 %i.02, 10
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %add2, i64 %add  ; &A[i + 10][i + 9]
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++) {
+;;   A[i][i] = i;
+;;   *B++ = A[i + 9][i + 9]; }
+
+define void @couple1([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  store i32 %conv, i32* %arrayidx1, align 4
+  %add = add nsw i64 %i.02, 9
+  %add2 = add nsw i64 %i.02, 9
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %add2, i64 %add  ; &A[i + 9][i + 9]
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - consistent flow [-9]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++) {
+;;   A[3*i - 6][3*i - 6] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple2([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %mul = mul nsw i64 %i.02, 3
+  %sub = add nsw i64 %mul, -6
+  %mul1 = mul nsw i64 %i.02, 3
+  %sub2 = add nsw i64 %mul1, -6
+  %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %sub2, i64 %sub  ; &A[3*i - 6][3*i - 6]
+  store i32 %conv, i32* %arrayidx3, align 4
+  %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - flow [*|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++) {
+;;   A[3*i - 6][3*i - 5] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple3([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %mul = mul nsw i64 %i.02, 3
+  %sub = add nsw i64 %mul, -5
+  %mul1 = mul nsw i64 %i.02, 3
+  %sub2 = add nsw i64 %mul1, -6
+  %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %sub2, i64 %sub  ; &A[3*i - 6][3*i - 5]
+  store i32 %conv, i32* %arrayidx3, align 4
+  %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++) {
+;;   A[3*i - 6][3*i - n] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple4([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %mul = mul nsw i64 %i.02, 3
+  %conv1 = sext i32 %n to i64  ; n widened to i64; loop-invariant symbolic term
+  %sub = sub nsw i64 %mul, %conv1
+  %mul2 = mul nsw i64 %i.02, 3
+  %sub3 = add nsw i64 %mul2, -6
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %sub3, i64 %sub  ; &A[3*i - 6][3*i - n]
+  store i32 %conv, i32* %arrayidx4, align 4
+  %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - flow [*|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++) {
+;;   A[3*i - n + 1][3*i - n] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple5([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %mul = mul nsw i64 %i.02, 3
+  %conv1 = sext i32 %n to i64  ; n widened to i64; loop-invariant symbolic term
+  %sub = sub nsw i64 %mul, %conv1
+  %mul2 = mul nsw i64 %i.02, 3
+  %conv3 = sext i32 %n to i64
+  %sub4 = sub nsw i64 %mul2, %conv3
+  %add = add nsw i64 %sub4, 1
+  %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %add, i64 %sub  ; &A[3*i - n + 1][3*i - n]
+  store i32 %conv, i32* %arrayidx5, align 4
+  %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++) {
+;;   A[i][3*i - 6] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple6([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %mul = mul nsw i64 %i.02, 3
+  %sub = add nsw i64 %mul, -6
+  %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %sub  ; &A[i][3*i - 6]
+  store i32 %conv, i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - flow [=|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++) {
+;;   A[i][3*i - 5] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple7([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %mul = mul nsw i64 %i.02, 3
+  %sub = add nsw i64 %mul, -5
+  %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %sub  ; &A[i][3*i - 5]
+  store i32 %conv, i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 50
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i <= 15; i++) {
+;;   A[3*i - 18][3 - i] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple8([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %sub = sub nsw i64 3, %i.02
+  %mul = mul nsw i64 %i.02, 3
+  %sub1 = add nsw i64 %mul, -18
+  %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub  ; &A[3*i - 18][3 - i]
+  store i32 %conv, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i <= 15; i++) {
+;;   A[3*i - 18][2 - i] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple9([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %sub = sub nsw i64 2, %i.02
+  %mul = mul nsw i64 %i.02, 3
+  %sub1 = add nsw i64 %mul, -18
+  %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub  ; &A[3*i - 18][2 - i]
+  store i32 %conv, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i <= 15; i++) {
+;;   A[3*i - 18][6 - i] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple10([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %sub = sub nsw i64 6, %i.02
+  %mul = mul nsw i64 %i.02, 3
+  %sub1 = add nsw i64 %mul, -18
+  %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub  ; &A[3*i - 18][6 - i]
+  store i32 %conv, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - flow [>] splitable!
+; CHECK: da analyze - split level = 1, iteration = 3!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i <= 15; i++) {
+;;   A[3*i - 18][18 - i] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple11([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %sub = sub nsw i64 18, %i.02
+  %mul = mul nsw i64 %i.02, 3
+  %sub1 = add nsw i64 %mul, -18
+  %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub  ; &A[3*i - 18][18 - i]
+  store i32 %conv, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - flow [=|<] splitable!
+; CHECK: da analyze - split level = 1, iteration = 9!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 16
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i <= 12; i++) {
+;;   A[3*i - 18][22 - i] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple12([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %sub = sub nsw i64 22, %i.02
+  %mul = mul nsw i64 %i.02, 3
+  %sub1 = add nsw i64 %mul, -18
+  %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub  ; &A[3*i - 18][22 - i]
+  store i32 %conv, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - flow [<] splitable!
+; CHECK: da analyze - split level = 1, iteration = 11!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 13
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 12; i++) {
+;;   A[3*i - 18][22 - i] = i;
+;;   *B++ = A[i][i]; }
+
+define void @couple13([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %sub = sub nsw i64 22, %i.02
+  %mul = mul nsw i64 %i.02, 3
+  %sub1 = add nsw i64 %mul, -18
+  %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub  ; &A[3*i - 18][22 - i]
+  store i32 %conv, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02  ; &A[i][i]
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 12
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++) {
+;;   A[3*i - 18][18 - i][i] = i;
+;;   *B++ = A[i][i][i]; }
+
+define void @couple14([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %sub = sub nsw i64 18, %i.02
+  %mul = mul nsw i64 %i.02, 3
+  %sub1 = add nsw i64 %mul, -18
+  %arrayidx3 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub1, i64 %sub, i64 %i.02  ; &A[3*i - 18][18 - i][i]
+  store i32 %conv, i32* %arrayidx3, align 4
+  %arrayidx6 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.02, i64 %i.02, i64 %i.02  ; &A[i][i][i]
+  %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - flow [=|<] splitable!
+; CHECK: da analyze - split level = 1, iteration = 9!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++) {
+;;   A[3*i - 18][22 - i][i] = i;
+;;   *B++ = A[i][i][i]; }
+
+define void @couple15([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32  ; value stored into A: i truncated to i32
+  %sub = sub nsw i64 22, %i.02
+  %mul = mul nsw i64 %i.02, 3
+  %sub1 = add nsw i64 %mul, -18
+  %arrayidx3 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub1, i64 %sub, i64 %i.02  ; &A[3*i - 18][22 - i][i]
+  store i32 %conv, i32* %arrayidx3, align 4
+  %arrayidx6 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.02, i64 %i.02, i64 %i.02  ; &A[i][i][i]
+  %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1, fed back through the phi
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add nsw i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/ExactRDIV.ll b/test/Analysis/DependenceAnalysis/ExactRDIV.ll
new file mode 100644
index 0000000000000..aa5d254a0ce22
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/ExactRDIV.ll
@@ -0,0 +1,508 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'ExactRDIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long int i = 0; i < 10; i++)
+;;    A[4*i + 10] = ...
+;;  for (long int j = 0; j < 10; j++)
+;;    ... = A[2*j + 1];
+
+define void @rdiv0(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl nsw i64 %i.03, 2
+  %add = add nsw i64 %mul, 10
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 10
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %mul5 = shl nsw i64 %j.02, 1
+  %add64 = or i64 %mul5, 1
+  %arrayidx7 = getelementptr inbounds i32* %A, i64 %add64
+  %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc9 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc9, 10
+  br i1 %cmp2, label %for.body4, label %for.end10
+
+for.end10:                                        ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 5; i++)
+;;    A[11*i - 45] = ...
+;;  for (long int j = 0; j < 10; j++)
+;;    ... = A[j];
+
+define void @rdiv1(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 11
+  %sub = add nsw i64 %mul, -45
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 5
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc7 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc7, 10
+  br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8:                                         ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i <= 5; i++)
+;;    A[11*i - 45] = ...
+;;  for (long int j = 0; j < 10; j++)
+;;    ... = A[j];
+
+define void @rdiv2(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 11
+  %sub = add nsw i64 %mul, -45
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 6
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc7 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc7, 10
+  br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8:                                         ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 5; i++)
+;;    A[11*i - 45] = ...
+;;  for (long int j = 0; j <= 10; j++)
+;;    ... = A[j];
+
+define void @rdiv3(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 11
+  %sub = add nsw i64 %mul, -45
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 5
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc7 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc7, 11
+  br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8:                                         ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i <= 5; i++)
+;;    A[11*i - 45] = ...
+;;  for (long int j = 0; j <= 10; j++)
+;;    ... = A[j];
+
+define void @rdiv4(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 11
+  %sub = add nsw i64 %mul, -45
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 6
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - flow!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc7 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc7, 11
+  br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8:                                         ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 5; i++)
+;;    A[-11*i + 45] = ...
+;;  for (long int j = 0; j < 10; j++)
+;;    ... = A[-j];
+
+define void @rdiv5(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, -11
+  %add = add nsw i64 %mul, 45
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 5
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %sub = sub nsw i64 0, %j.02
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc7 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc7, 10
+  br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8:                                         ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i <= 5; i++)
+;;    A[-11*i + 45] = ...
+;;  for (long int j = 0; j < 10; j++)
+;;    ... = A[-j];
+
+define void @rdiv6(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, -11
+  %add = add nsw i64 %mul, 45
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 6
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %sub = sub nsw i64 0, %j.02
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc7 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc7, 10
+  br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8:                                         ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 5; i++)
+;;    A[-11*i + 45] = ...
+;;  for (long int j = 0; j <= 10; j++)
+;;    ... = A[-j];
+
+define void @rdiv7(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, -11
+  %add = add nsw i64 %mul, 45
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 5
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %sub = sub nsw i64 0, %j.02
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc7 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc7, 11
+  br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8:                                         ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i <= 5; i++)
+;;    A[-11*i + 45] = ...
+;;  for (long int j = 0; j <= 10; j++)
+;;    ... = A[-j];
+
+define void @rdiv8(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, -11
+  %add = add nsw i64 %mul, 45
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, 6
+  br i1 %cmp, label %for.body, label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %for.body
+  %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+  %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+  %sub = sub nsw i64 0, %j.02
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - flow!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc7 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc7, 11
+  br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8:                                         ; preds = %for.body4
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 5; i++)
+;;    for (long int j = 0; j < 10; j++)
+;;      A[11*i - j] = ...
+;;      ... = A[45];
+
+define void @rdiv9(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc5, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 11
+  %sub = sub nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 45
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4
+  %inc = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc, 10
+  br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5:                                         ; preds = %for.body3
+  %inc6 = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc6, 5
+  br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7:                                         ; preds = %for.inc5
+  ret void
+}
+
+
+;;  for (long int i = 0; i <= 5; i++)
+;;    for (long int j = 0; j < 10; j++)
+;;      A[11*i - j] = ...
+;;      ... = A[45];
+
+define void @rdiv10(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc5, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 11
+  %sub = sub nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 45
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4
+  %inc = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc, 10
+  br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5:                                         ; preds = %for.body3
+  %inc6 = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc6, 6
+  br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7:                                         ; preds = %for.inc5
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 5; i++)
+;;    for (long int j = 0; j <= 10; j++)
+;;      A[11*i - j] = ...
+;;      ... = A[45];
+
+define void @rdiv11(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc5, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 11
+  %sub = sub nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 45
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4
+  %inc = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc, 11
+  br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5:                                         ; preds = %for.body3
+  %inc6 = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc6, 5
+  br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7:                                         ; preds = %for.inc5
+  ret void
+}
+
+
+;;  for (long int i = 0; i <= 5; i++)
+;;    for (long int j = 0; j <= 10; j++)
+;;      A[11*i - j] = ...
+;;      ... = A[45];
+
+define void @rdiv12(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc5, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 11
+  %sub = sub nsw i64 %mul, %j.02
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 45
+  %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - flow [* *|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4
+  %inc = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc, 11
+  br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5:                                         ; preds = %for.body3
+  %inc6 = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc6, 6
+  br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7:                                         ; preds = %for.inc5
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/ExactSIV.ll b/test/Analysis/DependenceAnalysis/ExactSIV.ll
new file mode 100644
index 0000000000000..71e050246291b
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/ExactSIV.ll
@@ -0,0 +1,428 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'ExactSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long unsigned i = 0; i < 10; i++) {
+;;    A[i + 10] = ...
+;;    ... = A[2*i + 1];
+
+define void @exact0(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %add = add i64 %i.02, 10
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = shl i64 %i.02, 1
+  %add13 = or i64 %mul, 1
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %add13
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [<=|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 10; i++) {
+;;    A[4*i + 10] = ...
+;;    ... = A[2*i + 1];
+
+define void @exact1(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = shl i64 %i.02, 2
+  %add = add i64 %mul, 10
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul1 = shl i64 %i.02, 1
+  %add23 = or i64 %mul1, 1
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %add23
+  %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 10; i++) {
+;;    A[6*i] = ...
+;;    ... = A[i + 60];
+
+define void @exact2(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, 6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %add = add i64 %i.02, 60
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i <= 10; i++) {
+;;    A[6*i] = ...
+;;    ... = A[i + 60];
+
+define void @exact3(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, 6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %add = add i64 %i.02, 60
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [>]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 11
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 12; i++) {
+;;    A[6*i] = ...
+;;    ... = A[i + 60];
+
+define void @exact4(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, 6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %add = add i64 %i.02, 60
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [>]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 12
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i <= 12; i++) {
+;;    A[6*i] = ...
+;;    ... = A[i + 60];
+
+define void @exact5(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, 6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %add = add i64 %i.02, 60
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [=>|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 13
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 18; i++) {
+;;    A[6*i] = ...
+;;    ... = A[i + 60];
+
+define void @exact6(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, 6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %add = add i64 %i.02, 60
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [=>|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 18
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i <= 18; i++) {
+;;    A[6*i] = ...
+;;    ... = A[i + 60];
+
+define void @exact7(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, 6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %add = add i64 %i.02, 60
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [*|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 19
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 10; i++) {
+;;    A[-6*i] = ...
+;;    ... = A[-i - 60];
+
+define void @exact8(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, -6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %sub1 = sub i64 -60, %i.02
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i <= 10; i++) {
+;;    A[-6*i] = ...
+;;    ... = A[-i - 60];
+
+define void @exact9(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, -6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %sub1 = sub i64 -60, %i.02
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [>]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 11
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 12; i++) {
+;;    A[-6*i] = ...
+;;    ... = A[-i - 60];
+
+define void @exact10(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, -6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %sub1 = sub i64 -60, %i.02
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [>]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 12
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i <= 12; i++) {
+;;    A[-6*i] = ...
+;;    ... = A[-i - 60];
+
+define void @exact11(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, -6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %sub1 = sub i64 -60, %i.02
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [=>|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 13
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 18; i++) {
+;;    A[-6*i] = ...
+;;    ... = A[-i - 60];
+
+define void @exact12(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, -6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %sub1 = sub i64 -60, %i.02
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [=>|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 18
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i <= 18; i++) {
+;;    A[-6*i] = ...
+;;    ... = A[-i - 60];
+
+define void @exact13(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = mul i64 %i.02, -6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %sub1 = sub i64 -60, %i.02
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [*|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 19
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/GCD.ll b/test/Analysis/DependenceAnalysis/GCD.ll
new file mode 100644
index 0000000000000..94c93a8a0dd48
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/GCD.ll
@@ -0,0 +1,597 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'GCD.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[2*i - 4*j] = ...
+;;      ... = A[6*i + 8*j];
+
+define void @gcd0(i32* %A, i32* %B) nounwind uwtable ssp {  ; 2-deep nest, i and j in 0..99: store A[2*i - 4*j], load A[6*i + 8*j]
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %mul4 = shl nsw i64 %j.02, 2  ; 4*j
+  %sub = sub nsw i64 %mul, %mul4  ; store subscript: 2*i - 4*j
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul5 = mul nsw i64 %i.03, 6  ; 6*i
+  %mul6 = shl nsw i64 %j.02, 3  ; 8*j
+  %add = add nsw i64 %mul5, %mul6  ; load subscript: 6*i + 8*j
+  %arrayidx7 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - flow [=> *|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 100
+  br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8:                                         ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.04, i64 100  ; B advanced by the 100 inner-loop iterations
+  %inc9 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc9, 100
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[2*i - 4*j] = ...
+;;      ... = A[6*i + 8*j + 1];
+
+define void @gcd1(i32* %A, i32* %B) nounwind uwtable ssp {  ; 2-deep nest, i and j in 0..99: store A[2*i - 4*j], load A[6*i + 8*j + 1]
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc9
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc9 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %mul4 = shl nsw i64 %j.02, 2  ; 4*j
+  %sub = sub nsw i64 %mul, %mul4  ; store subscript: 2*i - 4*j (always even)
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul5 = mul nsw i64 %i.03, 6  ; 6*i
+  %mul6 = shl nsw i64 %j.02, 3  ; 8*j
+  %add = add nsw i64 %mul5, %mul6  ; 6*i + 8*j (always even)
+  %add7 = or i64 %add, 1  ; load subscript: + 1, written as 'or' since the low bit of %add is clear
+  %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7
+  %0 = load i32* %arrayidx8, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 100
+  br i1 %exitcond, label %for.body3, label %for.inc9
+
+for.inc9:                                         ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.04, i64 100  ; B advanced by the 100 inner-loop iterations
+  %inc10 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc10, 100
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end11
+
+for.end11:                                        ; preds = %for.inc9
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[2*i - 4*j + 1] = ...
+;;      ... = A[6*i + 8*j];
+
+define void @gcd2(i32* %A, i32* %B) nounwind uwtable ssp {  ; 2-deep nest, i and j in 0..99: store A[2*i - 4*j + 1], load A[6*i + 8*j]
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc9
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc9 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %mul4 = shl nsw i64 %j.02, 2  ; 4*j
+  %sub = sub nsw i64 %mul, %mul4  ; 2*i - 4*j (always even)
+  %add5 = or i64 %sub, 1  ; store subscript: + 1, written as 'or' since the low bit of %sub is clear
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add5
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul5 = mul nsw i64 %i.03, 6  ; 6*i
+  %mul6 = shl nsw i64 %j.02, 3  ; 8*j
+  %add7 = add nsw i64 %mul5, %mul6  ; load subscript: 6*i + 8*j (always even)
+  %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7
+  %0 = load i32* %arrayidx8, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 100
+  br i1 %exitcond, label %for.body3, label %for.inc9
+
+for.inc9:                                         ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.04, i64 100  ; B advanced by the 100 inner-loop iterations
+  %inc10 = add nsw i64 %i.03, 1
+  %exitcond6 = icmp ne i64 %inc10, 100
+  br i1 %exitcond6, label %for.cond1.preheader, label %for.end11
+
+for.end11:                                        ; preds = %for.inc9
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[i + 2*j] = ...
+;;      ... = A[i + 2*j - 1];
+
+define void @gcd3(i32* %A, i32* %B) nounwind uwtable ssp {  ; 2-deep nest, i and j in 0..99: store A[i + 2*j], load A[i + 2*j - 1]
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc7
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl nsw i64 %j.02, 1  ; 2*j
+  %add = add nsw i64 %i.03, %mul  ; store subscript: i + 2*j
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul4 = shl nsw i64 %j.02, 1  ; 2*j again (same value as %mul)
+  %add5 = add nsw i64 %i.03, %mul4  ; i + 2*j
+  %sub = add nsw i64 %add5, -1  ; load subscript: i + 2*j - 1
+  %arrayidx6 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - flow [<> *]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 100
+  br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7:                                         ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.04, i64 100  ; B advanced by the 100 inner-loop iterations
+  %inc8 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc8, 100
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
+
+
+;;  void gcd4(int *A, int *B, long int M, long int N) {
+;;    for (long int i = 0; i < 100; i++)
+;;      for (long int j = 0; j < 100; j++) {
+;;        A[5*i + 10*j*M + 9*M*N] = i;
+;;        *B++ = A[15*i + 20*j*M - 21*N*M + 4];
+
+define void @gcd4(i32* %A, i32* %B, i64 %M, i64 %N) nounwind uwtable ssp {  ; subscripts with symbolic M, N; load subscript has constant term + 4
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc17
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc17 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc18, %for.inc17 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 5  ; 5*i
+  %mul4 = mul nsw i64 %j.02, 10  ; 10*j
+  %mul5 = mul nsw i64 %mul4, %M  ; 10*j*M
+  %add = add nsw i64 %mul, %mul5  ; 5*i + 10*j*M
+  %mul6 = mul nsw i64 %M, 9  ; 9*M
+  %mul7 = mul nsw i64 %mul6, %N  ; 9*M*N
+  %add8 = add nsw i64 %add, %mul7  ; store subscript: 5*i + 10*j*M + 9*M*N
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add8
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul9 = mul nsw i64 %i.03, 15  ; 15*i
+  %mul10 = mul nsw i64 %j.02, 20  ; 20*j
+  %mul11 = mul nsw i64 %mul10, %M  ; 20*j*M
+  %add12 = add nsw i64 %mul9, %mul11  ; 15*i + 20*j*M
+  %mul13 = mul nsw i64 %N, 21  ; 21*N
+  %mul14 = mul nsw i64 %mul13, %M  ; 21*N*M
+  %sub = sub nsw i64 %add12, %mul14  ; 15*i + 20*j*M - 21*N*M
+  %add15 = add nsw i64 %sub, 4  ; load subscript: ... + 4
+  %arrayidx16 = getelementptr inbounds i32* %A, i64 %add15
+  %0 = load i32* %arrayidx16, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 100
+  br i1 %exitcond, label %for.body3, label %for.inc17
+
+for.inc17:                                        ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.04, i64 100  ; B advanced by the 100 inner-loop iterations
+  %inc18 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc18, 100
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end19
+
+for.end19:                                        ; preds = %for.inc17
+  ret void
+}
+
+
+;;  void gcd5(int *A, int *B, long int M, long int N) {
+;;    for (long int i = 0; i < 100; i++)
+;;      for (long int j = 0; j < 100; j++) {
+;;        A[5*i + 10*j*M + 9*M*N] = i;
+;;        *B++ = A[15*i + 20*j*M - 21*N*M + 5];
+
+define void @gcd5(i32* %A, i32* %B, i64 %M, i64 %N) nounwind uwtable ssp {  ; same subscripts as @gcd4 except the load's constant term is + 5
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc17
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc17 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc18, %for.inc17 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, 5  ; 5*i
+  %mul4 = mul nsw i64 %j.02, 10  ; 10*j
+  %mul5 = mul nsw i64 %mul4, %M  ; 10*j*M
+  %add = add nsw i64 %mul, %mul5  ; 5*i + 10*j*M
+  %mul6 = mul nsw i64 %M, 9  ; 9*M
+  %mul7 = mul nsw i64 %mul6, %N  ; 9*M*N
+  %add8 = add nsw i64 %add, %mul7  ; store subscript: 5*i + 10*j*M + 9*M*N
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add8
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul9 = mul nsw i64 %i.03, 15  ; 15*i
+  %mul10 = mul nsw i64 %j.02, 20  ; 20*j
+  %mul11 = mul nsw i64 %mul10, %M  ; 20*j*M
+  %add12 = add nsw i64 %mul9, %mul11  ; 15*i + 20*j*M
+  %mul13 = mul nsw i64 %N, 21  ; 21*N
+  %mul14 = mul nsw i64 %mul13, %M  ; 21*N*M
+  %sub = sub nsw i64 %add12, %mul14  ; 15*i + 20*j*M - 21*N*M
+  %add15 = add nsw i64 %sub, 5  ; load subscript: ... + 5
+  %arrayidx16 = getelementptr inbounds i32* %A, i64 %add15
+  %0 = load i32* %arrayidx16, align 4
+; CHECK: da analyze - flow [<> *]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %j.02, 1
+  %exitcond = icmp ne i64 %inc, 100
+  br i1 %exitcond, label %for.body3, label %for.inc17
+
+for.inc17:                                        ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.04, i64 100  ; B advanced by the 100 inner-loop iterations
+  %inc18 = add nsw i64 %i.03, 1
+  %exitcond5 = icmp ne i64 %inc18, 100
+  br i1 %exitcond5, label %for.cond1.preheader, label %for.end19
+
+for.end19:                                        ; preds = %for.inc17
+  ret void
+}
+
+
+;;  void gcd6(long int n, int A[][n], int *B) {
+;;    for (long int i = 0; i < n; i++)
+;;      for (long int j = 0; j < n; j++) {
+;;        A[2*i][4*j] = i;
+;;        *B++ = A[8*i][6*j + 1];
+
+define void @gcd6(i64 %n, i32* %A, i32* %B) nounwind uwtable ssp {  ; A[][n] flattened to i32* with row stride n; i64 indices, symbolic bound n
+entry:
+  %cmp4 = icmp sgt i64 %n, 0  ; skip both loops when n <= 0
+  br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end12
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc10
+  %i.06 = phi i64 [ %inc11, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ]  ; outer induction variable i
+  %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc10 ], [ %B, %for.cond1.preheader.preheader ]
+  %cmp21 = icmp sgt i64 %n, 0  ; inner-loop guard; same comparison as %cmp4
+  br i1 %cmp21, label %for.body3.preheader, label %for.inc10
+
+for.body3.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.preheader, %for.body3
+  %j.03 = phi i64 [ %inc, %for.body3 ], [ 0, %for.body3.preheader ]  ; inner induction variable j
+  %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ]
+  %conv = trunc i64 %i.06 to i32
+  %mul = shl nsw i64 %j.03, 2  ; 4*j
+  %mul4 = shl nsw i64 %i.06, 1  ; 2*i
+  %0 = mul nsw i64 %mul4, %n  ; row offset: 2*i*n
+  %arrayidx.sum = add i64 %0, %mul  ; flattened store subscript for A[2*i][4*j]
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %arrayidx.sum
+  store i32 %conv, i32* %arrayidx5, align 4
+  %mul6 = mul nsw i64 %j.03, 6  ; 6*j (always even)
+  %add7 = or i64 %mul6, 1  ; 6*j + 1, written as 'or' since the low bit of %mul6 is clear
+  %mul7 = shl nsw i64 %i.06, 3  ; 8*i
+  %1 = mul nsw i64 %mul7, %n  ; row offset: 8*i*n
+  %arrayidx8.sum = add i64 %1, %add7  ; flattened load subscript for A[8*i][6*j + 1]
+  %arrayidx9 = getelementptr inbounds i32* %A, i64 %arrayidx8.sum
+  %2 = load i32* %arrayidx9, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+  store i32 %2, i32* %B.addr.12, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %j.03, 1
+  %exitcond = icmp ne i64 %inc, %n
+  br i1 %exitcond, label %for.body3, label %for.inc10.loopexit
+
+for.inc10.loopexit:                               ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.05, i64 %n  ; B advanced by the n inner-loop iterations
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.inc10.loopexit, %for.cond1.preheader
+  %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc10.loopexit ]
+  %inc11 = add nsw i64 %i.06, 1
+  %exitcond8 = icmp ne i64 %inc11, %n
+  br i1 %exitcond8, label %for.cond1.preheader, label %for.end12.loopexit
+
+for.end12.loopexit:                               ; preds = %for.inc10
+  br label %for.end12
+
+for.end12:                                        ; preds = %for.end12.loopexit, %entry
+  ret void
+}
+
+
+;;  void gcd7(int n, int A[][n], int *B) {
+;;    for (int i = 0; i < n; i++)
+;;      for (int j = 0; j < n; j++) {
+;;        A[2*i][4*j] = i;
+;;        *B++ = A[8*i][6*j + 1];
+
+define void @gcd7(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp {  ; @gcd6 with 32-bit signed indices: subscripts go through trunc/sext
+entry:
+  %0 = zext i32 %n to i64  ; n widened to i64; used below as the row stride
+  %cmp4 = icmp sgt i32 %n, 0  ; skip both loops when n <= 0
+  br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end15
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc13
+  %indvars.iv8 = phi i64 [ 0, %for.cond1.preheader.preheader ], [ %indvars.iv.next9, %for.inc13 ]  ; outer induction variable (i), widened to i64
+  %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ]
+  %1 = add i32 %n, -1
+  %2 = zext i32 %1 to i64
+  %3 = add i64 %2, 1  ; zext(n - 1) + 1; consumed by %scevgep
+  %cmp21 = icmp sgt i32 %n, 0  ; inner-loop guard; same comparison as %cmp4
+  br i1 %cmp21, label %for.body3.preheader, label %for.inc13
+
+for.body3.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.preheader, %for.body3
+  %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ %indvars.iv.next, %for.body3 ]  ; inner induction variable (j), widened to i64
+  %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ]
+  %4 = trunc i64 %indvars.iv to i32  ; j as i32
+  %mul = shl nsw i32 %4, 2  ; 4*j in 32 bits
+  %idxprom = sext i32 %mul to i64  ; sign-extended column index
+  %5 = trunc i64 %indvars.iv8 to i32  ; i as i32
+  %mul4 = shl nsw i32 %5, 1  ; 2*i in 32 bits
+  %idxprom5 = sext i32 %mul4 to i64  ; sign-extended row index
+  %6 = mul nsw i64 %idxprom5, %0  ; row offset: (2*i) * n
+  %arrayidx.sum = add i64 %6, %idxprom  ; flattened store subscript for A[2*i][4*j]
+  %arrayidx6 = getelementptr inbounds i32* %A, i64 %arrayidx.sum
+  %7 = trunc i64 %indvars.iv8 to i32  ; stored value: i
+  store i32 %7, i32* %arrayidx6, align 4
+  %8 = trunc i64 %indvars.iv to i32  ; j as i32
+  %mul7 = mul nsw i32 %8, 6  ; 6*j in 32 bits
+  %add7 = or i32 %mul7, 1  ; 6*j + 1, written as 'or' since the low bit of %mul7 is clear
+  %idxprom8 = sext i32 %add7 to i64  ; sign-extended column index
+  %9 = trunc i64 %indvars.iv8 to i32  ; i as i32
+  %mul9 = shl nsw i32 %9, 3  ; 8*i in 32 bits
+  %idxprom10 = sext i32 %mul9 to i64  ; sign-extended row index
+  %10 = mul nsw i64 %idxprom10, %0  ; row offset: (8*i) * n
+  %arrayidx11.sum = add i64 %10, %idxprom8  ; flattened load subscript for A[8*i][6*j + 1]
+  %arrayidx12 = getelementptr inbounds i32* %A, i64 %arrayidx11.sum
+  %11 = load i32* %arrayidx12, align 4
+; CHECK: da analyze - flow [* *|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+  store i32 %11, i32* %B.addr.12, align 4  ; *B++ = loaded value
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done in 32 bits against n
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body3, label %for.inc13.loopexit
+
+for.inc13.loopexit:                               ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.05, i64 %3  ; B advanced by %3 elements after the inner loop
+  br label %for.inc13
+
+for.inc13:                                        ; preds = %for.inc13.loopexit, %for.cond1.preheader
+  %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ]
+  %indvars.iv.next9 = add i64 %indvars.iv8, 1
+  %lftr.wideiv10 = trunc i64 %indvars.iv.next9 to i32  ; exit test is done in 32 bits against n
+  %exitcond11 = icmp ne i32 %lftr.wideiv10, %n
+  br i1 %exitcond11, label %for.cond1.preheader, label %for.end15.loopexit
+
+for.end15.loopexit:                               ; preds = %for.inc13
+  br label %for.end15
+
+for.end15:                                        ; preds = %for.end15.loopexit, %entry
+  ret void
+}
+
+
+;;  void gcd8(int n, int *A, int *B) {
+;;    for (int i = 0; i < n; i++)
+;;      for (int j = 0; j < n; j++) {
+;;        A[n*2*i + 4*j] = i;
+;;        *B++ = A[n*8*i + 6*j + 1];
+
+define void @gcd8(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp {  ; flat A; whole subscript computed in 32 bits, then sign-extended once
+entry:
+  %cmp4 = icmp sgt i32 %n, 0  ; skip both loops when n <= 0
+  br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end15
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc13
+  %i.06 = phi i32 [ %inc14, %for.inc13 ], [ 0, %for.cond1.preheader.preheader ]  ; outer induction variable i (kept as i32)
+  %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ]
+  %0 = add i32 %n, -1
+  %1 = zext i32 %0 to i64
+  %2 = add i64 %1, 1  ; zext(n - 1) + 1; consumed by %scevgep
+  %cmp21 = icmp sgt i32 %n, 0  ; inner-loop guard; same comparison as %cmp4
+  br i1 %cmp21, label %for.body3.preheader, label %for.inc13
+
+for.body3.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.preheader, %for.body3
+  %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ %indvars.iv.next, %for.body3 ]  ; inner induction variable (j), widened to i64
+  %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ]
+  %mul = shl nsw i32 %n, 1  ; 2*n
+  %mul4 = mul nsw i32 %mul, %i.06  ; 2*n*i
+  %3 = trunc i64 %indvars.iv to i32  ; j as i32
+  %mul5 = shl nsw i32 %3, 2  ; 4*j
+  %add = add nsw i32 %mul4, %mul5  ; store subscript: n*2*i + 4*j (32-bit)
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+  store i32 %i.06, i32* %arrayidx, align 4
+  %mul6 = shl nsw i32 %n, 3  ; 8*n
+  %mul7 = mul nsw i32 %mul6, %i.06  ; 8*n*i
+  %4 = trunc i64 %indvars.iv to i32  ; j as i32
+  %mul8 = mul nsw i32 %4, 6  ; 6*j
+  %add9 = add nsw i32 %mul7, %mul8  ; n*8*i + 6*j
+  %add10 = or i32 %add9, 1  ; load subscript: + 1, written as 'or' (both addends of %add9 are even multiples)
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds i32* %A, i64 %idxprom11
+  %5 = load i32* %arrayidx12, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+  store i32 %5, i32* %B.addr.12, align 4  ; *B++ = loaded value
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done in 32 bits against n
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body3, label %for.inc13.loopexit
+
+for.inc13.loopexit:                               ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.05, i64 %2  ; B advanced by %2 elements after the inner loop
+  br label %for.inc13
+
+for.inc13:                                        ; preds = %for.inc13.loopexit, %for.cond1.preheader
+  %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ]
+  %inc14 = add nsw i32 %i.06, 1
+  %exitcond7 = icmp ne i32 %inc14, %n
+  br i1 %exitcond7, label %for.cond1.preheader, label %for.end15.loopexit
+
+for.end15.loopexit:                               ; preds = %for.inc13
+  br label %for.end15
+
+for.end15:                                        ; preds = %for.end15.loopexit, %entry
+  ret void
+}
+
+
+;;  void gcd9(unsigned n, int A[][n], int *B) {
+;;    for (unsigned i = 0; i < n; i++)
+;;      for (unsigned j = 0; j < n; j++) {
+;;        A[2*i][4*j] = i;
+;;        *B++ = A[8*i][6*j + 1];
+
+define void @gcd9(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp {  ; unsigned variant of @gcd7: zext instead of sext, no nsw on the 32-bit math
+entry:
+  %0 = zext i32 %n to i64  ; n widened to i64; used below as the row stride
+  %cmp4 = icmp eq i32 %n, 0  ; skip both loops when n == 0
+  br i1 %cmp4, label %for.end15, label %for.cond1.preheader.preheader
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc13
+  %indvars.iv8 = phi i64 [ 0, %for.cond1.preheader.preheader ], [ %indvars.iv.next9, %for.inc13 ]  ; outer induction variable (i), widened to i64
+  %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ]
+  %1 = add i32 %n, -1
+  %2 = zext i32 %1 to i64
+  %3 = add i64 %2, 1  ; zext(n - 1) + 1; consumed by %scevgep
+  %cmp21 = icmp eq i32 %n, 0  ; inner-loop guard; same comparison as %cmp4
+  br i1 %cmp21, label %for.inc13, label %for.body3.preheader
+
+for.body3.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3.preheader, %for.body3
+  %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ %indvars.iv.next, %for.body3 ]  ; inner induction variable (j), widened to i64
+  %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ]
+  %4 = trunc i64 %indvars.iv to i32  ; j as i32
+  %mul = shl i32 %4, 2  ; 4*j in 32 bits (no nsw: may wrap)
+  %idxprom = zext i32 %mul to i64  ; zero-extended column index
+  %5 = trunc i64 %indvars.iv8 to i32  ; i as i32
+  %mul4 = shl i32 %5, 1  ; 2*i in 32 bits (no nsw: may wrap)
+  %idxprom5 = zext i32 %mul4 to i64  ; zero-extended row index
+  %6 = mul nsw i64 %idxprom5, %0  ; row offset: (2*i) * n
+  %arrayidx.sum = add i64 %6, %idxprom  ; flattened store subscript for A[2*i][4*j]
+  %arrayidx6 = getelementptr inbounds i32* %A, i64 %arrayidx.sum
+  %7 = trunc i64 %indvars.iv8 to i32  ; stored value: i
+  store i32 %7, i32* %arrayidx6, align 4
+  %8 = trunc i64 %indvars.iv to i32  ; j as i32
+  %mul7 = mul i32 %8, 6  ; 6*j in 32 bits (no nsw: may wrap)
+  %add7 = or i32 %mul7, 1  ; 6*j + 1, written as 'or' since the low bit of %mul7 is clear
+  %idxprom8 = zext i32 %add7 to i64  ; zero-extended column index
+  %9 = trunc i64 %indvars.iv8 to i32  ; i as i32
+  %mul9 = shl i32 %9, 3  ; 8*i in 32 bits (no nsw: may wrap)
+  %idxprom10 = zext i32 %mul9 to i64  ; zero-extended row index
+  %10 = mul nsw i64 %idxprom10, %0  ; row offset: (8*i) * n
+  %arrayidx11.sum = add i64 %10, %idxprom8  ; flattened load subscript for A[8*i][6*j + 1]
+  %arrayidx12 = getelementptr inbounds i32* %A, i64 %arrayidx11.sum
+  %11 = load i32* %arrayidx12, align 4
+; CHECK: da analyze - flow [* *|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+  store i32 %11, i32* %B.addr.12, align 4  ; *B++ = loaded value
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; exit test is done in 32 bits against n
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body3, label %for.inc13.loopexit
+
+for.inc13.loopexit:                               ; preds = %for.body3
+  %scevgep = getelementptr i32* %B.addr.05, i64 %3  ; B advanced by %3 elements after the inner loop
+  br label %for.inc13
+
+for.inc13:                                        ; preds = %for.inc13.loopexit, %for.cond1.preheader
+  %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ]
+  %indvars.iv.next9 = add i64 %indvars.iv8, 1
+  %lftr.wideiv10 = trunc i64 %indvars.iv.next9 to i32  ; exit test is done in 32 bits against n
+  %exitcond11 = icmp ne i32 %lftr.wideiv10, %n
+  br i1 %exitcond11, label %for.cond1.preheader, label %for.end15.loopexit
+
+for.end15.loopexit:                               ; preds = %for.inc13
+  br label %for.end15
+
+for.end15:                                        ; preds = %for.end15.loopexit, %entry
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Preliminary.ll b/test/Analysis/DependenceAnalysis/Preliminary.ll
new file mode 100644
index 0000000000000..3ef63fd5592ff
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Preliminary.ll
@@ -0,0 +1,469 @@
+; RUN: opt < %s -analyze -basicaa -indvars -da | FileCheck %s
+
+; This series of tests is more interesting when debugging is enabled.
+
+; ModuleID = 'Preliminary.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; may alias
+;; int p0(int n, int *A, int *B) {
+;;  A[0] = n;
+;;  return B[1]; }
+
+define i32 @p0(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp {  ; store through %A, then load through %B; neither pointer is marked noalias
+entry:
+  store i32 %n, i32* %A, align 4  ; A[0] = n
+  %arrayidx1 = getelementptr inbounds i32* %B, i64 1  ; &B[1]
+  %0 = load i32* %arrayidx1, align 4  ; read B[1]; the expectation below is for this store/load pair
+; CHECK: da analyze - confused!
+  ret i32 %0
+}
+
+
+;; no alias
+;; int p1(int n, int *restrict A, int *restrict B) {
+;;  A[0] = n;
+;;  return B[1]; }
+
+define i32 @p1(i32 %n, i32* noalias %A, i32* noalias %B) nounwind uwtable ssp {  ; same body as @p0, but both pointer arguments are noalias
+entry:
+  store i32 %n, i32* %A, align 4  ; A[0] = n
+  %arrayidx1 = getelementptr inbounds i32* %B, i64 1  ; &B[1]
+  %0 = load i32* %arrayidx1, align 4  ; read B[1]; the expectation below is for this store/load pair
+; CHECK: da analyze - none!
+  ret i32 %0
+}
+
+;; check loop nesting levels
+;;  for (long int i = 0; i < n; i++)
+;;    for (long int j = 0; j < n; j++)
+;;      for (long int k = 0; k < n; k++)
+;;        A[i][j][k] = ...
+;;      for (long int k = 0; k < n; k++)
+;;        ... = A[i + 3][j + 2][k + 1];
+
+define void @p2(i64 %n, [100 x [100 x i64]]* %A, i64* %B) nounwind uwtable ssp {  ; i/j nest holding two sibling k loops: the first stores A[i][j][k], the second loads A[i+3][j+2][k+1] and copies it to *B++
+entry:
+  %cmp10 = icmp sgt i64 %n, 0  ; skip the whole nest unless n > 0
+  br i1 %cmp10, label %for.cond1.preheader, label %for.end26
+
+for.cond1.preheader:                              ; preds = %for.inc24, %entry
+  %B.addr.012 = phi i64* [ %B.addr.1.lcssa, %for.inc24 ], [ %B, %entry ]  ; B cursor carried across i iterations
+  %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %entry ]  ; i, starting at 0
+  %cmp26 = icmp sgt i64 %n, 0  ; guard for entering the j loop
+  br i1 %cmp26, label %for.cond4.preheader, label %for.inc24
+
+for.cond4.preheader:                              ; preds = %for.inc21, %for.cond1.preheader
+  %B.addr.18 = phi i64* [ %B.addr.2.lcssa, %for.inc21 ], [ %B.addr.012, %for.cond1.preheader ]  ; B cursor carried across j iterations
+  %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond1.preheader ]  ; j, starting at 0
+  %cmp51 = icmp sgt i64 %n, 0  ; guard for entering the first k loop
+  br i1 %cmp51, label %for.body6, label %for.cond10.loopexit
+
+for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
+  %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.cond4.preheader ]  ; k of the first inner loop
+  %arrayidx8 = getelementptr inbounds [100 x [100 x i64]]* %A, i64 %i.011, i64 %j.07, i64 %k.02  ; &A[i][j][k]
+  store i64 %i.011, i64* %arrayidx8, align 8  ; A[i][j][k] = i
+  %inc = add nsw i64 %k.02, 1
+  %cmp5 = icmp slt i64 %inc, %n
+  br i1 %cmp5, label %for.body6, label %for.cond10.loopexit
+
+for.cond10.loopexit:                              ; preds = %for.body6, %for.cond4.preheader
+  %cmp113 = icmp sgt i64 %n, 0  ; guard for entering the second k loop
+  br i1 %cmp113, label %for.body12, label %for.inc21
+
+for.body12:                                       ; preds = %for.body12, %for.cond10.loopexit
+  %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.cond10.loopexit ]  ; k of the second inner loop
+  %B.addr.24 = phi i64* [ %incdec.ptr, %for.body12 ], [ %B.addr.18, %for.cond10.loopexit ]  ; B cursor inside the second k loop
+  %add = add nsw i64 %k9.05, 1  ; k + 1
+  %add13 = add nsw i64 %j.07, 2  ; j + 2
+  %add14 = add nsw i64 %i.011, 3  ; i + 3
+  %arrayidx17 = getelementptr inbounds [100 x [100 x i64]]* %A, i64 %add14, i64 %add13, i64 %add  ; &A[i+3][j+2][k+1]
+  %0 = load i64* %arrayidx17, align 8  ; the store above and this load sit in different k loops and share only the i and j loops
+; CHECK: da analyze - flow [-3 -2]!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.24, i64 1  ; B + 1
+  store i64 %0, i64* %B.addr.24, align 8  ; *B = loaded value
+  %inc19 = add nsw i64 %k9.05, 1
+  %cmp11 = icmp slt i64 %inc19, %n
+  br i1 %cmp11, label %for.body12, label %for.inc21
+
+for.inc21:                                        ; preds = %for.body12, %for.cond10.loopexit
+  %B.addr.2.lcssa = phi i64* [ %B.addr.18, %for.cond10.loopexit ], [ %incdec.ptr, %for.body12 ]  ; B cursor after the second k loop (unchanged if the loop was skipped)
+  %inc22 = add nsw i64 %j.07, 1  ; j + 1
+  %cmp2 = icmp slt i64 %inc22, %n
+  br i1 %cmp2, label %for.cond4.preheader, label %for.inc24
+
+for.inc24:                                        ; preds = %for.inc21, %for.cond1.preheader
+  %B.addr.1.lcssa = phi i64* [ %B.addr.012, %for.cond1.preheader ], [ %B.addr.2.lcssa, %for.inc21 ]  ; B cursor after the j loop (unchanged if the loop was skipped)
+  %inc25 = add nsw i64 %i.011, 1  ; i + 1
+  %cmp = icmp slt i64 %inc25, %n
+  br i1 %cmp, label %for.cond1.preheader, label %for.end26
+
+for.end26:                                        ; preds = %for.inc24, %entry
+  ret void
+}
+
+
+;; classify subscripts
+;;  for (long int i = 0; i < n; i++)
+;;  for (long int j = 0; j < n; j++)
+;;  for (long int k = 0; k < n; k++)
+;;  for (long int l = 0; l < n; l++)
+;;  for (long int m = 0; m < n; m++)
+;;  for (long int o = 0; o < n; o++)
+;;  for (long int p = 0; p < n; p++)
+;;  for (long int q = 0; q < n; q++)
+;;  for (long int r = 0; r < n; r++)
+;;  for (long int s = 0; s < n; s++)
+;;  for (long int u = 0; u < n; u++)
+;;  for (long int t = 0; t < n; t++) {
+;;          A[i - 3] [j] [2] [k-1] [2*l + 1] [m] [p + q] [r + s] = ...
+;;    ... = A[i + 3] [2] [u] [1-k] [3*l - 1] [o] [1 + n] [t + 2]; }
+
+define void @p3(i64 %n, [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64* %B) nounwind uwtable ssp {  ; twelve perfectly nested loops (i j k l m o p q r s u t); the innermost body stores and loads A with eight subscripts each and copies the loaded value to *B++
+entry:
+  %cmp44 = icmp sgt i64 %n, 0  ; skip the whole nest unless n > 0
+  br i1 %cmp44, label %for.cond1.preheader, label %for.end90
+
+for.cond1.preheader:                              ; preds = %for.inc88, %entry
+  %B.addr.046 = phi i64* [ %B.addr.1.lcssa, %for.inc88 ], [ %B, %entry ]  ; B cursor at the i level
+  %i.045 = phi i64 [ %inc89, %for.inc88 ], [ 0, %entry ]  ; i
+  %cmp240 = icmp sgt i64 %n, 0  ; each level re-tests n > 0 before entering the next inner loop
+  br i1 %cmp240, label %for.cond4.preheader, label %for.inc88
+
+for.cond4.preheader:                              ; preds = %for.inc85, %for.cond1.preheader
+  %B.addr.142 = phi i64* [ %B.addr.2.lcssa, %for.inc85 ], [ %B.addr.046, %for.cond1.preheader ]  ; B cursor at the j level
+  %j.041 = phi i64 [ %inc86, %for.inc85 ], [ 0, %for.cond1.preheader ]  ; j
+  %cmp536 = icmp sgt i64 %n, 0
+  br i1 %cmp536, label %for.cond7.preheader, label %for.inc85
+
+for.cond7.preheader:                              ; preds = %for.inc82, %for.cond4.preheader
+  %B.addr.238 = phi i64* [ %B.addr.3.lcssa, %for.inc82 ], [ %B.addr.142, %for.cond4.preheader ]  ; B cursor at the k level
+  %k.037 = phi i64 [ %inc83, %for.inc82 ], [ 0, %for.cond4.preheader ]  ; k
+  %cmp832 = icmp sgt i64 %n, 0
+  br i1 %cmp832, label %for.cond10.preheader, label %for.inc82
+
+for.cond10.preheader:                             ; preds = %for.inc79, %for.cond7.preheader
+  %B.addr.334 = phi i64* [ %B.addr.4.lcssa, %for.inc79 ], [ %B.addr.238, %for.cond7.preheader ]  ; B cursor at the l level
+  %l.033 = phi i64 [ %inc80, %for.inc79 ], [ 0, %for.cond7.preheader ]  ; l
+  %cmp1128 = icmp sgt i64 %n, 0
+  br i1 %cmp1128, label %for.cond13.preheader, label %for.inc79
+
+for.cond13.preheader:                             ; preds = %for.inc76, %for.cond10.preheader
+  %B.addr.430 = phi i64* [ %B.addr.5.lcssa, %for.inc76 ], [ %B.addr.334, %for.cond10.preheader ]  ; B cursor at the m level
+  %m.029 = phi i64 [ %inc77, %for.inc76 ], [ 0, %for.cond10.preheader ]  ; m
+  %cmp1424 = icmp sgt i64 %n, 0
+  br i1 %cmp1424, label %for.cond16.preheader, label %for.inc76
+
+for.cond16.preheader:                             ; preds = %for.inc73, %for.cond13.preheader
+  %B.addr.526 = phi i64* [ %B.addr.6.lcssa, %for.inc73 ], [ %B.addr.430, %for.cond13.preheader ]  ; B cursor at the o level
+  %o.025 = phi i64 [ %inc74, %for.inc73 ], [ 0, %for.cond13.preheader ]  ; o
+  %cmp1720 = icmp sgt i64 %n, 0
+  br i1 %cmp1720, label %for.cond19.preheader, label %for.inc73
+
+for.cond19.preheader:                             ; preds = %for.inc70, %for.cond16.preheader
+  %B.addr.622 = phi i64* [ %B.addr.7.lcssa, %for.inc70 ], [ %B.addr.526, %for.cond16.preheader ]  ; B cursor at the p level
+  %p.021 = phi i64 [ %inc71, %for.inc70 ], [ 0, %for.cond16.preheader ]  ; p
+  %cmp2016 = icmp sgt i64 %n, 0
+  br i1 %cmp2016, label %for.cond22.preheader, label %for.inc70
+
+for.cond22.preheader:                             ; preds = %for.inc67, %for.cond19.preheader
+  %B.addr.718 = phi i64* [ %B.addr.8.lcssa, %for.inc67 ], [ %B.addr.622, %for.cond19.preheader ]  ; B cursor at the q level
+  %q.017 = phi i64 [ %inc68, %for.inc67 ], [ 0, %for.cond19.preheader ]  ; q
+  %cmp2312 = icmp sgt i64 %n, 0
+  br i1 %cmp2312, label %for.cond25.preheader, label %for.inc67
+
+for.cond25.preheader:                             ; preds = %for.inc64, %for.cond22.preheader
+  %B.addr.814 = phi i64* [ %B.addr.9.lcssa, %for.inc64 ], [ %B.addr.718, %for.cond22.preheader ]  ; B cursor at the r level
+  %r.013 = phi i64 [ %inc65, %for.inc64 ], [ 0, %for.cond22.preheader ]  ; r
+  %cmp268 = icmp sgt i64 %n, 0
+  br i1 %cmp268, label %for.cond28.preheader, label %for.inc64
+
+for.cond28.preheader:                             ; preds = %for.inc61, %for.cond25.preheader
+  %B.addr.910 = phi i64* [ %B.addr.10.lcssa, %for.inc61 ], [ %B.addr.814, %for.cond25.preheader ]  ; B cursor at the s level
+  %s.09 = phi i64 [ %inc62, %for.inc61 ], [ 0, %for.cond25.preheader ]  ; s
+  %cmp294 = icmp sgt i64 %n, 0
+  br i1 %cmp294, label %for.cond31.preheader, label %for.inc61
+
+for.cond31.preheader:                             ; preds = %for.inc58, %for.cond28.preheader
+  %u.06 = phi i64 [ %inc59, %for.inc58 ], [ 0, %for.cond28.preheader ]  ; u
+  %B.addr.105 = phi i64* [ %B.addr.11.lcssa, %for.inc58 ], [ %B.addr.910, %for.cond28.preheader ]  ; B cursor at the u level
+  %cmp321 = icmp sgt i64 %n, 0
+  br i1 %cmp321, label %for.body33, label %for.inc58
+
+for.body33:                                       ; preds = %for.body33, %for.cond31.preheader
+  %t.03 = phi i64 [ %inc, %for.body33 ], [ 0, %for.cond31.preheader ]  ; t, the innermost induction variable
+  %B.addr.112 = phi i64* [ %incdec.ptr, %for.body33 ], [ %B.addr.105, %for.cond31.preheader ]  ; B cursor inside the innermost loop
+  %add = add nsw i64 %r.013, %s.09  ; r + s
+  %add34 = add nsw i64 %p.021, %q.017  ; p + q
+  %mul = shl nsw i64 %l.033, 1  ; 2*l
+  %add3547 = or i64 %mul, 1  ; 2*l + 1, written as an or because the shifted value has a zero low bit
+  %sub = add nsw i64 %k.037, -1  ; k - 1
+  %sub36 = add nsw i64 %i.045, -3  ; i - 3
+  %arrayidx43 = getelementptr inbounds [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64 %sub36, i64 %j.041, i64 2, i64 %sub, i64 %add3547, i64 %m.029, i64 %add34, i64 %add  ; &A[i-3][j][2][k-1][2*l+1][m][p+q][r+s]
+  store i64 %i.045, i64* %arrayidx43, align 8  ; stored value is i
+  %add44 = add nsw i64 %t.03, 2  ; t + 2
+  %add45 = add nsw i64 %n, 1  ; n + 1 (does not vary with any loop)
+  %mul46 = mul nsw i64 %l.033, 3  ; 3*l
+  %sub47 = add nsw i64 %mul46, -1  ; 3*l - 1
+  %sub48 = sub nsw i64 1, %k.037  ; 1 - k
+  %add49 = add nsw i64 %i.045, 3  ; i + 3
+  %arrayidx57 = getelementptr inbounds [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64 %add49, i64 2, i64 %u.06, i64 %sub48, i64 %sub47, i64 %o.025, i64 %add45, i64 %add44  ; &A[i+3][2][u][1-k][3*l-1][o][n+1][t+2]
+  %0 = load i64* %arrayidx57, align 8  ; the two expectations below are for the store above paired with this load
+; CHECK: da analyze - flow [-6 * * => * * * * * * * *] splitable!
+; CHECK: da analyze - split level = 3, iteration = 1!
+  %incdec.ptr = getelementptr inbounds i64* %B.addr.112, i64 1  ; B + 1
+  store i64 %0, i64* %B.addr.112, align 8  ; *B = loaded value
+  %inc = add nsw i64 %t.03, 1  ; t + 1
+  %cmp32 = icmp slt i64 %inc, %n
+  br i1 %cmp32, label %for.body33, label %for.inc58
+
+for.inc58:                                        ; preds = %for.body33, %for.cond31.preheader
+  %B.addr.11.lcssa = phi i64* [ %B.addr.105, %for.cond31.preheader ], [ %incdec.ptr, %for.body33 ]  ; B cursor after the t loop (unchanged if it was skipped)
+  %inc59 = add nsw i64 %u.06, 1  ; u + 1
+  %cmp29 = icmp slt i64 %inc59, %n
+  br i1 %cmp29, label %for.cond31.preheader, label %for.inc61
+
+for.inc61:                                        ; preds = %for.inc58, %for.cond28.preheader
+  %B.addr.10.lcssa = phi i64* [ %B.addr.910, %for.cond28.preheader ], [ %B.addr.11.lcssa, %for.inc58 ]  ; B cursor after the u loop
+  %inc62 = add nsw i64 %s.09, 1  ; s + 1
+  %cmp26 = icmp slt i64 %inc62, %n
+  br i1 %cmp26, label %for.cond28.preheader, label %for.inc64
+
+for.inc64:                                        ; preds = %for.inc61, %for.cond25.preheader
+  %B.addr.9.lcssa = phi i64* [ %B.addr.814, %for.cond25.preheader ], [ %B.addr.10.lcssa, %for.inc61 ]  ; B cursor after the s loop
+  %inc65 = add nsw i64 %r.013, 1  ; r + 1
+  %cmp23 = icmp slt i64 %inc65, %n
+  br i1 %cmp23, label %for.cond25.preheader, label %for.inc67
+
+for.inc67:                                        ; preds = %for.inc64, %for.cond22.preheader
+  %B.addr.8.lcssa = phi i64* [ %B.addr.718, %for.cond22.preheader ], [ %B.addr.9.lcssa, %for.inc64 ]  ; B cursor after the r loop
+  %inc68 = add nsw i64 %q.017, 1  ; q + 1
+  %cmp20 = icmp slt i64 %inc68, %n
+  br i1 %cmp20, label %for.cond22.preheader, label %for.inc70
+
+for.inc70:                                        ; preds = %for.inc67, %for.cond19.preheader
+  %B.addr.7.lcssa = phi i64* [ %B.addr.622, %for.cond19.preheader ], [ %B.addr.8.lcssa, %for.inc67 ]  ; B cursor after the q loop
+  %inc71 = add nsw i64 %p.021, 1  ; p + 1
+  %cmp17 = icmp slt i64 %inc71, %n
+  br i1 %cmp17, label %for.cond19.preheader, label %for.inc73
+
+for.inc73:                                        ; preds = %for.inc70, %for.cond16.preheader
+  %B.addr.6.lcssa = phi i64* [ %B.addr.526, %for.cond16.preheader ], [ %B.addr.7.lcssa, %for.inc70 ]  ; B cursor after the p loop
+  %inc74 = add nsw i64 %o.025, 1  ; o + 1
+  %cmp14 = icmp slt i64 %inc74, %n
+  br i1 %cmp14, label %for.cond16.preheader, label %for.inc76
+
+for.inc76:                                        ; preds = %for.inc73, %for.cond13.preheader
+  %B.addr.5.lcssa = phi i64* [ %B.addr.430, %for.cond13.preheader ], [ %B.addr.6.lcssa, %for.inc73 ]  ; B cursor after the o loop
+  %inc77 = add nsw i64 %m.029, 1  ; m + 1
+  %cmp11 = icmp slt i64 %inc77, %n
+  br i1 %cmp11, label %for.cond13.preheader, label %for.inc79
+
+for.inc79:                                        ; preds = %for.inc76, %for.cond10.preheader
+  %B.addr.4.lcssa = phi i64* [ %B.addr.334, %for.cond10.preheader ], [ %B.addr.5.lcssa, %for.inc76 ]  ; B cursor after the m loop
+  %inc80 = add nsw i64 %l.033, 1  ; l + 1
+  %cmp8 = icmp slt i64 %inc80, %n
+  br i1 %cmp8, label %for.cond10.preheader, label %for.inc82
+
+for.inc82:                                        ; preds = %for.inc79, %for.cond7.preheader
+  %B.addr.3.lcssa = phi i64* [ %B.addr.238, %for.cond7.preheader ], [ %B.addr.4.lcssa, %for.inc79 ]  ; B cursor after the l loop
+  %inc83 = add nsw i64 %k.037, 1  ; k + 1
+  %cmp5 = icmp slt i64 %inc83, %n
+  br i1 %cmp5, label %for.cond7.preheader, label %for.inc85
+
+for.inc85:                                        ; preds = %for.inc82, %for.cond4.preheader
+  %B.addr.2.lcssa = phi i64* [ %B.addr.142, %for.cond4.preheader ], [ %B.addr.3.lcssa, %for.inc82 ]  ; B cursor after the k loop
+  %inc86 = add nsw i64 %j.041, 1  ; j + 1
+  %cmp2 = icmp slt i64 %inc86, %n
+  br i1 %cmp2, label %for.cond4.preheader, label %for.inc88
+
+for.inc88:                                        ; preds = %for.inc85, %for.cond1.preheader
+  %B.addr.1.lcssa = phi i64* [ %B.addr.046, %for.cond1.preheader ], [ %B.addr.2.lcssa, %for.inc85 ]  ; B cursor after the j loop
+  %inc89 = add nsw i64 %i.045, 1  ; i + 1
+  %cmp = icmp slt i64 %inc89, %n
+  br i1 %cmp, label %for.cond1.preheader, label %for.end90
+
+for.end90:                                        ; preds = %for.inc88, %entry
+  ret void
+}
+
+
+;; cleanup around chars, shorts, ints
+;;void p4(int *A, int *B, long int n)
+;;  for (char i = 0; i < n; i++)
+;;    A[i + 2] = ...
+;;    ... = A[i];
+
+define void @p4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {  ; loop with an i8 induction variable: stores A[i+2], loads A[i], copies the loaded value to *B++
+entry:
+  %cmp1 = icmp sgt i64 %n, 0  ; run the loop only when n > 0
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i8 [ %inc, %for.body ], [ 0, %entry ]  ; i is only 8 bits wide
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]  ; B cursor
+  %conv2 = sext i8 %i.03 to i32  ; value to store: i widened to i32
+  %conv3 = sext i8 %i.03 to i64  ; i widened to i64 before the +2
+  %add = add i64 %conv3, 2  ; i + 2, computed in i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[i + 2]
+  store i32 %conv2, i32* %arrayidx, align 4  ; A[i + 2] = i
+  %idxprom4 = sext i8 %i.03 to i64
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %idxprom4  ; &A[i]
+  %0 = load i32* %arrayidx5, align 4  ; read A[i]
+; CHECK: da analyze - flow [*|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B = A[i]
+  %inc = add i8 %i.03, 1  ; the i8 increment carries no nsw/nuw flag
+  %conv = sext i8 %inc to i64
+  %cmp = icmp slt i64 %conv, %n  ; exit test compares the sign-extended i8 against the i64 bound
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;void p5(int *A, int *B, long int n)
+;;  for (short i = 0; i < n; i++)
+;;    A[i + 2] = ...
+;;    ... = A[i];
+
+define void @p5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {  ; same shape as @p4 but with an i16 induction variable: stores A[i+2], loads A[i], copies the loaded value to *B++
+entry:
+  %cmp1 = icmp sgt i64 %n, 0  ; run the loop only when n > 0
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i16 [ %inc, %for.body ], [ 0, %entry ]  ; i is 16 bits wide
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]  ; B cursor
+  %conv2 = sext i16 %i.03 to i32  ; value to store: i widened to i32
+  %conv3 = sext i16 %i.03 to i64  ; i widened to i64 before the +2
+  %add = add i64 %conv3, 2  ; i + 2, computed in i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[i + 2]
+  store i32 %conv2, i32* %arrayidx, align 4  ; A[i + 2] = i
+  %idxprom4 = sext i16 %i.03 to i64
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %idxprom4  ; &A[i]
+  %0 = load i32* %arrayidx5, align 4  ; read A[i]
+; CHECK: da analyze - flow [*|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B = A[i]
+  %inc = add i16 %i.03, 1  ; the i16 increment carries no nsw/nuw flag
+  %conv = sext i16 %inc to i64
+  %cmp = icmp slt i64 %conv, %n  ; exit test compares the sign-extended i16 against the i64 bound
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;void p6(int *A, int *B, long int n)
+;;  for (int i = 0; i < n; i++)
+;;    A[i + 2] = ...
+;;    ... = A[i];
+
+define void @p6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {  ; i32 variant of @p4/@p5: stores A[i+2], loads A[i], copies the loaded value to *B++
+entry:
+  %cmp1 = icmp sgt i64 %n, 0  ; run the loop only when n > 0
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; i is 32 bits wide
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]  ; B cursor
+  %add = add nsw i32 %i.03, 2  ; i + 2, computed in i32 with nsw before widening
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom  ; &A[i + 2]
+  store i32 %i.03, i32* %arrayidx, align 4  ; A[i + 2] = i
+  %idxprom2 = sext i32 %i.03 to i64
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2  ; &A[i]
+  %0 = load i32* %arrayidx3, align 4  ; read A[i]
+; CHECK: da analyze - consistent flow [2]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B = A[i]
+  %inc = add nsw i32 %i.03, 1  ; unlike @p4/@p5, the increment is marked nsw
+  %conv = sext i32 %inc to i64
+  %cmp = icmp slt i64 %conv, %n  ; exit test compares the sign-extended i32 against the i64 bound
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;void p7(unsigned *A, unsigned *B,  char n)
+;;  A[n] = ...
+;;  ... = A[n + 1];
+
+define void @p7(i32* %A, i32* %B, i8 signext %n) nounwind uwtable ssp {  ; straight-line code: stores A[n], loads A[n+1], with n an i8 argument
+entry:
+  %idxprom = sext i8 %n to i64  ; n widened to i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom  ; &A[n]
+  store i32 0, i32* %arrayidx, align 4  ; A[n] = 0
+  %conv = sext i8 %n to i64
+  %add = add i64 %conv, 1  ; n + 1, computed after widening to i64
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %add  ; &A[n + 1]
+  %0 = load i32* %arrayidx2, align 4  ; read A[n + 1]
+; CHECK: da analyze - none!
+  store i32 %0, i32* %B, align 4  ; B[0] = loaded value
+  ret void
+}
+
+
+
+;;void p8(unsigned *A, unsigned *B,  short n)
+;;  A[n] = ...
+;;  ... = A[n + 1];
+
+define void @p8(i32* %A, i32* %B, i16 signext %n) nounwind uwtable ssp {  ; straight-line code: stores A[n], loads A[n+1], with n an i16 argument
+entry:
+  %idxprom = sext i16 %n to i64  ; n widened to i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom  ; &A[n]
+  store i32 0, i32* %arrayidx, align 4  ; A[n] = 0
+  %conv = sext i16 %n to i64
+  %add = add i64 %conv, 1  ; n + 1, computed after widening to i64
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %add  ; &A[n + 1]
+  %0 = load i32* %arrayidx2, align 4  ; read A[n + 1]
+; CHECK: da analyze - none!
+  store i32 %0, i32* %B, align 4  ; B[0] = loaded value
+  ret void
+}
+
+
+;;void p9(unsigned *A, unsigned *B,  int n)
+;;  A[n] = ...
+;;  ... = A[n + 1];
+
+define void @p9(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp {  ; straight-line code: stores A[n], loads A[n+1], with n an i32 that is sign-extended for indexing
+entry:
+  %idxprom = sext i32 %n to i64  ; n widened to i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom  ; &A[n]
+  store i32 0, i32* %arrayidx, align 4  ; A[n] = 0
+  %add = add nsw i32 %n, 1  ; n + 1, computed in i32 with nsw before widening
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1  ; &A[n + 1]
+  %0 = load i32* %arrayidx2, align 4  ; read A[n + 1]
+; CHECK: da analyze - none!
+  store i32 %0, i32* %B, align 4  ; B[0] = loaded value
+  ret void
+}
+
+
+;;void p10(unsigned *A, unsigned *B,  unsigned n)
+;;  A[n] = ...
+;;  ... = A[n + 1];
+
+define void @p10(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp {  ; straight-line code: stores A[n], loads A[n+1], with n an i32 that is zero-extended for indexing
+entry:
+  %idxprom = zext i32 %n to i64  ; n widened to i64 as unsigned
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom  ; &A[n]
+  store i32 0, i32* %arrayidx, align 4  ; A[n] = 0
+  %add = add i32 %n, 1  ; n + 1 in i32; the add carries no nsw/nuw flag
+  %idxprom1 = zext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1  ; &A[n + 1]
+  %0 = load i32* %arrayidx2, align 4  ; read A[n + 1]
+; CHECK: da analyze - none!
+  store i32 %0, i32* %B, align 4  ; B[0] = loaded value
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Propagating.ll b/test/Analysis/DependenceAnalysis/Propagating.ll
new file mode 100644
index 0000000000000..076348c68dc8d
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Propagating.ll
@@ -0,0 +1,467 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'Propagating.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[i + 1][i + j] = i;
+;;      *B++ = A[i][i + j];
+
+define void @prop0([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {  ; 100x100 i/j nest: stores A[i+1][i+j], loads A[i][i+j], copies the loaded value to *B++; %n is unused
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc9, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc9 ]  ; B cursor carried across i iterations
+  %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ]  ; i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; B cursor inside the j loop
+  %conv = trunc i64 %i.03 to i32  ; value to store: i narrowed to i32
+  %add = add nsw i64 %i.03, %j.02  ; i + j
+  %add4 = add nsw i64 %i.03, 1  ; i + 1
+  %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %add4, i64 %add  ; &A[i+1][i+j]
+  store i32 %conv, i32* %arrayidx5, align 4  ; A[i+1][i+j] = i
+  %add6 = add nsw i64 %i.03, %j.02  ; i + j
+  %arrayidx8 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add6  ; &A[i][i+j]
+  %0 = load i32* %arrayidx8, align 4  ; read A[i][i+j]
+; CHECK: da analyze - consistent flow [1 -1]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = loaded value
+  %inc = add nsw i64 %j.02, 1  ; j + 1
+  %cmp2 = icmp slt i64 %inc, 100  ; j loop runs while j + 1 < 100
+  br i1 %cmp2, label %for.body3, label %for.inc9
+
+for.inc9:                                         ; preds = %for.body3
+  %inc10 = add nsw i64 %i.03, 1  ; i + 1
+  %cmp = icmp slt i64 %inc10, 100  ; i loop runs while i + 1 < 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end11
+
+for.end11:                                        ; preds = %for.inc9
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      for (long int k = 0; k < 100; k++)
+;;        A[j - i][i + 1][j + k] = ...
+;;        ... = A[j - i][i][j + k];
+
+define void @prop1([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {  ; 100x100x100 i/j/k nest: stores A[j-i][i+1][j+k], loads A[j-i][i][j+k], copies the loaded value to *B++; %n is unused
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc18, %entry
+  %B.addr.06 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc18 ]  ; B cursor carried across i iterations
+  %i.05 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]  ; i
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc15, %for.cond1.preheader
+  %B.addr.14 = phi i32* [ %B.addr.06, %for.cond1.preheader ], [ %incdec.ptr, %for.inc15 ]  ; B cursor carried across j iterations
+  %j.03 = phi i64 [ 0, %for.cond1.preheader ], [ %inc16, %for.inc15 ]  ; j
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
+  %k.02 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]  ; k
+  %B.addr.21 = phi i32* [ %B.addr.14, %for.cond4.preheader ], [ %incdec.ptr, %for.body6 ]  ; B cursor inside the k loop
+  %conv = trunc i64 %i.05 to i32  ; value to store: i narrowed to i32
+  %add = add nsw i64 %j.03, %k.02  ; j + k
+  %add7 = add nsw i64 %i.05, 1  ; i + 1
+  %sub = sub nsw i64 %j.03, %i.05  ; j - i
+  %arrayidx9 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub, i64 %add7, i64 %add  ; &A[j-i][i+1][j+k]
+  store i32 %conv, i32* %arrayidx9, align 4  ; A[j-i][i+1][j+k] = i
+  %add10 = add nsw i64 %j.03, %k.02  ; j + k
+  %sub11 = sub nsw i64 %j.03, %i.05  ; j - i
+  %arrayidx14 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub11, i64 %i.05, i64 %add10  ; &A[j-i][i][j+k]
+  %0 = load i32* %arrayidx14, align 4  ; read A[j-i][i][j+k]
+; CHECK: da analyze - consistent flow [1 1 -1]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.21, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.21, align 4  ; *B = loaded value
+  %inc = add nsw i64 %k.02, 1  ; k + 1
+  %cmp5 = icmp slt i64 %inc, 100
+  br i1 %cmp5, label %for.body6, label %for.inc15
+
+for.inc15:                                        ; preds = %for.body6
+  %inc16 = add nsw i64 %j.03, 1  ; j + 1
+  %cmp2 = icmp slt i64 %inc16, 100
+  br i1 %cmp2, label %for.cond4.preheader, label %for.inc18
+
+for.inc18:                                        ; preds = %for.inc15
+  %inc19 = add nsw i64 %i.05, 1  ; i + 1
+  %cmp = icmp slt i64 %inc19, 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end20
+
+for.end20:                                        ; preds = %for.inc18
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[i - 1][2*i] = ...
+;;      ... = A[i][i + j + 110];
+
+define void @prop2([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {  ; 100x100 i/j nest: stores A[i-1][2*i], loads A[i][i+j+110], copies the loaded value to *B++; %n is unused
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc8, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc8 ]  ; B cursor carried across i iterations
+  %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]  ; i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; B cursor inside the j loop
+  %conv = trunc i64 %i.03 to i32  ; value to store: i narrowed to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %sub = add nsw i64 %i.03, -1  ; i - 1
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %sub, i64 %mul  ; &A[i-1][2*i] (the store subscripts do not use j)
+  store i32 %conv, i32* %arrayidx4, align 4  ; A[i-1][2*i] = i
+  %add = add nsw i64 %i.03, %j.02  ; i + j
+  %add5 = add nsw i64 %add, 110  ; i + j + 110
+  %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add5  ; &A[i][i+j+110]
+  %0 = load i32* %arrayidx7, align 4  ; read A[i][i+j+110]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = loaded value
+  %inc = add nsw i64 %j.02, 1  ; j + 1
+  %cmp2 = icmp slt i64 %inc, 100
+  br i1 %cmp2, label %for.body3, label %for.inc8
+
+for.inc8:                                         ; preds = %for.body3
+  %inc9 = add nsw i64 %i.03, 1  ; i + 1
+  %cmp = icmp slt i64 %inc9, 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end10
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[i][2*j + i] = ...
+;;      ... = A[i][2*j - i + 5];
+
+define void @prop3([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {  ; 100x100 i/j nest: stores A[i][2*j+i], loads A[i][2*j-i+5], copies the loaded value to *B++; %n is unused
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc9, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc9 ]  ; B cursor carried across i iterations
+  %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ]  ; i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; B cursor inside the j loop
+  %conv = trunc i64 %i.03 to i32  ; value to store: i narrowed to i32
+  %mul = shl nsw i64 %j.02, 1  ; 2*j
+  %add = add nsw i64 %mul, %i.03  ; 2*j + i
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add  ; &A[i][2*j+i]
+  store i32 %conv, i32* %arrayidx4, align 4  ; A[i][2*j+i] = i
+  %mul5 = shl nsw i64 %j.02, 1  ; 2*j
+  %sub = sub nsw i64 %mul5, %i.03  ; 2*j - i
+  %add6 = add nsw i64 %sub, 5  ; 2*j - i + 5
+  %arrayidx8 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add6  ; &A[i][2*j-i+5]
+  %0 = load i32* %arrayidx8, align 4  ; read A[i][2*j-i+5]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = loaded value
+  %inc = add nsw i64 %j.02, 1  ; j + 1
+  %cmp2 = icmp slt i64 %inc, 100
+  br i1 %cmp2, label %for.body3, label %for.inc9
+
+for.inc9:                                         ; preds = %for.body3
+  %inc10 = add nsw i64 %i.03, 1  ; i + 1
+  %cmp = icmp slt i64 %inc10, 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end11
+
+for.end11:                                        ; preds = %for.inc9
+  ret void
+}
+
+
+;; propagate Distance
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[i + 2][2*i + j + 1] = ...
+;;      ... = A[i][2*i + j];
+
+define void @prop4([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {  ; 100x100 i/j nest: stores A[i+2][2*i+j+1], loads A[i][2*i+j], copies the loaded value to *B++; %n is unused
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc11, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc11 ]  ; B cursor carried across i iterations
+  %i.03 = phi i64 [ 0, %entry ], [ %inc12, %for.inc11 ]  ; i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; B cursor inside the j loop
+  %conv = trunc i64 %i.03 to i32  ; value to store: i narrowed to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %add = add nsw i64 %mul, %j.02  ; 2*i + j
+  %add4 = add nsw i64 %add, 1  ; 2*i + j + 1
+  %add5 = add nsw i64 %i.03, 2  ; i + 2
+  %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %add5, i64 %add4  ; &A[i+2][2*i+j+1]
+  store i32 %conv, i32* %arrayidx6, align 4  ; A[i+2][2*i+j+1] = i
+  %mul7 = shl nsw i64 %i.03, 1  ; 2*i
+  %add8 = add nsw i64 %mul7, %j.02  ; 2*i + j
+  %arrayidx10 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add8  ; &A[i][2*i+j]
+  %0 = load i32* %arrayidx10, align 4  ; read A[i][2*i+j]
+; CHECK: da analyze - consistent flow [2 -3]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = loaded value
+  %inc = add nsw i64 %j.02, 1  ; j + 1
+  %cmp2 = icmp slt i64 %inc, 100
+  br i1 %cmp2, label %for.body3, label %for.inc11
+
+for.inc11:                                        ; preds = %for.body3
+  %inc12 = add nsw i64 %i.03, 1  ; i + 1
+  %cmp = icmp slt i64 %inc12, 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end13
+
+for.end13:                                        ; preds = %for.inc11
+  ret void
+}
+
+
+;; propagate Point
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[3*i - 18][22 - i][2*i + j] = ...
+;;      ... = A[i][i][3*i + j];
+
+define void @prop5([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {  ; 100x100 i/j nest: stores A[3*i-18][22-i][2*i+j], loads A[i][i][3*i+j], copies the loaded value to *B++; %n is unused
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc13, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc13 ]  ; B cursor carried across i iterations
+  %i.03 = phi i64 [ 0, %entry ], [ %inc14, %for.inc13 ]  ; i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; B cursor inside the j loop
+  %conv = trunc i64 %i.03 to i32  ; value to store: i narrowed to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %add = add nsw i64 %mul, %j.02  ; 2*i + j
+  %sub = sub nsw i64 22, %i.03  ; 22 - i
+  %mul4 = mul nsw i64 %i.03, 3  ; 3*i
+  %sub5 = add nsw i64 %mul4, -18  ; 3*i - 18
+  %arrayidx7 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub5, i64 %sub, i64 %add  ; &A[3*i-18][22-i][2*i+j]
+  store i32 %conv, i32* %arrayidx7, align 4  ; A[3*i-18][22-i][2*i+j] = i
+  %mul8 = mul nsw i64 %i.03, 3  ; 3*i
+  %add9 = add nsw i64 %mul8, %j.02  ; 3*i + j
+  %arrayidx12 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.03, i64 %i.03, i64 %add9  ; &A[i][i][3*i+j]
+  %0 = load i32* %arrayidx12, align 4  ; read A[i][i][3*i+j]; the two expectations below are for the store above paired with this load
+; CHECK: da analyze - flow [< -16] splitable!
+; CHECK: da analyze - split level = 1, iteration = 11!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = loaded value
+  %inc = add nsw i64 %j.02, 1  ; j + 1
+  %cmp2 = icmp slt i64 %inc, 100
+  br i1 %cmp2, label %for.body3, label %for.inc13
+
+for.inc13:                                        ; preds = %for.body3
+  %inc14 = add nsw i64 %i.03, 1  ; i + 1
+  %cmp = icmp slt i64 %inc14, 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end15
+
+for.end15:                                        ; preds = %for.inc13
+  ret void
+}
+
+
+;; propagate Line
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[i + 1][4*i + j + 2] = ...
+;;      ... = A[2*i][8*i + j];
+
+define void @prop6([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; two nested loops, i and j each run 0..99; body stores to A, reloads from A, copies the load to *B++
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc12, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc12 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc13, %for.inc12 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; running output pointer into B
+  %conv = trunc i64 %i.03 to i32  ; value written to A is i truncated to i32
+  %mul = shl nsw i64 %i.03, 2  ; 4*i
+  %add = add nsw i64 %mul, %j.02  ; 4*i + j
+  %add4 = add nsw i64 %add, 2  ; 4*i + j + 2
+  %add5 = add nsw i64 %i.03, 1  ; i + 1
+  %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %add5, i64 %add4  ; &A[i + 1][4*i + j + 2]
+  store i32 %conv, i32* %arrayidx6, align 4
+  %mul7 = shl nsw i64 %i.03, 3  ; 8*i
+  %add8 = add nsw i64 %mul7, %j.02  ; 8*i + j
+  %mul9 = shl nsw i64 %i.03, 1  ; 2*i
+  %arrayidx11 = getelementptr inbounds [100 x i32]* %A, i64 %mul9, i64 %add8  ; &A[2*i][8*i + j]
+  %0 = load i32* %arrayidx11, align 4
+; CHECK: da analyze - flow [=> -2]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc, 100  ; continue inner loop while j + 1 < 100
+  br i1 %cmp2, label %for.body3, label %for.inc12
+
+for.inc12:                                        ; preds = %for.body3
+  %inc13 = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc13, 100  ; continue outer loop while i + 1 < 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end14
+
+for.end14:                                        ; preds = %for.inc12
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[2*i + 4][-5*i + j + 2] = ...
+;;      ... = A[-2*i + 20][5*i + j];
+
+define void @prop7([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; two nested loops, i and j each run 0..99; body stores to A, reloads from A, copies the load to *B++
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc14, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc14 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc15, %for.inc14 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; running output pointer into B
+  %conv = trunc i64 %i.03 to i32  ; value written to A is i truncated to i32
+  %mul = mul nsw i64 %i.03, -5  ; -5*i
+  %add = add nsw i64 %mul, %j.02  ; -5*i + j
+  %add4 = add nsw i64 %add, 2  ; -5*i + j + 2
+  %mul5 = shl nsw i64 %i.03, 1  ; 2*i
+  %add6 = add nsw i64 %mul5, 4  ; 2*i + 4
+  %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %add6, i64 %add4  ; &A[2*i + 4][-5*i + j + 2]
+  store i32 %conv, i32* %arrayidx7, align 4
+  %mul8 = mul nsw i64 %i.03, 5  ; 5*i
+  %add9 = add nsw i64 %mul8, %j.02  ; 5*i + j
+  %mul10 = mul nsw i64 %i.03, -2  ; -2*i
+  %add11 = add nsw i64 %mul10, 20  ; -2*i + 20
+  %arrayidx13 = getelementptr inbounds [100 x i32]* %A, i64 %add11, i64 %add9  ; &A[-2*i + 20][5*i + j]
+  %0 = load i32* %arrayidx13, align 4
+; CHECK: da analyze - flow [* -38] splitable!
+; CHECK: da analyze - split level = 1, iteration = 4!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc, 100  ; continue inner loop while j + 1 < 100
+  br i1 %cmp2, label %for.body3, label %for.inc14
+
+for.inc14:                                        ; preds = %for.body3
+  %inc15 = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc15, 100  ; continue outer loop while i + 1 < 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end16
+
+for.end16:                                        ; preds = %for.inc14
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[4][j + 2] = ...
+;;      ... = A[-2*i + 4][5*i + j];
+
+define void @prop8([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; two nested loops, i and j each run 0..99; the store's row index is the constant 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc10, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc10 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc11, %for.inc10 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; running output pointer into B
+  %conv = trunc i64 %i.03 to i32  ; value written to A is i truncated to i32
+  %add = add nsw i64 %j.02, 2  ; j + 2
+  %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 4, i64 %add  ; &A[4][j + 2]
+  store i32 %conv, i32* %arrayidx4, align 4
+  %mul = mul nsw i64 %i.03, 5  ; 5*i
+  %add5 = add nsw i64 %mul, %j.02  ; 5*i + j
+  %mul6 = mul nsw i64 %i.03, -2  ; -2*i
+  %add7 = add nsw i64 %mul6, 4  ; -2*i + 4
+  %arrayidx9 = getelementptr inbounds [100 x i32]* %A, i64 %add7, i64 %add5  ; &A[-2*i + 4][5*i + j]
+  %0 = load i32* %arrayidx9, align 4
+; CHECK: da analyze - flow [p<= 2]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc, 100  ; continue inner loop while j + 1 < 100
+  br i1 %cmp2, label %for.body3, label %for.inc10
+
+for.inc10:                                        ; preds = %for.body3
+  %inc11 = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc11, 100  ; continue outer loop while i + 1 < 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end12
+
+for.end12:                                        ; preds = %for.inc10
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 100; i++)
+;;    for (long int j = 0; j < 100; j++)
+;;      A[2*i + 4][5*i + j + 2] = ...
+;;      ... = A[4][j];
+
+define void @prop9([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; two nested loops, i and j each run 0..99; the load's row index is the constant 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc10, %entry
+  %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc10 ]
+  %i.03 = phi i64 [ 0, %entry ], [ %inc11, %for.inc10 ]  ; outer induction variable i
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]  ; inner induction variable j
+  %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; running output pointer into B
+  %conv = trunc i64 %i.03 to i32  ; value written to A is i truncated to i32
+  %mul = mul nsw i64 %i.03, 5  ; 5*i
+  %add = add nsw i64 %mul, %j.02  ; 5*i + j
+  %add4 = add nsw i64 %add, 2  ; 5*i + j + 2
+  %mul5 = shl nsw i64 %i.03, 1  ; 2*i
+  %add6 = add nsw i64 %mul5, 4  ; 2*i + 4
+  %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %add6, i64 %add4  ; &A[2*i + 4][5*i + j + 2]
+  store i32 %conv, i32* %arrayidx7, align 4
+  %arrayidx9 = getelementptr inbounds [100 x i32]* %A, i64 4, i64 %j.02  ; &A[4][j]
+  %0 = load i32* %arrayidx9, align 4
+; CHECK: da analyze - flow [p<= 2]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.11, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc, 100  ; continue inner loop while j + 1 < 100
+  br i1 %cmp2, label %for.body3, label %for.inc10
+
+for.inc10:                                        ; preds = %for.body3
+  %inc11 = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc11, 100  ; continue outer loop while i + 1 < 100
+  br i1 %cmp, label %for.cond1.preheader, label %for.end12
+
+for.end12:                                        ; preds = %for.inc10
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Separability.ll b/test/Analysis/DependenceAnalysis/Separability.ll
new file mode 100644
index 0000000000000..d42d3cdb39e5e
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Separability.ll
@@ -0,0 +1,267 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'Separability.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long int i = 0; i < 50; i++)
+;;    for (long int j = 0; j < 50; j++)
+;;      for (long int k = 0; k < 50; k++)
+;;        for (long int l = 0; l < 50; l++)
+;;          A[n][i][j + k] = ...
+;;          ... = A[10][i + 10][2*j - l];
+
+define void @sep0([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; four nested loops, i, j, k and l each run 0..49; the store's first subscript is the argument n
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc22, %entry
+  %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc22 ]
+  %i.07 = phi i64 [ 0, %entry ], [ %inc23, %for.inc22 ]  ; loop level 1 induction variable i
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc19, %for.cond1.preheader
+  %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc19 ]
+  %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc20, %for.inc19 ]  ; loop level 2 induction variable j
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc16, %for.cond4.preheader
+  %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc16 ]
+  %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc17, %for.inc16 ]  ; loop level 3 induction variable k
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]  ; loop level 4 induction variable l
+  %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ]  ; running output pointer into B
+  %conv = trunc i64 %i.07 to i32  ; value written to A is i truncated to i32
+  %add = add nsw i64 %j.05, %k.03  ; j + k
+  %idxprom = sext i32 %n to i64  ; n sign-extended to i64 for use as a subscript
+  %arrayidx11 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %idxprom, i64 %i.07, i64 %add  ; &A[n][i][j + k]
+  store i32 %conv, i32* %arrayidx11, align 4
+  %mul = shl nsw i64 %j.05, 1  ; 2*j
+  %sub = sub nsw i64 %mul, %l.02  ; 2*j - l
+  %add12 = add nsw i64 %i.07, 10  ; i + 10
+  %arrayidx15 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 10, i64 %add12, i64 %sub  ; &A[10][i + 10][2*j - l]
+  %0 = load i32* %arrayidx15, align 4
+; CHECK: da analyze - flow [-10 * * *]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.31, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %l.02, 1
+  %cmp8 = icmp slt i64 %inc, 50  ; continue l loop while l + 1 < 50
+  br i1 %cmp8, label %for.body9, label %for.inc16
+
+for.inc16:                                        ; preds = %for.body9
+  %inc17 = add nsw i64 %k.03, 1
+  %cmp5 = icmp slt i64 %inc17, 50  ; continue k loop while k + 1 < 50
+  br i1 %cmp5, label %for.cond7.preheader, label %for.inc19
+
+for.inc19:                                        ; preds = %for.inc16
+  %inc20 = add nsw i64 %j.05, 1
+  %cmp2 = icmp slt i64 %inc20, 50  ; continue j loop while j + 1 < 50
+  br i1 %cmp2, label %for.cond4.preheader, label %for.inc22
+
+for.inc22:                                        ; preds = %for.inc19
+  %inc23 = add nsw i64 %i.07, 1
+  %cmp = icmp slt i64 %inc23, 50  ; continue i loop while i + 1 < 50
+  br i1 %cmp, label %for.cond1.preheader, label %for.end24
+
+for.end24:                                        ; preds = %for.inc22
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 50; i++)
+;;    for (long int j = 0; j < 50; j++)
+;;      for (long int k = 0; k < 50; k++)
+;;        for (long int l = 0; l < 50; l++)
+;;          A[i][i][j + k] = ...
+;;          ... = A[10][i + 10][2*j - l];
+
+define void @sep1([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; four nested loops, i, j, k and l each run 0..49; the store uses i for both of its first two subscripts
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc22, %entry
+  %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc22 ]
+  %i.07 = phi i64 [ 0, %entry ], [ %inc23, %for.inc22 ]  ; loop level 1 induction variable i
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc19, %for.cond1.preheader
+  %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc19 ]
+  %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc20, %for.inc19 ]  ; loop level 2 induction variable j
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc16, %for.cond4.preheader
+  %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc16 ]
+  %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc17, %for.inc16 ]  ; loop level 3 induction variable k
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]  ; loop level 4 induction variable l
+  %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ]  ; running output pointer into B
+  %conv = trunc i64 %i.07 to i32  ; value written to A is i truncated to i32
+  %add = add nsw i64 %j.05, %k.03  ; j + k
+  %arrayidx11 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.07, i64 %i.07, i64 %add  ; &A[i][i][j + k]
+  store i32 %conv, i32* %arrayidx11, align 4
+  %mul = shl nsw i64 %j.05, 1  ; 2*j
+  %sub = sub nsw i64 %mul, %l.02  ; 2*j - l
+  %add12 = add nsw i64 %i.07, 10  ; i + 10
+  %arrayidx15 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 10, i64 %add12, i64 %sub  ; &A[10][i + 10][2*j - l]
+  %0 = load i32* %arrayidx15, align 4
+; CHECK: da analyze - flow [> * * *]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.31, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %l.02, 1
+  %cmp8 = icmp slt i64 %inc, 50  ; continue l loop while l + 1 < 50
+  br i1 %cmp8, label %for.body9, label %for.inc16
+
+for.inc16:                                        ; preds = %for.body9
+  %inc17 = add nsw i64 %k.03, 1
+  %cmp5 = icmp slt i64 %inc17, 50  ; continue k loop while k + 1 < 50
+  br i1 %cmp5, label %for.cond7.preheader, label %for.inc19
+
+for.inc19:                                        ; preds = %for.inc16
+  %inc20 = add nsw i64 %j.05, 1
+  %cmp2 = icmp slt i64 %inc20, 50  ; continue j loop while j + 1 < 50
+  br i1 %cmp2, label %for.cond4.preheader, label %for.inc22
+
+for.inc22:                                        ; preds = %for.inc19
+  %inc23 = add nsw i64 %i.07, 1
+  %cmp = icmp slt i64 %inc23, 50  ; continue i loop while i + 1 < 50
+  br i1 %cmp, label %for.cond1.preheader, label %for.end24
+
+for.end24:                                        ; preds = %for.inc22
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 50; i++)
+;;    for (long int j = 0; j < 50; j++)
+;;      for (long int k = 0; k < 50; k++)
+;;        for (long int l = 0; l < 50; l++)
+;;          A[i][i][i + k][l] = ...
+;;          ... = A[10][i + 10][j + k][l + 10];
+
+define void @sep2([100 x [100 x [100 x i32]]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; four nested loops, i, j, k and l each run 0..49; A is accessed with four subscripts
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc26, %entry
+  %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc26 ]
+  %i.07 = phi i64 [ 0, %entry ], [ %inc27, %for.inc26 ]  ; loop level 1 induction variable i
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc23, %for.cond1.preheader
+  %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc23 ]
+  %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc24, %for.inc23 ]  ; loop level 2 induction variable j
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc20, %for.cond4.preheader
+  %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc20 ]
+  %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc21, %for.inc20 ]  ; loop level 3 induction variable k
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]  ; loop level 4 induction variable l
+  %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ]  ; running output pointer into B
+  %conv = trunc i64 %i.07 to i32  ; value written to A is i truncated to i32
+  %add = add nsw i64 %i.07, %k.03  ; i + k
+  %arrayidx12 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 %i.07, i64 %i.07, i64 %add, i64 %l.02  ; &A[i][i][i + k][l]
+  store i32 %conv, i32* %arrayidx12, align 4
+  %add13 = add nsw i64 %l.02, 10  ; l + 10
+  %add14 = add nsw i64 %j.05, %k.03  ; j + k
+  %add15 = add nsw i64 %i.07, 10  ; i + 10
+  %arrayidx19 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 10, i64 %add15, i64 %add14, i64 %add13  ; &A[10][i + 10][j + k][l + 10]
+  %0 = load i32* %arrayidx19, align 4
+; CHECK: da analyze - flow [> * * -10]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.31, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %l.02, 1
+  %cmp8 = icmp slt i64 %inc, 50  ; continue l loop while l + 1 < 50
+  br i1 %cmp8, label %for.body9, label %for.inc20
+
+for.inc20:                                        ; preds = %for.body9
+  %inc21 = add nsw i64 %k.03, 1
+  %cmp5 = icmp slt i64 %inc21, 50  ; continue k loop while k + 1 < 50
+  br i1 %cmp5, label %for.cond7.preheader, label %for.inc23
+
+for.inc23:                                        ; preds = %for.inc20
+  %inc24 = add nsw i64 %j.05, 1
+  %cmp2 = icmp slt i64 %inc24, 50  ; continue j loop while j + 1 < 50
+  br i1 %cmp2, label %for.cond4.preheader, label %for.inc26
+
+for.inc26:                                        ; preds = %for.inc23
+  %inc27 = add nsw i64 %i.07, 1
+  %cmp = icmp slt i64 %inc27, 50  ; continue i loop while i + 1 < 50
+  br i1 %cmp, label %for.cond1.preheader, label %for.end28
+
+for.end28:                                        ; preds = %for.inc26
+  ret void
+}
+
+
+;;  for (long int i = 0; i < 50; i++)
+;;    for (long int j = 0; j < 50; j++)
+;;      for (long int k = 0; k < 50; k++)
+;;        for (long int l = 0; l < 50; l++)
+;;          A[i][i][i + k][l + k] = ...
+;;          ... = A[10][i + 10][j + k][l + 10];
+
+define void @sep3([100 x [100 x [100 x i32]]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; four nested loops, i, j, k and l each run 0..49; the store's last two subscripts both involve k
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc27, %entry
+  %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc27 ]
+  %i.07 = phi i64 [ 0, %entry ], [ %inc28, %for.inc27 ]  ; loop level 1 induction variable i
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc24, %for.cond1.preheader
+  %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc24 ]
+  %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc25, %for.inc24 ]  ; loop level 2 induction variable j
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc21, %for.cond4.preheader
+  %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc21 ]
+  %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc22, %for.inc21 ]  ; loop level 3 induction variable k
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]  ; loop level 4 induction variable l
+  %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ]  ; running output pointer into B
+  %conv = trunc i64 %i.07 to i32  ; value written to A is i truncated to i32
+  %add = add nsw i64 %l.02, %k.03  ; l + k
+  %add10 = add nsw i64 %i.07, %k.03  ; i + k
+  %arrayidx13 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 %i.07, i64 %i.07, i64 %add10, i64 %add  ; &A[i][i][i + k][l + k]
+  store i32 %conv, i32* %arrayidx13, align 4
+  %add14 = add nsw i64 %l.02, 10  ; l + 10
+  %add15 = add nsw i64 %j.05, %k.03  ; j + k
+  %add16 = add nsw i64 %i.07, 10  ; i + 10
+  %arrayidx20 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 10, i64 %add16, i64 %add15, i64 %add14  ; &A[10][i + 10][j + k][l + 10]
+  %0 = load i32* %arrayidx20, align 4
+; CHECK: da analyze - flow [> * * *]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.31, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %l.02, 1
+  %cmp8 = icmp slt i64 %inc, 50  ; continue l loop while l + 1 < 50
+  br i1 %cmp8, label %for.body9, label %for.inc21
+
+for.inc21:                                        ; preds = %for.body9
+  %inc22 = add nsw i64 %k.03, 1
+  %cmp5 = icmp slt i64 %inc22, 50  ; continue k loop while k + 1 < 50
+  br i1 %cmp5, label %for.cond7.preheader, label %for.inc24
+
+for.inc24:                                        ; preds = %for.inc21
+  %inc25 = add nsw i64 %j.05, 1
+  %cmp2 = icmp slt i64 %inc25, 50  ; continue j loop while j + 1 < 50
+  br i1 %cmp2, label %for.cond4.preheader, label %for.inc27
+
+for.inc27:                                        ; preds = %for.inc24
+  %inc28 = add nsw i64 %i.07, 1
+  %cmp = icmp slt i64 %inc28, 50  ; continue i loop while i + 1 < 50
+  br i1 %cmp, label %for.cond1.preheader, label %for.end29
+
+for.end29:                                        ; preds = %for.inc27
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/StrongSIV.ll b/test/Analysis/DependenceAnalysis/StrongSIV.ll
new file mode 100644
index 0000000000000..be336c3580ceb
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/StrongSIV.ll
@@ -0,0 +1,342 @@
+; RUN: opt < %s -analyze -basicaa -indvars -da | FileCheck %s
+
+; ModuleID = 'StrongSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (int i = 0; i < n; i++)      // n is 64-bit here (i64 %n); i is 32-bit
+;;    A[i + 2] = ...
+;;    ... = A[i];
+
+define void @strong0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop with a 32-bit induction variable compared against the 64-bit bound n
+  %cmp1 = icmp sgt i64 %n, 0  ; enter the loop only when n > 0
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; induction variable i (i32)
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]  ; running output pointer into B
+  %add = add nsw i32 %i.03, 2  ; i + 2
+  %idxprom = sext i32 %add to i64  ; subscript widened to i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom  ; &A[i + 2]
+  store i32 %i.03, i32* %arrayidx, align 4
+  %idxprom2 = sext i32 %i.03 to i64  ; subscript widened to i64
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2  ; &A[i]
+  %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - consistent flow [2]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.02, align 4  ; *B = value loaded from A
+  %inc = add nsw i32 %i.03, 1
+  %conv = sext i32 %inc to i64  ; widen i + 1 for the 64-bit compare
+  %cmp = icmp slt i64 %conv, %n  ; continue while i + 1 < n (signed)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n; i++)
+;;    A[i + 2] = ...
+;;    ... = A[i];
+
+define void @strong1(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; single loop with a 64-bit induction variable; the 32-bit bound n is widened once up front
+  %conv = sext i32 %n to i64  ; n sign-extended to i64 for the loop compare
+  %cmp1 = icmp sgt i32 %n, 0  ; enter the loop only when n > 0
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]  ; induction variable i (i64)
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]  ; running output pointer into B
+  %conv2 = trunc i64 %i.03 to i32  ; value written to A is i truncated to i32
+  %add = add nsw i64 %i.03, 2  ; i + 2
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[i + 2]
+  store i32 %conv2, i32* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %i.03  ; &A[i]
+  %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - consistent flow [2]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.02, align 4  ; *B = value loaded from A
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp slt i64 %inc, %conv  ; continue while i + 1 < n (signed)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[i + 2] = ...
+;;    ... = A[i];
+
+define void @strong2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop with an unsigned 64-bit bound: arithmetic carries no nsw flags and the compare is ult
+  %cmp1 = icmp eq i64 %n, 0  ; skip the loop entirely when n == 0
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]  ; induction variable i (i64)
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]  ; running output pointer into B
+  %conv = trunc i64 %i.03 to i32  ; value written to A is i truncated to i32
+  %add = add i64 %i.03, 2  ; i + 2
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[i + 2]
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.03  ; &A[i]
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - consistent flow [2]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.02, align 4  ; *B = value loaded from A
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n  ; continue while i + 1 < n (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (int i = 0; i < n; i++)      // n is 32-bit here (i32 %n), same width as i
+;;    A[i + 2] = ...
+;;    ... = A[i];
+
+define void @strong3(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:  ; single loop where the induction variable and the bound n are both 32-bit
+  %cmp1 = icmp sgt i32 %n, 0  ; enter the loop only when n > 0
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; induction variable i (i32)
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]  ; running output pointer into B
+  %add = add nsw i32 %i.03, 2  ; i + 2
+  %idxprom = sext i32 %add to i64  ; subscript widened to i64
+  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom  ; &A[i + 2]
+  store i32 %i.03, i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %i.03 to i64  ; subscript widened to i64
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1  ; &A[i]
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - consistent flow [2]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.02, align 4  ; *B = value loaded from A
+  %inc = add nsw i32 %i.03, 1
+  %cmp = icmp slt i32 %inc, %n  ; continue while i + 1 < n (signed, 32-bit)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 19; i++)
+;;    A[i + 19] = ...
+;;    ... = A[i];
+
+define void @strong4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop, i runs 0..18: stores touch A[19..37], loads touch A[0..18]
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]  ; induction variable i
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]  ; running output pointer into B
+  %conv = trunc i64 %i.02 to i32  ; value written to A is i truncated to i32
+  %add = add i64 %i.02, 19  ; i + 19
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[i + 19]
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02  ; &A[i]
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.01, align 4  ; *B = value loaded from A
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 19  ; continue while i + 1 < 19 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 20; i++)
+;;    A[i + 19] = ...
+;;    ... = A[i];
+
+define void @strong5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop, i runs 0..19: stores touch A[19..38], loads touch A[0..19], so A[19] is shared
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]  ; induction variable i
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]  ; running output pointer into B
+  %conv = trunc i64 %i.02 to i32  ; value written to A is i truncated to i32
+  %add = add i64 %i.02, 19  ; i + 19
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[i + 19]
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02  ; &A[i]
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - consistent flow [19]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.01, align 4  ; *B = value loaded from A
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 20  ; continue while i + 1 < 20 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 20; i++)
+;;    A[2*i + 6] = ...
+;;    ... = A[2*i];
+
+define void @strong6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop, i runs 0..19; both subscripts have stride 2 and their constant offsets differ by 6
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]  ; induction variable i
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]  ; running output pointer into B
+  %conv = trunc i64 %i.02 to i32  ; value written to A is i truncated to i32
+  %mul = shl i64 %i.02, 1  ; 2*i
+  %add = add i64 %mul, 6  ; 2*i + 6
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[2*i + 6]
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul1 = shl i64 %i.02, 1  ; 2*i
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %mul1  ; &A[2*i]
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - consistent flow [3]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.01, align 4  ; *B = value loaded from A
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 20  ; continue while i + 1 < 20 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 20; i++)
+;;    A[2*i + 7] = ...
+;;    ... = A[2*i];
+
+define void @strong7(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop, i runs 0..19; both subscripts have stride 2 and their constant offsets differ by 7
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]  ; induction variable i
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]  ; running output pointer into B
+  %conv = trunc i64 %i.02 to i32  ; value written to A is i truncated to i32
+  %mul = shl i64 %i.02, 1  ; 2*i
+  %add = add i64 %mul, 7  ; 2*i + 7
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[2*i + 7]
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul1 = shl i64 %i.02, 1  ; 2*i
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %mul1  ; &A[2*i]
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.01, align 4  ; *B = value loaded from A
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 20  ; continue while i + 1 < 20 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 20; i++)
+;;    A[i + n] = ...
+;;    ... = A[i];
+
+define void @strong8(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop, i runs 0..19; the store subscript is offset by the loop-invariant argument n
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]  ; induction variable i
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]  ; running output pointer into B
+  %conv = trunc i64 %i.02 to i32  ; value written to A is i truncated to i32
+  %add = add i64 %i.02, %n  ; i + n
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[i + n]
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02  ; &A[i]
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - consistent flow [%n|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.01, align 4  ; *B = value loaded from A
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 20  ; continue while i + 1 < 20 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[i + n] = ...
+;;    ... = A[i + 2*n];
+
+define void @strong9(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop bounded by n; store subscript is i + n, load subscript is i + 2*n
+  %cmp1 = icmp eq i64 %n, 0  ; skip the loop entirely when n == 0
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]  ; induction variable i
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]  ; running output pointer into B
+  %conv = trunc i64 %i.03 to i32  ; value written to A is i truncated to i32
+  %add = add i64 %i.03, %n  ; i + n
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[i + n]
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = shl i64 %n, 1  ; 2*n
+  %add1 = add i64 %i.03, %mul  ; i + 2*n
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %add1  ; &A[i + 2*n]
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.02, align 4  ; *B = value loaded from A
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n  ; continue while i + 1 < n (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 1000; i++)
+;;    A[n*i + 5] = ...
+;;    ... = A[n*i + 5];
+
+define void @strong10(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:  ; single loop, i runs 0..999; store and load use the same subscript n*i + 5 (symbolic stride n)
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]  ; induction variable i
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]  ; running output pointer into B
+  %conv = trunc i64 %i.02 to i32  ; value written to A is i truncated to i32
+  %mul = mul i64 %i.02, %n  ; n*i
+  %add = add i64 %mul, 5  ; n*i + 5
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add  ; &A[n*i + 5]
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul1 = mul i64 %i.02, %n  ; n*i (recomputed for the load)
+  %add2 = add i64 %mul1, 5  ; n*i + 5
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %add2  ; &A[n*i + 5]
+  %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - consistent flow [0|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1  ; B + 1 for the next iteration
+  store i32 %0, i32* %B.addr.01, align 4  ; *B = value loaded from A
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 1000  ; continue while i + 1 < 1000 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll b/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
new file mode 100644
index 0000000000000..2a1b4e7e971df
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
@@ -0,0 +1,312 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'SymbolicRDIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long int i = 0; i < n1; i++)
+;;    A[2*i + n1] = ...
+;;  for (long int j = 0; j < n2; j++)
+;;    ... = A[3*j + 3*n1];
+
+define void @symbolicrdiv0(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+  %cmp4 = icmp eq i64 %n1, 0  ; n1 == 0: bypass the first loop
+  br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader:                              ; preds = %for.body, %entry
+  %cmp21 = icmp eq i64 %n2, 0  ; n2 == 0: bypass the second loop
+  br i1 %cmp21, label %for.end11, label %for.body4
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %conv = trunc i64 %i.05 to i32
+  %mul = shl nsw i64 %i.05, 1  ; 2*i
+  %add = add i64 %mul, %n1  ; 2*i + n1
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[2*i + n1] = (i32)i
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp ult i64 %inc, %n1
+  br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
+  %j.03 = phi i64 [ %inc10, %for.body4 ], [ 0, %for.cond1.preheader ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+  %mul56 = add i64 %j.03, %n1  ; j + n1 (an add, despite the "mul" in the name)
+  %add7 = mul i64 %mul56, 3  ; 3*(j + n1), the factored form of 3*j + 3*n1
+  %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7
+  %0 = load i32* %arrayidx8, align 4  ; read A[3*j + 3*n1]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc10 = add nsw i64 %j.03, 1
+  %cmp2 = icmp ult i64 %inc10, %n2
+  br i1 %cmp2, label %for.body4, label %for.end11
+
+for.end11:                                        ; preds = %for.body4, %for.cond1.preheader
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n1; i++)
+;;    A[2*i + 5*n2] = ...
+;;  for (long int j = 0; j < n2; j++)
+;;    ... = A[3*j + 2*n2];
+
+define void @symbolicrdiv1(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+  %cmp4 = icmp eq i64 %n1, 0  ; n1 == 0: bypass the first loop
+  br i1 %cmp4, label %for.cond2.preheader, label %for.body
+
+for.cond2.preheader:                              ; preds = %for.body, %entry
+  %cmp31 = icmp eq i64 %n2, 0  ; n2 == 0: bypass the second loop
+  br i1 %cmp31, label %for.end12, label %for.body5
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %conv = trunc i64 %i.05 to i32
+  %mul = shl nsw i64 %i.05, 1  ; 2*i
+  %mul1 = mul i64 %n2, 5  ; 5*n2
+  %add = add i64 %mul, %mul1  ; 2*i + 5*n2
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[2*i + 5*n2] = (i32)i
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp ult i64 %inc, %n1
+  br i1 %cmp, label %for.body, label %for.cond2.preheader
+
+for.body5:                                        ; preds = %for.body5, %for.cond2.preheader
+  %j.03 = phi i64 [ %inc11, %for.body5 ], [ 0, %for.cond2.preheader ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body5 ], [ %B, %for.cond2.preheader ]
+  %mul6 = mul nsw i64 %j.03, 3  ; 3*j
+  %mul7 = shl i64 %n2, 1  ; 2*n2
+  %add8 = add i64 %mul6, %mul7  ; 3*j + 2*n2
+  %arrayidx9 = getelementptr inbounds i32* %A, i64 %add8
+  %0 = load i32* %arrayidx9, align 4  ; read A[3*j + 2*n2]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc11 = add nsw i64 %j.03, 1
+  %cmp3 = icmp ult i64 %inc11, %n2
+  br i1 %cmp3, label %for.body5, label %for.end12
+
+for.end12:                                        ; preds = %for.body5, %for.cond2.preheader
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n1; i++)
+;;    A[2*i - n2] = ...
+;;  for (long int j = 0; j < n2; j++)
+;;    ... = A[-j + 2*n1];
+
+define void @symbolicrdiv2(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+  %cmp4 = icmp eq i64 %n1, 0  ; n1 == 0: bypass the first loop
+  br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader:                              ; preds = %for.body, %entry
+  %cmp21 = icmp eq i64 %n2, 0  ; n2 == 0: bypass the second loop
+  br i1 %cmp21, label %for.end10, label %for.body4
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %conv = trunc i64 %i.05 to i32
+  %mul = shl nsw i64 %i.05, 1  ; 2*i
+  %sub = sub i64 %mul, %n2  ; 2*i - n2
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4  ; A[2*i - n2] = (i32)i
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp ult i64 %inc, %n1
+  br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
+  %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+  %mul6 = shl i64 %n1, 1  ; 2*n1
+  %add = sub i64 %mul6, %j.03  ; 2*n1 - j (a sub, despite the "add" name)
+  %arrayidx7 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx7, align 4  ; read A[-j + 2*n1]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc9 = add nsw i64 %j.03, 1
+  %cmp2 = icmp ult i64 %inc9, %n2
+  br i1 %cmp2, label %for.body4, label %for.end10
+
+for.end10:                                        ; preds = %for.body4, %for.cond1.preheader
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n1; i++)
+;;    A[-i + n2] = ...
+;;  for (long int j = 0; j < n2; j++)
+;;    ... = A[j - n1];
+
+define void @symbolicrdiv3(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+  %cmp4 = icmp eq i64 %n1, 0  ; n1 == 0: bypass the first loop
+  br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader:                              ; preds = %for.body, %entry
+  %cmp21 = icmp eq i64 %n2, 0  ; n2 == 0: bypass the second loop
+  br i1 %cmp21, label %for.end9, label %for.body4
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %conv = trunc i64 %i.05 to i32
+  %add = sub i64 %n2, %i.05  ; n2 - i (a sub, despite the "add" name)
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[-i + n2] = (i32)i
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp ult i64 %inc, %n1
+  br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
+  %j.03 = phi i64 [ %inc8, %for.body4 ], [ 0, %for.cond1.preheader ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+  %sub5 = sub i64 %j.03, %n1  ; j - n1
+  %arrayidx6 = getelementptr inbounds i32* %A, i64 %sub5
+  %0 = load i32* %arrayidx6, align 4  ; read A[j - n1]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc8 = add nsw i64 %j.03, 1
+  %cmp2 = icmp ult i64 %inc8, %n2
+  br i1 %cmp2, label %for.body4, label %for.end9
+
+for.end9:                                         ; preds = %for.body4, %for.cond1.preheader
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n1; i++)
+;;    A[-i + 2*n1] = ...
+;;  for (long int j = 0; j < n2; j++)
+;;    ... = A[-j + n1];
+
+define void @symbolicrdiv4(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+  %cmp4 = icmp eq i64 %n1, 0  ; n1 == 0: bypass the first loop
+  br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader:                              ; preds = %for.body, %entry
+  %cmp21 = icmp eq i64 %n2, 0  ; n2 == 0: bypass the second loop
+  br i1 %cmp21, label %for.end10, label %for.body4
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %conv = trunc i64 %i.05 to i32
+  %mul = shl i64 %n1, 1  ; 2*n1
+  %add = sub i64 %mul, %i.05  ; 2*n1 - i (a sub, despite the "add" name)
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[-i + 2*n1] = (i32)i
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp ult i64 %inc, %n1
+  br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
+  %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+  %add6 = sub i64 %n1, %j.03  ; n1 - j (a sub, despite the "add" name)
+  %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6
+  %0 = load i32* %arrayidx7, align 4  ; read A[-j + n1]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc9 = add nsw i64 %j.03, 1
+  %cmp2 = icmp ult i64 %inc9, %n2
+  br i1 %cmp2, label %for.body4, label %for.end10
+
+for.end10:                                        ; preds = %for.body4, %for.cond1.preheader
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n1; i++)
+;;    A[-i + n2] = ...
+;;  for (long int j = 0; j < n2; j++)
+;;    ... = A[-j + 2*n2];
+
+define void @symbolicrdiv5(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+  %cmp4 = icmp eq i64 %n1, 0  ; n1 == 0: bypass the first loop
+  br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader:                              ; preds = %for.body, %entry
+  %cmp21 = icmp eq i64 %n2, 0  ; n2 == 0: bypass the second loop
+  br i1 %cmp21, label %for.end10, label %for.body4
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %conv = trunc i64 %i.05 to i32
+  %add = sub i64 %n2, %i.05  ; n2 - i (a sub, despite the "add" name)
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[-i + n2] = (i32)i
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp ult i64 %inc, %n1
+  br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
+  %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+  %mul = shl i64 %n2, 1  ; 2*n2
+  %add6 = sub i64 %mul, %j.03  ; 2*n2 - j (a sub, despite the "add" name)
+  %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6
+  %0 = load i32* %arrayidx7, align 4  ; read A[-j + 2*n2]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc9 = add nsw i64 %j.03, 1
+  %cmp2 = icmp ult i64 %inc9, %n2
+  br i1 %cmp2, label %for.body4, label %for.end10
+
+for.end10:                                        ; preds = %for.body4, %for.cond1.preheader
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n1; i++)
+;;    for (long int j = 0; j < n2; j++)
+;;      A[j - i + n2] = ...
+;;      ... = A[2*n2];
+
+define void @symbolicrdiv6(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+  %cmp4 = icmp eq i64 %n1, 0  ; n1 == 0: skip the whole loop nest
+  br i1 %cmp4, label %for.end7, label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc5, %entry
+  %B.addr.06 = phi i32* [ %B.addr.1.lcssa, %for.inc5 ], [ %B, %entry ]
+  %i.05 = phi i64 [ %inc6, %for.inc5 ], [ 0, %entry ]
+  %cmp21 = icmp eq i64 %n2, 0  ; n2 == 0: skip the inner loop for this i
+  br i1 %cmp21, label %for.inc5, label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %j.03 = phi i64 [ %inc, %for.body3 ], [ 0, %for.cond1.preheader ]
+  %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.06, %for.cond1.preheader ]
+  %conv = trunc i64 %i.05 to i32  ; stored value is the outer index i, truncated to 32 bits
+  %sub = sub nsw i64 %j.03, %i.05  ; j - i
+  %add = add i64 %sub, %n2  ; j - i + n2
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[j - i + n2] = (i32)i
+  %mul = shl i64 %n2, 1  ; 2*n2 (uses neither i nor j)
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 %mul
+  %0 = load i32* %arrayidx4, align 4  ; read A[2*n2]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+  store i32 %0, i32* %B.addr.12, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %j.03, 1
+  %cmp2 = icmp ult i64 %inc, %n2
+  br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5:                                         ; preds = %for.body3, %for.cond1.preheader
+  %B.addr.1.lcssa = phi i32* [ %B.addr.06, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]  ; B after the inner loop; unchanged when the inner loop was skipped
+  %inc6 = add nsw i64 %i.05, 1
+  %cmp = icmp ult i64 %inc6, %n1
+  br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7:                                         ; preds = %for.inc5, %entry
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/SymbolicSIV.ll b/test/Analysis/DependenceAnalysis/SymbolicSIV.ll
new file mode 100644
index 0000000000000..ee2343fa51e92
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/SymbolicSIV.ll
@@ -0,0 +1,330 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'SymbolicSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long int i = 0; i < n; i++)
+;;    A[2*i + n] = ...
+;;    ... = A[3*i + 3*n];
+
+define void @symbolicsiv0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %add = add i64 %mul, %n  ; 2*i + n
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[2*i + n] = (i32)i
+  %mul14 = add i64 %i.03, %n  ; i + n (an add, despite the "mul" in the name)
+  %add3 = mul i64 %mul14, 3  ; 3*(i + n), the factored form of 3*i + 3*n
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 %add3
+  %0 = load i32* %arrayidx4, align 4  ; read A[3*i + 3*n]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n; i++)
+;;    A[2*i + 5*n] = ...
+;;    ... = A[3*i + 2*n];
+
+define void @symbolicsiv1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %mul1 = mul i64 %n, 5  ; 5*n
+  %add = add i64 %mul, %mul1  ; 2*i + 5*n
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[2*i + 5*n] = (i32)i
+  %mul2 = mul nsw i64 %i.03, 3  ; 3*i
+  %mul3 = shl i64 %n, 1  ; 2*n
+  %add4 = add i64 %mul2, %mul3  ; 3*i + 2*n
+  %arrayidx5 = getelementptr inbounds i32* %A, i64 %add4
+  %0 = load i32* %arrayidx5, align 4  ; read A[3*i + 2*n]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n; i++)
+;;    A[2*i - n] = ...
+;;    ... = A[-i + 2*n];
+
+define void @symbolicsiv2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl nsw i64 %i.03, 1  ; 2*i
+  %sub = sub i64 %mul, %n  ; 2*i - n
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4  ; A[2*i - n] = (i32)i
+  %mul2 = shl i64 %n, 1  ; 2*n
+  %add = sub i64 %mul2, %i.03  ; 2*n - i (a sub, despite the "add" name)
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx3, align 4  ; read A[-i + 2*n]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n; i++)
+;;    A[-2*i + n + 1] = ...
+;;    ... = A[i - 2*n];
+
+define void @symbolicsiv3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, -2  ; -2*i
+  %add = add i64 %mul, %n  ; -2*i + n
+  %add1 = add i64 %add, 1  ; -2*i + n + 1
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add1
+  store i32 %conv, i32* %arrayidx, align 4  ; A[-2*i + n + 1] = (i32)i
+  %mul2 = shl i64 %n, 1  ; 2*n
+  %sub = sub i64 %i.03, %mul2  ; i - 2*n
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx3, align 4  ; read A[i - 2*n]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n; i++)
+;;    A[-2*i + 3*n] = ...
+;;    ... = A[-i + n];
+
+define void @symbolicsiv4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, -2  ; -2*i
+  %mul1 = mul i64 %n, 3  ; 3*n
+  %add = add i64 %mul, %mul1  ; -2*i + 3*n
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[-2*i + 3*n] = (i32)i
+  %add2 = sub i64 %n, %i.03  ; n - i (a sub, despite the "add" name)
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 %add2
+  %0 = load i32* %arrayidx3, align 4  ; read A[-i + n]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long int i = 0; i < n; i++)
+;;    A[-2*i - 2*n] = ...
+;;    ... = A[-i - n];
+
+define void @symbolicsiv5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul nsw i64 %i.03, -2  ; -2*i
+  %mul1 = shl i64 %n, 1  ; 2*n
+  %sub = sub i64 %mul, %mul1  ; -2*i - 2*n
+  %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+  store i32 %conv, i32* %arrayidx, align 4  ; A[-2*i - 2*n] = (i32)i
+  %sub2 = sub nsw i64 0, %i.03  ; -i
+  %sub3 = sub i64 %sub2, %n  ; -i - n
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 %sub3
+  %0 = load i32* %arrayidx4, align 4  ; read A[-i - n]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;; Why doesn't the SCEV package understand that n >= 0?
+;;void weaktest(int *A, int *B, long unsigned n)
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[i + n + 1] = ...
+;;    ... = A[-i];
+
+define void @weaktest(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %add = add i64 %i.03, %n  ; i + n
+  %add1 = add i64 %add, 1  ; i + n + 1
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add1
+  store i32 %conv, i32* %arrayidx, align 4  ; A[i + n + 1] = (i32)i
+  %sub = sub i64 0, %i.03  ; -i
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx2, align 4  ; read A[-i]
+; CHECK: da analyze - flow [*|<] splitable!
+; CHECK: da analyze - split level = 1, iteration = ((0 smax (-1 + (-1 * %n))) /u 2)!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  void symbolicsiv6(int *A, int *B, long unsigned n, long unsigned N, long unsigned M) {
+;;    for (long int i = 0; i < n; i++) {
+;;      A[4*N*i + M] = i;
+;;      *B++ = A[4*N*i + 3*M + 1];
+
+define void @symbolicsiv6(i32* %A, i32* %B, i64 %n, i64 %N, i64 %M) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %for.body.preheader ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl i64 %N, 2  ; 4*N
+  %mul1 = mul i64 %mul, %i.03  ; 4*N*i
+  %add = add i64 %mul1, %M  ; 4*N*i + M
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[4*N*i + M] = (i32)i
+  %mul2 = shl i64 %N, 2  ; 4*N again, recomputed for the load
+  %mul3 = mul i64 %mul2, %i.03  ; 4*N*i
+  %mul4 = mul i64 %M, 3  ; 3*M
+  %add5 = add i64 %mul3, %mul4  ; 4*N*i + 3*M
+  %add6 = add i64 %add5, 1  ; 4*N*i + 3*M + 1
+  %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6
+  %0 = load i32* %arrayidx7, align 4  ; read A[4*N*i + 3*M + 1]
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+; CHECK: da analyze - none!
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %i.03, 1
+  %exitcond = icmp ne i64 %inc, %n  ; keep looping until i + 1 == n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+
+;;  void symbolicsiv7(int *A, int *B, long unsigned n, long unsigned N, long unsigned M) {
+;;    for (long int i = 0; i < n; i++) {
+;;      A[2*N*i + M] = i;
+;;      *B++ = A[2*N*i - 3*M + 2];
+
+define void @symbolicsiv7(i32* %A, i32* %B, i64 %n, i64 %N, i64 %M) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %for.body.preheader ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = shl i64 %N, 1  ; 2*N
+  %mul1 = mul i64 %mul, %i.03  ; 2*N*i
+  %add = add i64 %mul1, %M  ; 2*N*i + M
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[2*N*i + M] = (i32)i
+  %mul2 = shl i64 %N, 1  ; 2*N again, recomputed for the load
+  %mul3 = mul i64 %mul2, %i.03  ; 2*N*i
+  %0 = mul i64 %M, -3  ; -3*M
+  %sub = add i64 %mul3, %0  ; 2*N*i - 3*M, written as an add of -3*M
+  %add5 = add i64 %sub, 2  ; 2*N*i - 3*M + 2
+  %arrayidx6 = getelementptr inbounds i32* %A, i64 %add5
+  %1 = load i32* %arrayidx6, align 4  ; read A[2*N*i - 3*M + 2]
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+; CHECK: da analyze - flow [<>]!
+  store i32 %1, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add nsw i64 %i.03, 1
+  %exitcond = icmp ne i64 %inc, %n  ; keep looping until i + 1 == n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll b/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll
new file mode 100644
index 0000000000000..343e8f49bf9e7
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll
@@ -0,0 +1,220 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'WeakCrossingSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[1 + n*i] = ...
+;;    ... = A[1 - n*i];
+
+define void @weakcrossing0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul i64 %i.03, %n  ; n*i
+  %add = add i64 %mul, 1  ; 1 + n*i
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[1 + n*i] = (i32)i
+  %mul1 = mul i64 %i.03, %n  ; n*i again, recomputed for the load
+  %sub = sub i64 1, %mul1  ; 1 - n*i
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx2, align 4  ; read A[1 - n*i]
+; CHECK: da analyze - flow [0|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[n + i] = ...
+;;    ... = A[1 + n - i];
+
+define void @weakcrossing1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %add = add i64 %i.03, %n  ; n + i
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4  ; A[n + i] = (i32)i
+  %add1 = add i64 %n, 1  ; 1 + n
+  %sub = sub i64 %add1, %i.03  ; 1 + n - i
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx2, align 4  ; read A[1 + n - i]
+; CHECK: da analyze - flow [<>] splitable!
+; CHECK: da analyze - split level = 1, iteration = 0!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 3; i++)
+;;    A[i] = ...
+;;    ... = A[6 - i];
+
+define void @weakcrossing2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 %i.02
+  store i32 %conv, i32* %arrayidx, align 4  ; A[i] = (i32)i
+  %sub = sub i64 6, %i.02  ; 6 - i
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx1, align 4  ; read A[6 - i]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4  ; *B++ = loaded value
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 3  ; continue while i + 1 < 3 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 4; i++)
+;;    A[i] = ...
+;;    ... = A[6 - i];
+
+define void @weakcrossing3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 %i.02
+  store i32 %conv, i32* %arrayidx, align 4  ; A[i] = (i32)i
+  %sub = sub i64 6, %i.02  ; 6 - i
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx1, align 4  ; read A[6 - i]
+; CHECK: da analyze - flow [0|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4  ; *B++ = loaded value
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 4  ; continue while i + 1 < 4 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 10; i++)
+;;    A[i] = ...
+;;    ... = A[-6 - i];
+
+define void @weakcrossing4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 %i.02
+  store i32 %conv, i32* %arrayidx, align 4  ; A[i] = (i32)i
+  %sub = sub i64 -6, %i.02  ; -6 - i
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx1, align 4  ; read A[-6 - i]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4  ; *B++ = loaded value
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 10  ; continue while i + 1 < 10 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[3*i] = ...
+;;    ... = A[5 - 3*i];
+
+define void @weakcrossing5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0  ; n == 0: skip the loop entirely
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul i64 %i.03, 3  ; 3*i
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4  ; A[3*i] = (i32)i
+  %0 = mul i64 %i.03, -3  ; -3*i
+  %sub = add i64 %0, 5  ; 5 - 3*i, written as an add of -3*i
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub
+  %1 = load i32* %arrayidx2, align 4  ; read A[5 - 3*i]
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %1, i32* %B.addr.02, align 4  ; *B++ = loaded value
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 4; i++)
+;;    A[i] = ...
+;;    ... = A[5 - i];
+
+define void @weakcrossing6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 %i.02
+  store i32 %conv, i32* %arrayidx, align 4  ; A[i] = (i32)i
+  %sub = sub i64 5, %i.02  ; 5 - i
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub
+  %0 = load i32* %arrayidx1, align 4  ; read A[5 - i]
+; CHECK: da analyze - flow [<>] splitable!
+; CHECK: da analyze - split level = 1, iteration = 2!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4  ; *B++ = loaded value
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 4  ; continue while i + 1 < 4 (unsigned)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll b/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll
new file mode 100644
index 0000000000000..a59871602b6cc
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll
@@ -0,0 +1,212 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'WeakZeroDstSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long unsigned i = 0; i < 30; i++)
+;;    A[2*i + 10] = ...
+;;    ... = A[10];
+
+define void @weakzerodst0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = shl i64 %i.02, 1
+  %add = add i64 %mul, 10
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [p<=|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 30
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[n*i + 10] = ...
+;;    ... = A[10];
+
+define void @weakzerodst1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul i64 %i.03, %n
+  %add = add i64 %mul, 10
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [p<=|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 5; i++)
+;;    A[2*i] = ...
+;;    ... = A[10];
+
+define void @weakzerodst2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = shl i64 %i.02, 1
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 5
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 6; i++)
+;;    A[2*i] = ...
+;;    ... = A[10];
+
+define void @weakzerodst3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = shl i64 %i.02, 1
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [=>p|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 6
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 7; i++)
+;;    A[2*i] = ...
+;;    ... = A[10];
+
+define void @weakzerodst4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = shl i64 %i.02, 1
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [*|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 7
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 7; i++)
+;;    A[2*i] = ...
+;;    ... = A[-10];
+
+define void @weakzerodst5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %mul = shl i64 %i.02, 1
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 -10
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 7
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[3*i] = ...
+;;    ... = A[10];
+
+define void @weakzerodst6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %mul = mul i64 %i.03, 3
+  %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+  store i32 %conv, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll b/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll
new file mode 100644
index 0000000000000..fd4f462695464
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll
@@ -0,0 +1,212 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'WeakZeroSrcSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  for (long unsigned i = 0; i < 30; i++)
+;;    A[10] = ...
+;;    ... = A[2*i + 10];
+
+define void @weakzerosrc0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 10
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = shl i64 %i.02, 1
+  %add = add i64 %mul, 10
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [p<=|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 30
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[10] = ...
+;;    ... = A[n*i + 10];
+
+define void @weakzerosrc1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 10
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = mul i64 %i.03, %n
+  %add = add i64 %mul, 10
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [p<=|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 5; i++)
+;;    A[10] = ...
+;;    ... = A[2*i];
+
+define void @weakzerosrc2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 10
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = shl i64 %i.02, 1
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 5
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 6; i++)
+;;    A[10] = ...
+;;    ... = A[2*i];
+
+define void @weakzerosrc3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 10
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = shl i64 %i.02, 1
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [=>p|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 6
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 7; i++)
+;;    A[10] = ...
+;;    ... = A[2*i];
+
+define void @weakzerosrc4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 10
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = shl i64 %i.02, 1
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [*|<]!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 7
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < 7; i++)
+;;    A[-10] = ...
+;;    ... = A[2*i];
+
+define void @weakzerosrc5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %conv = trunc i64 %i.02 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 -10
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = shl i64 %i.02, 1
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+  store i32 %0, i32* %B.addr.01, align 4
+  %inc = add i64 %i.02, 1
+  %cmp = icmp ult i64 %inc, 7
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;;  for (long unsigned i = 0; i < n; i++)
+;;    A[10] = ...
+;;    ... = A[3*i];
+
+define void @weakzerosrc6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %cmp1 = icmp eq i64 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %conv = trunc i64 %i.03 to i32
+  %arrayidx = getelementptr inbounds i32* %A, i64 10
+  store i32 %conv, i32* %arrayidx, align 4
+  %mul = mul i64 %i.03, 3
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+  store i32 %0, i32* %B.addr.02, align 4
+  %inc = add i64 %i.03, 1
+  %cmp = icmp ult i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/ZIV.ll b/test/Analysis/DependenceAnalysis/ZIV.ll
new file mode 100644
index 0000000000000..42b2389df268b
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/ZIV.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'ZIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;;  A[n + 1] = ...
+;;  ... = A[1 + n];
+
+define void @z0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %add = add i64 %n, 1
+  %arrayidx = getelementptr inbounds i32* %A, i64 %add
+  store i32 0, i32* %arrayidx, align 4
+  %add1 = add i64 %n, 1
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %add1
+  %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - consistent flow!
+  store i32 %0, i32* %B, align 4
+  ret void
+}
+
+
+;;  A[n] = ...
+;;  ... = A[n + 1];
+
+define void @z1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+  %arrayidx = getelementptr inbounds i32* %A, i64 %n
+  store i32 0, i32* %arrayidx, align 4
+  %add = add i64 %n, 1
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+  store i32 %0, i32* %B, align 4
+  ret void
+}
+
+
+;;  A[n] = ...
+;;  ... = A[m];
+
+define void @z2(i32* %A, i32* %B, i64 %n, i64 %m) nounwind uwtable ssp {
+entry:
+  %arrayidx = getelementptr inbounds i32* %A, i64 %n
+  store i32 0, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 %m
+  %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow!
+  store i32 %0, i32* %B, align 4
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/lit.local.cfg b/test/Analysis/DependenceAnalysis/lit.local.cfg
new file mode 100644
index 0000000000000..c6106e4746f2d
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll']
diff --git a/test/Analysis/LoopDependenceAnalysis/alias.ll b/test/Analysis/LoopDependenceAnalysis/alias.ll
deleted file mode 100644
index 78d0bf4fee1ab..0000000000000
--- a/test/Analysis/LoopDependenceAnalysis/alias.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-;; x[5] = x[6] // with x being a pointer passed as argument
-
-define void @f1(i32* nocapture %xptr) nounwind {
-entry:
-  %x.ld.addr = getelementptr i32* %xptr, i64 6
-  %x.st.addr = getelementptr i32* %xptr, i64 5
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %x = load i32* %x.ld.addr
-  store i32 %x, i32* %x.st.addr
-; CHECK: 0,1: dep
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; x[5] = x[6] // with x being an array on the stack
-
-define void @foo(...) nounwind {
-entry:
-  %xptr = alloca [256 x i32], align 4
-  %x.ld.addr = getelementptr [256 x i32]* %xptr, i64 0, i64 6
-  %x.st.addr = getelementptr [256 x i32]* %xptr, i64 0, i64 5
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %x = load i32* %x.ld.addr
-  store i32 %x, i32* %x.st.addr
-; CHECK: 0,1: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/test/Analysis/LoopDependenceAnalysis/siv-strong.ll b/test/Analysis/LoopDependenceAnalysis/siv-strong.ll
deleted file mode 100644
index 401e466d6669d..0000000000000
--- a/test/Analysis/LoopDependenceAnalysis/siv-strong.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-@x = common global [256 x i32] zeroinitializer, align 4
-@y = common global [256 x i32] zeroinitializer, align 4
-
-;; for (i = 0; i < 256; i++)
-;;   x[i] = x[i] + y[i]
-
-define void @f1(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
-  %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
-  %x = load i32* %x.addr      ; 0
-  %y = load i32* %y.addr      ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.addr  ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; for (i = 0; i < 256; i++)
-;;   x[i+1] = x[i] + y[i]
-
-define void @f2(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
-  %i.next = add i64 %i, 1
-  %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.next
-  %x = load i32* %x.ld.addr     ; 0
-  %y = load i32* %y.ld.addr     ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; for (i = 0; i < 10; i++)
-;;   x[i+20] = x[i] + y[i]
-
-define void @f3(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
-  %i.20 = add i64 %i, 20
-  %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.20
-  %x = load i32* %x.ld.addr     ; 0
-  %y = load i32* %y.ld.addr     ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 10
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; for (i = 0; i < 10; i++)
-;;   x[10*i+1] = x[10*i] + y[i]
-
-define void @f4(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %i.10 = mul i64 %i, 10
-  %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i.10
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.10
-  %i.10.1 = add i64 %i.10, 1
-  %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.10.1
-  %x = load i32* %x.ld.addr     ; 0
-  %y = load i32* %y.ld.addr     ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 10
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll b/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll
deleted file mode 100644
index 9d0128c5fec4a..0000000000000
--- a/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-@x = common global [256 x i32] zeroinitializer, align 4
-@y = common global [256 x i32] zeroinitializer, align 4
-
-;; for (i = 0; i < 256; i++)
-;;   x[i] = x[255 - i] + y[i]
-
-define void @f1(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %i.255 = sub i64 255, %i
-  %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.255
-  %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
-  %x = load i32* %x.ld.addr     ; 0
-  %y = load i32* %y.ld.addr     ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; for (i = 0; i < 100; i++)
-;;   x[i] = x[255 - i] + y[i]
-
-define void @f2(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %i.255 = sub i64 255, %i
-  %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.255
-  %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
-  %x = load i32* %x.ld.addr     ; 0
-  %y = load i32* %y.ld.addr     ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 100
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; // the first iteration (i=0) leads to an out-of-bounds access of x. as the
-;; // result of this access is undefined, _any_ dependence result is safe.
-;; for (i = 0; i < 256; i++)
-;;   x[i] = x[256 - i] + y[i]
-
-define void @f3(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %i.256 = sub i64 0, %i
-  %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 1, i64 %i.256
-  %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
-  %x = load i32* %x.ld.addr     ; 0
-  %y = load i32* %y.ld.addr     ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2:
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; // slightly contrived but valid IR for the following loop, where all
-;; // accesses in all iterations are within bounds. while this example's first
-;; // (ZIV-)subscript is (0, 1), accesses are dependent.
-;; for (i = 1; i < 256; i++)
-;;   x[i] = x[256 - i] + y[i]
-
-define void @f4(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %i.1 = add i64 1, %i
-  %i.256 = sub i64 -1, %i
-  %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i.1
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 1, i64 %i.256
-  %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.1
-  %x = load i32* %x.ld.addr     ; 0
-  %y = load i32* %y.ld.addr     ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll b/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll
deleted file mode 100644
index 1c5ae4c490e34..0000000000000
--- a/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-@x = common global [256 x i32] zeroinitializer, align 4
-@y = common global [256 x i32] zeroinitializer, align 4
-
-;; for (i = 0; i < 256; i++)
-;;   x[i] = x[42] + y[i]
-
-define void @f1(...) nounwind {
-entry:
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 42
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
-  %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
-  %x = load i32* %x.ld.addr   ; 0
-  %y = load i32* %y.addr      ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.addr  ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; for (i = 0; i < 250; i++)
-;;   x[i] = x[255] + y[i]
-
-define void @f2(...) nounwind {
-entry:
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 255
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
-  %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
-  %x = load i32* %x.ld.addr   ; 0
-  %y = load i32* %y.addr      ; 1
-  %r = add i32 %y, %x
-  store i32 %r, i32* %x.addr  ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 250
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/test/Analysis/LoopDependenceAnalysis/ziv.ll b/test/Analysis/LoopDependenceAnalysis/ziv.ll
deleted file mode 100644
index 645ae7f152e2f..0000000000000
--- a/test/Analysis/LoopDependenceAnalysis/ziv.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-@x = common global [256 x i32] zeroinitializer, align 4
-
-;; x[5] = x[6]
-
-define void @f1(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %x = load i32* getelementptr ([256 x i32]* @x, i32 0, i64 6)
-  store i32 %x, i32* getelementptr ([256 x i32]* @x, i32 0, i64 5)
-; CHECK: 0,1: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; x[c] = x[c+1] // with c being a loop-invariant constant
-
-define void @f2(i64 %c0) nounwind {
-entry:
-  %c1 = add i64 %c0, 1
-  %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %c0
-  %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %c1
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %x = load i32* %x.ld.addr
-  store i32 %x, i32* %x.st.addr
-; CHECK: 0,1: ind
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-;; x[6] = x[6]
-
-define void @f3(...) nounwind {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %x = load i32* getelementptr ([256 x i32]* @x, i32 0, i64 6)
-  store i32 %x, i32* getelementptr ([256 x i32]* @x, i32 0, i64 6)
-; CHECK: 0,1: dep
-  %i.next = add i64 %i, 1
-  %exitcond = icmp eq i64 %i.next, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/test/Analysis/Profiling/load-branch-weights-ifs.ll b/test/Analysis/Profiling/load-branch-weights-ifs.ll
new file mode 100644
index 0000000000000..7ed090b7c366a
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-ifs.ll
@@ -0,0 +1,122 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN:     -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN:     | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_mod - Branch taken 6 times in 7.
+define i32 @func_mod(i32 %N) nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %N.addr = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  %0 = load i32* %N.addr, align 4
+  %rem = srem i32 %0, 7
+  %tobool = icmp ne i32 %rem, 0
+  br i1 %tobool, label %if.then, label %if.else
+; CHECK: br i1 %tobool, label %if.then, label %if.else, !prof !0
+
+if.then:
+  store i32 1, i32* %retval
+  br label %return
+
+if.else:
+  store i32 0, i32* %retval
+  br label %return
+
+return:
+  %1 = load i32* %retval
+  ret i32 %1
+}
+
+;; func_const_true - conditional branch with 100% taken probability.
+define i32 @func_const_true(i32 %N) nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %N.addr = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  %0 = load i32* %N.addr, align 4
+  %cmp = icmp eq i32 %0, 1
+  br i1 %cmp, label %if.then, label %if.end
+; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !1
+
+if.then:
+  store i32 1, i32* %retval
+  br label %return
+
+if.end:
+  store i32 0, i32* %retval
+  br label %return
+
+return:
+  %1 = load i32* %retval
+  ret i32 %1
+}
+
+;; func_const_false - conditional branch with 100% not-taken probability.
+define i32 @func_const_false(i32 %N) nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %N.addr = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  %0 = load i32* %N.addr, align 4
+  %cmp = icmp eq i32 %0, 1
+  br i1 %cmp, label %if.then, label %if.end
+; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !2
+
+if.then:
+  store i32 1, i32* %retval
+  br label %return
+
+if.end:
+  store i32 0, i32* %retval
+  br label %return
+
+return:
+  %1 = load i32* %retval
+  ret i32 %1
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {  ; profiling driver: exercises the three functions above
+entry:
+  %retval = alloca i32, align 4
+  %argc.addr = alloca i32, align 4
+  %argv.addr = alloca i8**, align 8
+  %loop = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %loop, align 4  ; loop counter starts at 0
+  br label %for.cond
+
+for.cond:
+  %0 = load i32* %loop, align 4
+  %cmp = icmp slt i32 %0, 7000  ; continue while counter < 7000
+  br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !3
+
+for.body:
+  %1 = load i32* %loop, align 4
+  %call = call i32 @func_mod(i32 %1)  ; called with 0..6999; result unused
+  br label %for.inc
+
+for.inc:
+  %2 = load i32* %loop, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, i32* %loop, align 4
+  br label %for.cond
+
+for.end:
+  %call1 = call i32 @func_const_true(i32 1)  ; single call, argument equals 1
+  %call2 = call i32 @func_const_false(i32 0)  ; single call, argument differs from 1
+  ret i32 0
+}
+
+; CHECK: !0 = metadata !{metadata !"branch_weights", i32 6000, i32 1000}
+; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 0}
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7000, i32 1}
+; CHECK-NOT: !4
diff --git a/test/Analysis/Profiling/load-branch-weights-loops.ll b/test/Analysis/Profiling/load-branch-weights-loops.ll
new file mode 100644
index 0000000000000..9d1925a2d7016
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-loops.ll
@@ -0,0 +1,188 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN:     -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN:     | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_for - Test branch probabilities for a vanilla for loop.
+define i32 @func_for(i32 %N) nounwind uwtable {
+entry:
+  %N.addr = alloca i32, align 4
+  %ret = alloca i32, align 4
+  %loop = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  store i32 0, i32* %ret, align 4
+  store i32 0, i32* %loop, align 4
+  br label %for.cond
+
+for.cond:
+  %0 = load i32* %loop, align 4
+  %1 = load i32* %N.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !0
+
+for.body:
+  %2 = load i32* %N.addr, align 4
+  %3 = load i32* %ret, align 4
+  %add = add nsw i32 %3, %2
+  store i32 %add, i32* %ret, align 4
+  br label %for.inc
+
+for.inc:
+  %4 = load i32* %loop, align 4
+  %inc = add nsw i32 %4, 1
+  store i32 %inc, i32* %loop, align 4
+  br label %for.cond
+
+for.end:
+  %5 = load i32* %ret, align 4
+  ret i32 %5
+}
+
+;; func_for_odd - Test branch probabilities for a for loop with a continue and
+;; a break.
+define i32 @func_for_odd(i32 %N) nounwind uwtable {
+entry:
+  %N.addr = alloca i32, align 4
+  %ret = alloca i32, align 4
+  %loop = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  store i32 0, i32* %ret, align 4
+  store i32 0, i32* %loop, align 4
+  br label %for.cond
+
+for.cond:
+  %0 = load i32* %loop, align 4
+  %1 = load i32* %N.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:
+  %2 = load i32* %loop, align 4
+  %rem = srem i32 %2, 10
+  %tobool = icmp ne i32 %rem, 0
+  br i1 %tobool, label %if.then, label %if.end
+; CHECK: br i1 %tobool, label %if.then, label %if.end, !prof !2
+
+if.then:
+  br label %for.inc
+
+if.end:
+  %3 = load i32* %loop, align 4
+  %cmp1 = icmp eq i32 %3, 500
+  br i1 %cmp1, label %if.then2, label %if.end3
+; CHECK: br i1 %cmp1, label %if.then2, label %if.end3, !prof !3
+
+if.then2:
+  br label %for.end
+
+if.end3:
+  %4 = load i32* %N.addr, align 4
+  %5 = load i32* %ret, align 4
+  %add = add nsw i32 %5, %4
+  store i32 %add, i32* %ret, align 4
+  br label %for.inc
+
+for.inc:
+  %6 = load i32* %loop, align 4
+  %inc = add nsw i32 %6, 1
+  store i32 %inc, i32* %loop, align 4
+  br label %for.cond
+
+for.end:
+  %7 = load i32* %ret, align 4
+  ret i32 %7
+}
+
+;; func_while - Test branch probability in a vanilla while loop.
+define i32 @func_while(i32 %N) nounwind uwtable {
+entry:
+  %N.addr = alloca i32, align 4
+  %ret = alloca i32, align 4
+  %loop = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  store i32 0, i32* %ret, align 4
+  store i32 0, i32* %loop, align 4
+  br label %while.cond
+
+while.cond:
+  %0 = load i32* %loop, align 4
+  %1 = load i32* %N.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %while.body, label %while.end
+; CHECK: br i1 %cmp, label %while.body, label %while.end, !prof !0
+
+while.body:
+  %2 = load i32* %N.addr, align 4
+  %3 = load i32* %ret, align 4
+  %add = add nsw i32 %3, %2
+  store i32 %add, i32* %ret, align 4
+  %4 = load i32* %loop, align 4
+  %inc = add nsw i32 %4, 1
+  store i32 %inc, i32* %loop, align 4
+  br label %while.cond
+
+while.end:
+  %5 = load i32* %ret, align 4
+  ret i32 %5
+}
+
+;; func_do_while - Test branch probability in a vanilla do-while loop.
+define i32 @func_do_while(i32 %N) nounwind uwtable {
+entry:
+  %N.addr = alloca i32, align 4
+  %ret = alloca i32, align 4
+  %loop = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  store i32 0, i32* %ret, align 4
+  store i32 0, i32* %loop, align 4
+  br label %do.body
+
+do.body:
+  %0 = load i32* %N.addr, align 4
+  %1 = load i32* %ret, align 4
+  %add = add nsw i32 %1, %0
+  store i32 %add, i32* %ret, align 4
+  %2 = load i32* %loop, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, i32* %loop, align 4
+  br label %do.cond
+
+do.cond:
+  %3 = load i32* %loop, align 4
+  %4 = load i32* %N.addr, align 4
+  %cmp = icmp slt i32 %3, %4
+  br i1 %cmp, label %do.body, label %do.end
+; CHECK: br i1 %cmp, label %do.body, label %do.end, !prof !4
+
+do.end:
+  %5 = load i32* %ret, align 4
+  ret i32 %5
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %argc.addr = alloca i32, align 4
+  %argv.addr = alloca i8**, align 8
+  store i32 0, i32* %retval
+  %call = call i32 @func_for(i32 1000)
+  %call1 = call i32 @func_for_odd(i32 1000)
+  %call2 = call i32 @func_while(i32 1000)
+  %call3 = call i32 @func_do_while(i32 1000)
+  ret i32 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 1000, i32 1}
+!1 = metadata !{metadata !"branch_weights", i32 501, i32 0}
+!2 = metadata !{metadata !"branch_weights", i32 450, i32 51}
+!3 = metadata !{metadata !"branch_weights", i32 1, i32 50}
+!4 = metadata !{metadata !"branch_weights", i32 999, i32 1}
+; CHECK-NOT: !5
diff --git a/test/Analysis/Profiling/load-branch-weights-switches.ll b/test/Analysis/Profiling/load-branch-weights-switches.ll
new file mode 100644
index 0000000000000..5587c7172bb6c
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-switches.ll
@@ -0,0 +1,165 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN:     -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN:     | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_switch - Test branch probabilities for a switch instruction with an
+;; even chance of taking each case (or no case).
+define i32 @func_switch(i32 %N) nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %N.addr = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  %0 = load i32* %N.addr, align 4
+  %rem = srem i32 %0, 4
+  switch i32 %rem, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+  ]
+; CHECK: ], !prof !0
+
+sw.bb:
+  store i32 5, i32* %retval
+  br label %return
+
+sw.bb1:
+  store i32 6, i32* %retval
+  br label %return
+
+sw.bb2:
+  store i32 7, i32* %retval
+  br label %return
+
+sw.epilog:
+  store i32 8, i32* %retval
+  br label %return
+
+return:
+  %1 = load i32* %retval
+  ret i32 %1
+}
+
+;; func_switch_switch - Test branch probabilities in a switch-instruction that
+;; leads to further switch instructions.  The first-tier switch occludes some
+;; possibilities in the second-tier switches, leading to some branches having a
+;; 0 probability.
+define i32 @func_switch_switch(i32 %N) nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %N.addr = alloca i32, align 4
+  store i32 %N, i32* %N.addr, align 4
+  %0 = load i32* %N.addr, align 4
+  %rem = srem i32 %0, 2
+  switch i32 %rem, label %sw.default11 [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb5
+  ]
+; CHECK: ], !prof !1
+
+sw.bb:
+  %1 = load i32* %N.addr, align 4
+  %rem1 = srem i32 %1, 4
+  switch i32 %rem1, label %sw.default [
+    i32 0, label %sw.bb2
+    i32 1, label %sw.bb3
+    i32 2, label %sw.bb4
+  ]
+; CHECK: ], !prof !2
+
+sw.bb2:
+  store i32 5, i32* %retval
+  br label %return
+
+sw.bb3:
+  store i32 6, i32* %retval
+  br label %return
+
+sw.bb4:
+  store i32 7, i32* %retval
+  br label %return
+
+sw.default:
+  store i32 8, i32* %retval
+  br label %return
+
+sw.bb5:
+  %2 = load i32* %N.addr, align 4
+  %rem6 = srem i32 %2, 4
+  switch i32 %rem6, label %sw.default10 [
+    i32 0, label %sw.bb7
+    i32 1, label %sw.bb8
+    i32 2, label %sw.bb9
+  ]
+; CHECK: ], !prof !3
+
+sw.bb7:
+  store i32 9, i32* %retval
+  br label %return
+
+sw.bb8:
+  store i32 10, i32* %retval
+  br label %return
+
+sw.bb9:
+  store i32 11, i32* %retval
+  br label %return
+
+sw.default10:
+  store i32 12, i32* %retval
+  br label %return
+
+sw.default11:
+  store i32 13, i32* %retval
+  br label %return
+
+return:
+  %3 = load i32* %retval
+  ret i32 %3
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %argc.addr = alloca i32, align 4
+  %argv.addr = alloca i8**, align 8
+  %loop = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %loop, align 4
+  br label %for.cond
+
+for.cond:
+  %0 = load i32* %loop, align 4
+  %cmp = icmp slt i32 %0, 4000
+  br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !4
+
+for.body:
+  %1 = load i32* %loop, align 4
+  %call = call i32 @func_switch(i32 %1)
+  %2 = load i32* %loop, align 4
+  %call1 = call i32 @func_switch_switch(i32 %2)
+  br label %for.inc
+
+for.inc:
+  %3 = load i32* %loop, align 4
+  %inc = add nsw i32 %3, 1
+  store i32 %inc, i32* %loop, align 4
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+; CHECK: !0 = metadata !{metadata !"branch_weights", i32 1000, i32 1000, i32 1000, i32 1000}
+; CHECK: !1 = metadata !{metadata !"branch_weights", i32 0, i32 2000, i32 2000}
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1000, i32 0, i32 1000}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 1000, i32 0, i32 1000, i32 0}
+; CHECK: !4 = metadata !{metadata !"branch_weights", i32 4000, i32 1}
+; CHECK-NOT: !5
diff --git a/test/Assembler/2008-09-02-FunctionNotes2.ll b/test/Assembler/2008-09-02-FunctionNotes2.ll
index 97351e2a5713e..47eb011343fbe 100644
--- a/test/Assembler/2008-09-02-FunctionNotes2.ll
+++ b/test/Assembler/2008-09-02-FunctionNotes2.ll
@@ -1,5 +1,5 @@
 ; Test function notes
-; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Attributes noinline alwaysinline are incompatible"
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Attributes 'noinline and alwaysinline' are incompatible"
 define void @fn1() alwaysinline  noinline {
   ret void
 }
diff --git a/test/Assembler/global-addrspace-forwardref.ll b/test/Assembler/global-addrspace-forwardref.ll
new file mode 100644
index 0000000000000..f0f094a2248dd
--- /dev/null
+++ b/test/Assembler/global-addrspace-forwardref.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+; Make sure the address space of forward decls is preserved
+
+; CHECK: @a2 = global i8 addrspace(1)* @a
+; CHECK: @a = addrspace(1) global i8 0
+@a2 = global i8 addrspace(1)* @a
+@a = addrspace(1) global i8 0
diff --git a/test/Assembler/invalid-fwdref1.ll b/test/Assembler/invalid-fwdref1.ll
new file mode 100644
index 0000000000000..ef8b16cadceb3
--- /dev/null
+++ b/test/Assembler/invalid-fwdref1.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as %s -disable-output 2>&1 | grep "invalid forward reference to function as global value!"
+
+define i8* @test1() { ret i8* @test1a }
+define void @test1a() { }
diff --git a/test/Bindings/Ocaml/ipo_opts.ml b/test/Bindings/Ocaml/ipo_opts.ml
index 3a362319a731b..d4537e4413fbf 100644
--- a/test/Bindings/Ocaml/ipo_opts.ml
+++ b/test/Bindings/Ocaml/ipo_opts.ml
@@ -43,10 +43,10 @@ let test_transforms () =
       ignore (build_ret (build_call fn [| |] "" b) b);
   end;
 
-  let td = TargetData.create (target_triple m) in
+  let td = DataLayout.create (target_triple m) in
   
   ignore (PassManager.create ()
-           ++ TargetData.add td
+           ++ DataLayout.add td
            ++ add_argument_promotion
            ++ add_constant_merge
            ++ add_dead_arg_elimination
@@ -63,7 +63,7 @@ let test_transforms () =
            ++ PassManager.run_module m
            ++ PassManager.dispose);
 
-  TargetData.dispose td
+  DataLayout.dispose td
 
 
 (*===-- Driver ------------------------------------------------------------===*)
diff --git a/test/Bindings/Ocaml/scalar_opts.ml b/test/Bindings/Ocaml/scalar_opts.ml
index 34a7a6a01bd0f..0760dad4ad023 100644
--- a/test/Bindings/Ocaml/scalar_opts.ml
+++ b/test/Bindings/Ocaml/scalar_opts.ml
@@ -38,10 +38,10 @@ let test_transforms () =
   let fn = define_function "fn" fty m in
   ignore (build_ret_void (builder_at_end context (entry_block fn)));
   
-  let td = TargetData.create (target_triple m) in
+  let td = DataLayout.create (target_triple m) in
   
   ignore (PassManager.create_function m
-           ++ TargetData.add td
+           ++ DataLayout.add td
            ++ add_verifier
            ++ add_constant_propagation
            ++ add_sccp
@@ -78,7 +78,7 @@ let test_transforms () =
            ++ PassManager.finalize
            ++ PassManager.dispose);
   
-  TargetData.dispose td
+  DataLayout.dispose td
 
 
 (*===-- Driver ------------------------------------------------------------===*)
diff --git a/test/Bindings/Ocaml/target.ml b/test/Bindings/Ocaml/target.ml
index 1b6b71e2759bf..7a35a790ab3af 100644
--- a/test/Bindings/Ocaml/target.ml
+++ b/test/Bindings/Ocaml/target.ml
@@ -33,10 +33,10 @@ let m = create_module context filename
 (*===-- Target Data -------------------------------------------------------===*)
 
 let test_target_data () =
-  let td = TargetData.create (target_triple m) in
+  let td = DataLayout.create (target_triple m) in
   let sty = struct_type context [| i32_type; i64_type |] in
   
-  ignore (TargetData.as_string td);
+  ignore (DataLayout.as_string td);
   ignore (byte_order td);
   ignore (pointer_size td);
   ignore (intptr_type td);
@@ -49,7 +49,7 @@ let test_target_data () =
   ignore (element_at_offset td sty (Int64.of_int 1));
   ignore (offset_of_element td sty 1);
   
-  TargetData.dispose td
+  DataLayout.dispose td
 
 
 (*===-- Driver ------------------------------------------------------------===*)
diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml
index b8eb6d3e3dd1d..61be4b7703587 100644
--- a/test/Bindings/Ocaml/vmcore.ml
+++ b/test/Bindings/Ocaml/vmcore.ml
@@ -113,14 +113,14 @@ let test_constants () =
   ignore (define_global "const_int_string" c m);
   insist (i32_type = type_of c);
 
-  (* RUN: grep 'const_string.*"cruel\00world"' < %t.ll
+  (* RUN: grep 'const_string.*"cruel\\00world"' < %t.ll
    *)
   group "string";
   let c = const_string context "cruel\000world" in
   ignore (define_global "const_string" c m);
   insist ((array_type i8_type 11) = type_of c);
 
-  (* RUN: grep 'const_stringz.*"hi\00again\00"' < %t.ll
+  (* RUN: grep 'const_stringz.*"hi\\00again\\00"' < %t.ll
    *)
   group "stringz";
   let c = const_stringz context "hi\000again" in
@@ -187,7 +187,7 @@ let test_constants () =
   ignore (define_global "const_all_ones" c m);
 
   group "pointer null"; begin
-    (* RUN: grep "const_pointer_null = global i64* null" < %t.ll
+    (* RUN: grep "const_pointer_null = global i64\* null" < %t.ll
      *)
     let c = const_pointer_null (pointer_type i64_type) in
     ignore (define_global "const_pointer_null" c m);
@@ -542,7 +542,7 @@ let test_users () =
 (*===-- Aliases -----------------------------------------------------------===*)
 
 let test_aliases () =
-  (* RUN: grep "@alias = alias i32* @aliasee" < %t.ll
+  (* RUN: grep "@alias = alias i32\* @aliasee" < %t.ll
    *)
   let v = declare_global i32_type "aliasee" m in
   ignore (add_alias m (pointer_type i32_type) v "alias")
@@ -554,7 +554,7 @@ let test_functions () =
   let ty = function_type i32_type [| i32_type; i64_type |] in
   let ty2 = function_type i8_type [| i8_type; i64_type |] in
   
-  (* RUN: grep "declare i32 @Fn1\(i32, i64\)" < %t.ll
+  (* RUN: grep 'declare i32 @Fn1(i32, i64)' < %t.ll
    *)
   begin group "declare";
     insist (None = lookup_function "Fn1" m);
@@ -935,7 +935,7 @@ let test_builder () =
 
   group "malloc/free"; begin
       (* RUN: grep "call.*@malloc(i32 ptrtoint" < %t.ll
-       * RUN: grep "call.*@free(i8*" < %t.ll
+       * RUN: grep "call.*@free(i8\*" < %t.ll
        * RUN: grep "call.*@malloc(i32 %" < %t.ll
        *)
       let bb1 = append_block context "MallocBlock1" fn in
@@ -947,7 +947,7 @@ let test_builder () =
   end;
 
   group "indirectbr"; begin
-    (* RUN: grep "indirectbr i8* blockaddress(@X7, %IBRBlock2), [label %IBRBlock2, label %IBRBlock3]" < %t.ll
+    (* RUN: grep "indirectbr i8\* blockaddress(@X7, %IBRBlock2), \[label %IBRBlock2, label %IBRBlock3\]" < %t.ll
      *)
     let bb1 = append_block context "IBRBlock1" fn in
 
@@ -1054,10 +1054,10 @@ let test_builder () =
 
     (* RUN: grep "%build_alloca = alloca i32" < %t.ll
      * RUN: grep "%build_array_alloca = alloca i32, i32 %P2" < %t.ll
-     * RUN: grep "%build_load = load i32* %build_array_alloca" < %t.ll
-     * RUN: grep "store i32 %P2, i32* %build_alloca" < %t.ll
-     * RUN: grep "%build_gep = getelementptr i32* %build_array_alloca, i32 %P2" < %t.ll
-     * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32* %build_array_alloca, i32 %P2" < %t.ll
+     * RUN: grep "%build_load = load i32\* %build_array_alloca" < %t.ll
+     * RUN: grep "store i32 %P2, i32\* %build_alloca" < %t.ll
+     * RUN: grep "%build_gep = getelementptr i32\* %build_array_alloca, i32 %P2" < %t.ll
+     * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32\* %build_array_alloca, i32 %P2" < %t.ll
      * RUN: grep "%build_struct_gep = getelementptr inbounds.*%build_alloca2, i32 0, i32 1" < %t.ll
      *)
     let alloca = build_alloca i32_type "build_alloca" b in
@@ -1106,14 +1106,14 @@ let test_builder () =
      * RUN: grep "%build_fptrunc2 = fptrunc double %build_sitofp to float" < %t.ll
      * RUN: grep "%build_fpext = fpext float %build_fptrunc to double" < %t.ll
      * RUN: grep "%build_fpext2 = fpext float %build_fptrunc to double" < %t.ll
-     * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8*" < %t.ll
-     * RUN: grep "%build_ptrtoint = ptrtoint i8* %build_inttoptr to i64" < %t.ll
-     * RUN: grep "%build_ptrtoint2 = ptrtoint i8* %build_inttoptr to i64" < %t.ll
+     * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8\*" < %t.ll
+     * RUN: grep "%build_ptrtoint = ptrtoint i8\* %build_inttoptr to i64" < %t.ll
+     * RUN: grep "%build_ptrtoint2 = ptrtoint i8\* %build_inttoptr to i64" < %t.ll
      * RUN: grep "%build_bitcast = bitcast i64 %build_ptrtoint to double" < %t.ll
      * RUN: grep "%build_bitcast2 = bitcast i64 %build_ptrtoint to double" < %t.ll
      * RUN: grep "%build_bitcast3 = bitcast i64 %build_ptrtoint to double" < %t.ll
      * RUN: grep "%build_bitcast4 = bitcast i64 %build_ptrtoint to double" < %t.ll
-     * RUN: grep "%build_pointercast = bitcast i8* %build_inttoptr to i16*" < %t.ll
+     * RUN: grep "%build_pointercast = bitcast i8\* %build_inttoptr to i16*" < %t.ll
      *)
     let inst28 = build_trunc p1 i8_type "build_trunc" atentry in
     let inst29 = build_zext inst28 i32_type "build_zext" atentry in
@@ -1148,7 +1148,7 @@ let test_builder () =
      * RUN: grep "%build_fcmp_false = fcmp false float %F1, %F2" < %t.ll
      * RUN: grep "%build_fcmp_true = fcmp true float %F2, %F1" < %t.ll
      * RUN: grep "%build_is_null.*= icmp eq.*%X0,.*null" < %t.ll
-     * RUN: grep "%build_is_not_null = icmp ne i8* %X1, null" < %t.ll
+     * RUN: grep "%build_is_not_null = icmp ne i8\* %X1, null" < %t.ll
      * RUN: grep "%build_ptrdiff" < %t.ll
      *)
     ignore (build_icmp Icmp.Ne    p1 p2 "build_icmp_ne" atentry);
@@ -1167,7 +1167,7 @@ let test_builder () =
   group "miscellaneous"; begin
     (* RUN: grep "%build_call = tail call cc63 i32 @.*(i32 signext %P2, i32 %P1)" < %t.ll
      * RUN: grep "%build_select = select i1 %build_icmp, i32 %P1, i32 %P2" < %t.ll
-     * RUN: grep "%build_va_arg = va_arg i8** null, i32" < %t.ll
+     * RUN: grep "%build_va_arg = va_arg i8\*\* null, i32" < %t.ll
      * RUN: grep "%build_extractelement = extractelement <4 x i32> %Vec1, i32 %P2" < %t.ll
      * RUN: grep "%build_insertelement = insertelement <4 x i32> %Vec1, i32 %P1, i32 %P2" < %t.ll
      * RUN: grep "%build_shufflevector = shufflevector <4 x i32> %Vec1, <4 x i32> %Vec2, <4 x i32> <i32 1, i32 1, i32 0, i32 0>" < %t.ll
@@ -1240,8 +1240,8 @@ let test_builder () =
   end;
 
   group "dbg"; begin
-    (* RUN: grep "%dbg = add i32 %P1, %P2, !dbg !1" < %t.ll
-     * RUN: grep "!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}" < %t.ll
+    (* RUN: grep '%dbg = add i32 %P1, %P2, !dbg !1' < %t.ll
+     * RUN: grep '!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}' < %t.ll
      *)
     insist ((current_debug_location atentry) = None);
 
diff --git a/test/Bitcode/blockaddress.ll b/test/Bitcode/blockaddress.ll
index b9f334176caae..8ac54be00d541 100644
--- a/test/Bitcode/blockaddress.ll
+++ b/test/Bitcode/blockaddress.ll
@@ -28,3 +28,18 @@ here:
 end:
   ret void
 }
+
+; PR13895
+define void @doitagain(i8** nocapture %pptr) {
+; CHECK: define void @doitagain
+entry:
+  br label %here
+
+here:
+  store i8* blockaddress(@doit, %here), i8** %pptr, align 8
+; CHECK: blockaddress(@doit, %here)
+  br label %end
+
+end:
+  ret void
+}
diff --git a/test/Bitcode/function-encoding-rel-operands.ll b/test/Bitcode/function-encoding-rel-operands.ll
new file mode 100644
index 0000000000000..aedb0c32676f8
--- /dev/null
+++ b/test/Bitcode/function-encoding-rel-operands.ll
@@ -0,0 +1,49 @@
+; Basic sanity test to check that instruction operands are encoded with
+; relative IDs.
+; RUN: llvm-as < %s | llvm-bcanalyzer -dump | FileCheck %s
+
+; CHECK: FUNCTION_BLOCK
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_RET {{.*}}op0=1
+define i32 @test_int_binops(i32 %a) nounwind {
+entry:
+  %0 = add i32 %a, %a
+  %1 = sub i32 %0, %0
+  %2 = mul i32 %1, %1
+  ret i32 %2
+}
+
+
+; CHECK: FUNCTION_BLOCK
+; CHECK: INST_CAST {{.*}}op0=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_RET {{.*}}op0=1
+define double @test_float_binops(i32 %a) nounwind {
+  %1 = sitofp i32 %a to double
+  %2 = fadd double %1, %1
+  %3 = fsub double %2, %2
+  %4 = fmul double %3, %3
+  %5 = fdiv double %4, %4
+  ret double %5
+}
+
+
+; CHECK: FUNCTION_BLOCK
+; skip checking operands of INST_INBOUNDS_GEP since that depends on ordering
+; between literals and the formal parameters.
+; CHECK: INST_INBOUNDS_GEP {{.*}}
+; CHECK: INST_LOAD {{.*}}op0=1 {{.*}}
+; CHECK: INST_CMP2 op0=1 {{.*}}
+; CHECK: INST_RET {{.*}}op0=1
+define i1 @test_load(i32 %a, {i32, i32}* %ptr) nounwind {
+entry:
+  %0 = getelementptr inbounds {i32, i32}* %ptr, i32 %a, i32 0
+  %1 = load i32* %0
+  %2 = icmp eq i32 %1, %a
+  ret i1 %2
+}
diff --git a/test/BugPoint/crash-narrowfunctiontest.ll b/test/BugPoint/crash-narrowfunctiontest.ll
index d080d9dd4b0ca..c812836957315 100644
--- a/test/BugPoint/crash-narrowfunctiontest.ll
+++ b/test/BugPoint/crash-narrowfunctiontest.ll
@@ -2,6 +2,7 @@
 ;
 ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null
 ; REQUIRES: loadable_module
+; XFAIL: lto_on_osx
 
 define i32 @foo() { ret i32 1 }
 
diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll
index 0eda5667ba4a9..6dc9574bbe4b1 100644
--- a/test/BugPoint/metadata.ll
+++ b/test/BugPoint/metadata.ll
@@ -1,6 +1,7 @@
 ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null
 ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
 ; REQUIRES: loadable_module
+; XFAIL: lto_on_osx
 
 ; Bugpoint should keep the call's metadata attached to the call.
 
diff --git a/test/BugPoint/remove_arguments_test.ll b/test/BugPoint/remove_arguments_test.ll
index 29a03b8310776..5a45f846e1039 100644
--- a/test/BugPoint/remove_arguments_test.ll
+++ b/test/BugPoint/remove_arguments_test.ll
@@ -1,6 +1,7 @@
 ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes
 ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
 ; REQUIRES: loadable_module
+; XFAIL: lto_on_osx
 
 ; Test to make sure that arguments are removed from the function if they are 
 ; unnecessary. And clean up any types that that frees up too.
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 991cc9df1639e..e10a532341e6f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -7,6 +7,11 @@ configure_lit_site_cfg(
   ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg
   )
 
+# Don't include check-llvm into check-all without LLVM_BUILD_TOOLS.
+if(NOT LLVM_BUILD_TOOLS)
+  set(EXCLUDE_FROM_ALL ON)
+endif()
+
 add_lit_testsuite(check-llvm "Running the LLVM regression tests"
   ${CMAKE_CURRENT_BINARY_DIR}
   PARAMS llvm_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
@@ -14,10 +19,16 @@ add_lit_testsuite(check-llvm "Running the LLVM regression tests"
   DEPENDS UnitTests
           BugpointPasses LLVMHello
           llc lli llvm-ar llvm-as
-          llvm-diff
+          llvm-bcanalyzer llvm-diff
           llvm-dis llvm-extract llvm-dwarfdump
-          llvm-link llvm-mc llvm-nm llvm-objdump llvm-readobj
+          llvm-link
+          llvm-mc
+          llvm-mcmarkup
+          llvm-nm
+          llvm-objdump
+          llvm-readobj
           macho-dump opt
+          profile_rt-shared
           FileCheck count not
           yaml2obj
   )
diff --git a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
index 99db63713d42b..36d15757c3142 100644
--- a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
+++ b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
@@ -13,12 +13,12 @@
 ; BASIC-NEXT:         0x00000000
 ; BASIC-NEXT:         0x00000000
 ; BASIC-NEXT:         0x0000003c
-; BASIC-NEXT:         0x00000020
+; BASIC-NEXT:         0x00000022
 ; BASIC-NEXT:         0x00000000
 ; BASIC-NEXT:         0x00000000
 ; BASIC-NEXT:         0x00000001
 ; BASIC-NEXT:         0x00000000
-; BASIC-NEXT:         '411f0000 00616561 62690001 15000000 06020801 09011401 15011703 18011901'
+; BASIC-NEXT:         '41210000 00616561 62690001 17000000 060a0741 08010902 14011501 17031801 1901'
 
 ; CORTEXA8:        .ARM.attributes
 ; CORTEXA8-NEXT:         0x70000003
diff --git a/test/CodeGen/ARM/2010-12-07-PEIBug.ll b/test/CodeGen/ARM/2010-12-07-PEIBug.ll
index 770ad4466aff2..4879f4e10bacf 100644
--- a/test/CodeGen/ARM/2010-12-07-PEIBug.ll
+++ b/test/CodeGen/ARM/2010-12-07-PEIBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a9 | FileCheck %s
 ; rdar://8728956
 
 define hidden void @foo() nounwind ssp {
diff --git a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
index 3e78c4623859c..101a91396eb74 100644
--- a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
+++ b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
@@ -1,4 +1,9 @@
 ; RUN: llc < %s -arm-tail-calls=1 | FileCheck %s
+
+; A tail call inside a function where a byval argument is split between
+; registers and the stack is currently unsupported.
+; XFAIL: *
+
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-ios"
 
diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
index 42b14914814a7..6e0ef96196574 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
@@ -9,8 +9,8 @@ entry:
 }
 
 ; Trigger multiple NEON stores.
-; CHECK:      vstmia
-; CHECK-NEXT: vstmia
+; CHECK:      vst1.64
+; CHECK-NEXT: vst1.64
 define void @f_0_40(i8* nocapture %c) nounwind optsize {
 entry:
   call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 40, i32 16, i1 false)
diff --git a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
index 89c01d58c3985..f9ede7401a3c6 100644
--- a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
+++ b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
@@ -8,12 +8,12 @@ define void @test_sqrt(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw    r1, :lower16:{{.*}}
 ; CHECK:      movt    r1, :upper16:{{.*}}
-; CHECK:      vldmia  r1
+; CHECK:      vld1.64 {{.*}}, [r1, :128]
 ; CHECK:      vsqrt.f32       {{s[0-9]+}}, {{s[0-9]+}}
 ; CHECK:      vsqrt.f32       {{s[0-9]+}}, {{s[0-9]+}}
 ; CHECK:      vsqrt.f32       {{s[0-9]+}}, {{s[0-9]+}}
 ; CHECK:      vsqrt.f32       {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64  {{.*}}
 
 L.entry:
   %0 = load <4 x float>* @A, align 16
@@ -31,21 +31,21 @@ define void @test_cos(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:      movt  [[reg0]], :upper16:{{.*}}
-; CHECK:      vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK:      vld1.64
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}cosf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}cosf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}cosf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}cosf
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
   %0 = load <4 x float>* @A, align 16
@@ -62,21 +62,21 @@ define void @test_exp(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:      movt  [[reg0]], :upper16:{{.*}}
-; CHECK:      vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK:      vld1.64
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}expf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}expf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}expf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}expf
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
   %0 = load <4 x float>* @A, align 16
@@ -93,21 +93,21 @@ define void @test_exp2(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:      movt  [[reg0]], :upper16:{{.*}}
-; CHECK:      vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK:      vld1.64
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}exp2f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}exp2f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}exp2f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}exp2f
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
   %0 = load <4 x float>* @A, align 16
@@ -124,21 +124,21 @@ define void @test_log10(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:      movt  [[reg0]], :upper16:{{.*}}
-; CHECK:      vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK:      vld1.64
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}log10f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}log10f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}log10f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}log10f
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
   %0 = load <4 x float>* @A, align 16
@@ -155,21 +155,21 @@ define void @test_log(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:      movt  [[reg0]], :upper16:{{.*}}
-; CHECK:      vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK:      vld1.64
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}logf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}logf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}logf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}logf
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
   %0 = load <4 x float>* @A, align 16
@@ -186,21 +186,21 @@ define void @test_log2(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:      movt  [[reg0]], :upper16:{{.*}}
-; CHECK:      vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK:      vld1.64
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}log2f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}log2f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}log2f
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}log2f
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
   %0 = load <4 x float>* @A, align 16
@@ -218,21 +218,21 @@ define void @test_pow(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:      movt  [[reg0]], :upper16:{{.*}}
-; CHECK:      vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK:      vld1.64
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}powf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}powf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}powf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}powf
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
 
@@ -252,10 +252,10 @@ define void @test_powi(<4 x float>* %X) nounwind {
 
 ; CHECK:       movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:       movt  [[reg0]], :upper16:{{.*}}
-; CHECK:       vldmia  [[reg0]], {{.*}}
+; CHECK:       vld1.64 {{.*}}, :128
 ; CHECK:       vmul.f32 {{.*}}
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
 
@@ -275,21 +275,21 @@ define void @test_sin(<4 x float>* %X) nounwind {
 
 ; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
 ; CHECK:      movt  [[reg0]], :upper16:{{.*}}
-; CHECK:      vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK:      vld1.64
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}sinf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}sinf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}sinf
 
-; CHECK:      {{[mov|vmov.32]}}  r0,
+; CHECK:      {{v?mov(.32)?}}  r0,
 ; CHECK:      bl  {{.*}}sinf
 
-; CHECK:      vstmia  {{.*}}
+; CHECK:      vst1.64
 
 L.entry:
   %0 = load <4 x float>* @A, align 16
@@ -300,3 +300,34 @@ L.entry:
 
 declare <4 x float> @llvm.sin.v4f32(<4 x float>) nounwind readonly
 
+define void @test_floor(<4 x float>* %X) nounwind {
+
+; CHECK: test_floor:
+
+; CHECK:      movw  [[reg0:r[0-9]+]], :lower16:{{.*}}
+; CHECK:      movt  [[reg0]], :upper16:{{.*}}
+; CHECK:      vld1.64
+
+; CHECK:      {{v?mov(.32)?}}  r0,
+; CHECK:      bl  {{.*}}floorf
+
+; CHECK:      {{v?mov(.32)?}}  r0,
+; CHECK:      bl  {{.*}}floorf
+
+; CHECK:      {{v?mov(.32)?}}  r0,
+; CHECK:      bl  {{.*}}floorf
+
+; CHECK:      {{v?mov(.32)?}}  r0,
+; CHECK:      bl  {{.*}}floorf
+
+; CHECK:      vst1.64
+
+L.entry:
+  %0 = load <4 x float>* @A, align 16
+  %1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* %X, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readonly
+
diff --git a/test/CodeGen/ARM/2012-05-04-vmov.ll b/test/CodeGen/ARM/2012-05-04-vmov.ll
new file mode 100644
index 0000000000000..d52ef2cc5a1c9
--- /dev/null
+++ b/test/CodeGen/ARM/2012-05-04-vmov.ll
@@ -0,0 +1,11 @@
+; RUN: llc -O1 -march=arm -mcpu=cortex-a9 < %s | FileCheck -check-prefix=A9-CHECK %s
+; RUN: llc -O1 -march=arm -mcpu=swift < %s | FileCheck -check-prefix=SWIFT-CHECK %s
+; Check that swift doesn't use vmov.32. <rdar://problem/10453003>.
+
+define <2 x i32> @testuvec(<2 x i32> %A, <2 x i32> %B) nounwind {
+entry:
+  %div = udiv <2 x i32> %A, %B
+  ret <2 x i32> %div
+; A9-CHECK: vmov.32
+; SWIFT-CHECK-NOT: vmov.32
+}
diff --git a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
new file mode 100644
index 0000000000000..dd678436c04e5
--- /dev/null
+++ b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=arm -mcpu=swift < %s | FileCheck %s
+; <rdar://problem/10451892>
+
+define void @f(i32 %x, i32* %p) nounwind ssp {
+entry:
+; CHECK-NOT: vdup.32
+  %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0
+  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1
+  %0 = bitcast i32* %p to i8*
+  tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
+  ret void
+}
+
+declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
diff --git a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
new file mode 100644
index 0000000000000..ec7f72d7c2e89
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -mcpu=cortex-a8 -march=thumb
+; Test that this doesn't crash.
+; <rdar://problem/12183003>
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.1.0"
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
+
+define void @findEdges(i8*) nounwind ssp {
+  %2 = icmp sgt i32 undef, 0
+  br i1 %2, label %5, label %3
+
+; <label>:3                                       ; preds = %5, %1
+  %4 = phi i8* [ %0, %1 ], [ %19, %5 ]
+  ret void
+
+; <label>:5                                       ; preds = %5, %1
+  %6 = phi i8* [ %19, %5 ], [ %0, %1 ]
+  %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* null, i32 1)
+  %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0
+  %9 = getelementptr inbounds i8* null, i32 3
+  %10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %9, i32 1)
+  %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %10, 2
+  %12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %6, i32 1)
+  %13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 0
+  %14 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 1
+  %15 = getelementptr inbounds i8* %6, i32 3
+  %16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %15, i32 1)
+  %17 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 1
+  %18 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 2
+  %19 = getelementptr inbounds i8* %6, i32 48
+  %20 = bitcast <16 x i8> %13 to <2 x i64>
+  %21 = bitcast <16 x i8> %8 to <2 x i64>
+  %22 = bitcast <16 x i8> %14 to <2 x i64>
+  %23 = shufflevector <2 x i64> %22, <2 x i64> undef, <1 x i32> zeroinitializer
+  %24 = bitcast <1 x i64> %23 to <8 x i8>
+  %25 = zext <8 x i8> %24 to <8 x i16>
+  %26 = sub <8 x i16> zeroinitializer, %25
+  %27 = bitcast <16 x i8> %17 to <2 x i64>
+  %28 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %26) nounwind
+  %29 = mul <8 x i16> %28, %28
+  %30 = add <8 x i16> zeroinitializer, %29
+  %31 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> undef, <8 x i16> %30) nounwind
+  %32 = bitcast <16 x i8> %11 to <2 x i64>
+  %33 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> zeroinitializer
+  %34 = bitcast <1 x i64> %33 to <8 x i8>
+  %35 = zext <8 x i8> %34 to <8 x i16>
+  %36 = sub <8 x i16> %35, zeroinitializer
+  %37 = bitcast <16 x i8> %18 to <2 x i64>
+  %38 = shufflevector <2 x i64> %37, <2 x i64> undef, <1 x i32> zeroinitializer
+  %39 = bitcast <1 x i64> %38 to <8 x i8>
+  %40 = zext <8 x i8> %39 to <8 x i16>
+  %41 = sub <8 x i16> zeroinitializer, %40
+  %42 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %36) nounwind
+  %43 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %41) nounwind
+  %44 = mul <8 x i16> %42, %42
+  %45 = mul <8 x i16> %43, %43
+  %46 = add <8 x i16> %45, %44
+  %47 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %31, <8 x i16> %46) nounwind
+  %48 = bitcast <8 x i16> %47 to <2 x i64>
+  %49 = shufflevector <2 x i64> %48, <2 x i64> undef, <1 x i32> zeroinitializer
+  %50 = bitcast <1 x i64> %49 to <4 x i16>
+  %51 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %50, <4 x i16> undef) nounwind
+  %52 = tail call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %51, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>)
+  %53 = bitcast <4 x i16> %52 to <1 x i64>
+  %54 = shufflevector <1 x i64> %53, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %55 = bitcast <2 x i64> %54 to <8 x i16>
+  %56 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %55, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+  %57 = shufflevector <2 x i64> %20, <2 x i64> undef, <1 x i32> <i32 1>
+  %58 = bitcast <1 x i64> %57 to <8 x i8>
+  %59 = zext <8 x i8> %58 to <8 x i16>
+  %60 = sub <8 x i16> zeroinitializer, %59
+  %61 = shufflevector <2 x i64> %21, <2 x i64> undef, <1 x i32> <i32 1>
+  %62 = bitcast <1 x i64> %61 to <8 x i8>
+  %63 = zext <8 x i8> %62 to <8 x i16>
+  %64 = sub <8 x i16> %63, zeroinitializer
+  %65 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %60) nounwind
+  %66 = mul <8 x i16> %65, %65
+  %67 = add <8 x i16> zeroinitializer, %66
+  %68 = shufflevector <2 x i64> %27, <2 x i64> undef, <1 x i32> <i32 1>
+  %69 = bitcast <1 x i64> %68 to <8 x i8>
+  %70 = zext <8 x i8> %69 to <8 x i16>
+  %71 = sub <8 x i16> zeroinitializer, %70
+  %72 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> undef) nounwind
+  %73 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %71) nounwind
+  %74 = mul <8 x i16> %72, %72
+  %75 = mul <8 x i16> %73, %73
+  %76 = add <8 x i16> %75, %74
+  %77 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> <i32 1>
+  %78 = bitcast <1 x i64> %77 to <8 x i8>
+  %79 = zext <8 x i8> %78 to <8 x i16>
+  %80 = sub <8 x i16> %79, zeroinitializer
+  %81 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %80) nounwind
+  %82 = mul <8 x i16> %81, %81
+  %83 = add <8 x i16> zeroinitializer, %82
+  %84 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %76, <8 x i16> %83) nounwind
+  %85 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %67, <8 x i16> %84) nounwind
+  %86 = bitcast <8 x i16> %85 to <2 x i64>
+  %87 = shufflevector <2 x i64> %86, <2 x i64> undef, <1 x i32> <i32 1>
+  %88 = bitcast <1 x i64> %87 to <4 x i16>
+  %89 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %88, <4 x i16> undef) nounwind
+  %90 = tail call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %89, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>)
+  %91 = bitcast <4 x i16> %90 to <1 x i64>
+  %92 = shufflevector <1 x i64> undef, <1 x i64> %91, <2 x i32> <i32 0, i32 1>
+  %93 = bitcast <2 x i64> %92 to <8 x i16>
+  %94 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %93, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+  %95 = bitcast <8 x i8> %56 to <1 x i64>
+  %96 = bitcast <8 x i8> %94 to <1 x i64>
+  %97 = shufflevector <1 x i64> %95, <1 x i64> %96, <2 x i32> <i32 0, i32 1>
+  %98 = bitcast <2 x i64> %97 to <16 x i8>
+  tail call void @llvm.arm.neon.vst1.v16i8(i8* null, <16 x i8> %98, i32 1)
+  %99 = icmp slt i32 undef, undef
+  br i1 %99, label %5, label %3
+}
+
+declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/ARM/2012-08-30-select.ll b/test/CodeGen/ARM/2012-08-30-select.ll
new file mode 100644
index 0000000000000..8471be5330b85
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-30-select.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; rdar://12201387
+
+;CHECK: select_s_v_v
+;CHECK: it  ne
+;CHECK-NEXT: vmovne.i32
+;CHECK: bx
+define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
+entry:
+  %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
+  %and = and i32 %avail, 1
+  %tobool = icmp eq i32 %and, 0
+  %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
+  ret <16 x i8> %vld1.
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
+
diff --git a/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
new file mode 100644
index 0000000000000..e761ffe72c13a
--- /dev/null
+++ b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm -mcpu=arm7tdmi | FileCheck %s
+
+; movw is only legal for V6T2 and later.
+; rdar://12300648
+
+define i32 @t(i32 %x) {
+; CHECK: t:
+; CHECK-NOT: movw
+  %tmp = add i32 %x, -65535
+  ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
new file mode 100644
index 0000000000000..75766099a2200
--- /dev/null
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s
+
+; Check for error message:
+; CHECK: non-trivial scalar-to-vector conversion, possible invalid constraint for vector type
+
+define void @f() nounwind ssp {
+  %1 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } asm "vldm $4, { ${0:q}, ${1:q}, ${2:q}, ${3:q} }", "=r,=r,=r,=r,r"(i64* undef) nounwind, !srcloc !0
+  ret void
+}
+
+!0 = metadata !{i32 318437}
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
new file mode 100644
index 0000000000000..6fa1391474bb8
--- /dev/null
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s
+
+; Check for error message:
+; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
+
+define hidden void @f(i32* %corr, i32 %order) nounwind ssp {
+  tail call void asm sideeffect "vst1.s32 { ${1:q}, ${2:q} }, [$0]", "r,{q0},{q1}"(i32* %corr, <2 x i64>* undef, <2 x i64>* undef) nounwind, !srcloc !0
+  ret void
+}
+
+!0 = metadata !{i32 257}
diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
new file mode 100644
index 0000000000000..b5f6d311cb9cf
--- /dev/null
+++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s
+; Test that we correctly use registers and align elements when using va_arg
+
+%struct_t = type { double, double, double }
+@static_val = constant %struct_t { double 1.0, double 2.0, double 3.0 }
+
+declare void @llvm.va_start(i8*) nounwind
+declare void @llvm.va_end(i8*) nounwind
+
+; CHECK: test_byval_8_bytes_alignment:
+define void @test_byval_8_bytes_alignment(i32 %i, ...) {
+entry:
+; CHECK: stm     r0, {r1, r2, r3}
+  %g = alloca i8*
+  %g1 = bitcast i8** %g to i8*
+  call void @llvm.va_start(i8* %g1)
+
+; CHECK: add	[[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
+; CHECK: bfc	[[REG]], #0, #3
+  %0 = va_arg i8** %g, double
+  call void @llvm.va_end(i8* %g1)
+  
+  ret void
+}
+
+; CHECK: main:
+; CHECK: ldm     r0, {r2, r3}
+define i32 @main() {
+entry:
+  call void (i32, ...)* @test_byval_8_bytes_alignment(i32 555, %struct_t* byval @static_val)
+  ret i32 0
+}
+
+declare void @f(double);
+
+; CHECK:     test_byval_8_bytes_alignment_fixed_arg:
+; CHECK-NOT:   str     r1
+; CHECK:       str     r3, [sp, #12]
+; CHECK:       str     r2, [sp, #8]
+; CHECK-NOT:   str     r1
+define void @test_byval_8_bytes_alignment_fixed_arg(i32 %n1, %struct_t* byval %val) nounwind {
+entry:
+  %a = getelementptr inbounds %struct_t* %val, i32 0, i32 0
+  %0 = load double* %a
+  call void (double)* @f(double %0)
+  ret void
+}
+
+; CHECK: main_fixed_arg:
+; CHECK: ldm     r0, {r2, r3}
+define i32 @main_fixed_arg() {
+entry:
+  call void (i32, %struct_t*)* @test_byval_8_bytes_alignment_fixed_arg(i32 555, %struct_t* byval @static_val)
+  ret i32 0
+}
+
diff --git a/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll
new file mode 100644
index 0000000000000..478048d09600c
--- /dev/null
+++ b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s
+
+@.str = private unnamed_addr constant [12 x i8] c"val.a = %f\0A\00"
+%struct_t = type { double, double, double }
+@static_val = constant %struct_t { double 1.0, double 2.0, double 3.0 }
+
+declare i32 @printf(i8*, ...)
+
+; CHECK:     test_byval_usage_scheduling:
+; CHECK:       str     r3, [sp, #12]
+; CHECK:       str     r2, [sp, #8]
+; CHECK:       vldr    d16, [sp, #8]
+define void @test_byval_usage_scheduling(i32 %n1, i32 %n2, %struct_t* byval %val) nounwind {
+entry:
+  %a = getelementptr inbounds %struct_t* %val, i32 0, i32 0
+  %0 = load double* %a
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), double %0)
+  ret void
+}
diff --git a/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll b/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll
new file mode 100644
index 0000000000000..f2395107d426f
--- /dev/null
+++ b/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=armv7-none-linux- | FileCheck %s
+; Check that the LDRB_POST_IMM instruction is emitted properly.
+
+%my_struct_t = type { i8, i8, i8, i8, i8 }
+@main.val = private unnamed_addr constant %my_struct_t { i8 1, i8 2, i8 3, i8 4, i8 5 }
+
+declare void @f(i32 %n1, i32 %n2, i32 %n3, %my_struct_t* byval %val);
+
+; CHECK: main:
+define i32 @main() nounwind {
+entry:
+; CHECK: ldrb	{{(r[0-9]+)}}, {{(\[r[0-9]+\])}}, #1
+  call void @f(i32 555, i32 555, i32 555, %my_struct_t* byval @main.val)
+  ret i32 0
+}
+
diff --git a/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll
new file mode 100644
index 0000000000000..fcc6a7f7e96f2
--- /dev/null
+++ b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s
+
+%struct.s = type { [4 x i32] }
+@v = constant %struct.s zeroinitializer; 
+
+declare void @f(%struct.s* %p);
+
+; CHECK: t:
+define void @t(i32 %a, %struct.s* byval %s) nounwind {
+entry:
+
+; Here we only need to check the proper start address of the restored %s argument.
+; CHECK:      sub     sp, sp, #16
+; CHECK:      push    {r11, lr}
+; CHECK:      add     r0, sp, #12
+; CHECK:      stm     r0, {r1, r2, r3}
+; CHECK:      add     r0, sp, #12
+; CHECK-NEXT: bl f
+  call void @f(%struct.s* %s)
+  ret void
+}
+
+; CHECK: caller:
+define void @caller() {
+
+; CHECK:      ldm     r0, {r1, r2, r3}
+  call void @t(i32 0, %struct.s* @v);
+  ret void
+}
diff --git a/test/CodeGen/ARM/a15-mla.ll b/test/CodeGen/ARM/a15-mla.ll
new file mode 100644
index 0000000000000..25f6de4762d56
--- /dev/null
+++ b/test/CodeGen/ARM/a15-mla.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s  -march=arm -float-abi=hard -mcpu=cortex-a15 -mattr=+neon,+neonfp | FileCheck %s
+
+; This test checks that the VMLxForwarding feature is disabled for A15.
+; CHECK: fun_a
+define <4 x i32> @fun_a(<4 x i32> %x, <4 x i32> %y) nounwind{
+  %1 = add <4 x i32> %x, %y
+; CHECK-NOT: vmul
+; CHECK: vmla
+  %2 = mul <4 x i32> %1, %1
+  %3 = add <4 x i32> %y, %2
+  ret <4 x i32> %3
+}
diff --git a/test/CodeGen/ARM/a15.ll b/test/CodeGen/ARM/a15.ll
new file mode 100644
index 0000000000000..6f816c1c2c53e
--- /dev/null
+++ b/test/CodeGen/ARM/a15.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s  -mcpu=cortex-a15 | FileCheck %s
+
+; CHECK: a
+define i32 @a(i32 %x) {
+  ret i32 %x;
+}
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index 8967730835a51..6e6b36377fde5 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -159,3 +159,13 @@ entry:
   store i8 %3, i8* %old
   ret void
 }
+
+; CHECK: func4
+; This function should not need to use callee-saved registers.
+; rdar://problem/12203728
+; CHECK-NOT: r4
+define i32 @func4(i32* %p) nounwind optsize ssp {
+entry:
+  %0 = atomicrmw add i32* %p, i32 1 monotonic
+  ret i32 %0
+}
diff --git a/test/CodeGen/ARM/atomicrmw_minmax.ll b/test/CodeGen/ARM/atomicrmw_minmax.ll
new file mode 100644
index 0000000000000..69f1384e125cb
--- /dev/null
+++ b/test/CodeGen/ARM/atomicrmw_minmax.ll
@@ -0,0 +1,21 @@
+;  RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck %s
+
+;  CHECK: max:
+define i32 @max(i8 %ctx, i32* %ptr, i32 %val)
+{
+;  CHECK: ldrex
+;  CHECK: cmp [[old:r[0-9]*]], [[val:r[0-9]*]]
+;  CHECK: movhi {{r[0-9]*}}, [[old]]
+  %old = atomicrmw umax i32* %ptr, i32 %val monotonic
+  ret i32 %old
+}
+
+;  CHECK: min:
+define i32 @min(i8 %ctx, i32* %ptr, i32 %val)
+{
+;  CHECK: ldrex
+;  CHECK: cmp [[old:r[0-9]*]], [[val:r[0-9]*]]
+;  CHECK: movlo {{r[0-9]*}}, [[old]]
+  %old = atomicrmw umin i32* %ptr, i32 %val monotonic
+  ret i32 %old
+}
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 1b385ab79c4ea..96e83dd88e927 100644
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift     | FileCheck %s
 ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
 ; dependency) when it isn't dependent on last CPSR defining instruction.
 ; rdar://8928208
diff --git a/test/CodeGen/ARM/call-noret-minsize.ll b/test/CodeGen/ARM/call-noret-minsize.ll
new file mode 100644
index 0000000000000..df3c19eca6a02
--- /dev/null
+++ b/test/CodeGen/ARM/call-noret-minsize.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8   | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift       | FileCheck %s -check-prefix=SWIFT
+; rdar://12348580
+
+define void @t1() noreturn minsize nounwind ssp {
+entry:
+; ARM: t1:
+; ARM: bl _bar
+
+; SWIFT: t1:
+; SWIFT: bl _bar
+  tail call void @bar() noreturn nounwind
+  unreachable
+}
+
+define void @t2() noreturn minsize nounwind ssp {
+entry:
+; ARM: t2:
+; ARM: bl _t1
+
+; SWIFT: t2:
+; SWIFT: bl _t1
+  tail call void @t1() noreturn nounwind
+  unreachable
+}
+
+declare void @bar() noreturn
diff --git a/test/CodeGen/ARM/call-noret.ll b/test/CodeGen/ARM/call-noret.ll
new file mode 100644
index 0000000000000..27062dca38dce
--- /dev/null
+++ b/test/CodeGen/ARM/call-noret.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8   | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift       | FileCheck %s -check-prefix=SWIFT
+; rdar://8979299
+
+define void @t1() noreturn nounwind ssp {
+entry:
+; ARM: t1:
+; ARM: mov lr, pc
+; ARM: b _bar
+
+; SWIFT: t1:
+; SWIFT: mov lr, pc
+; SWIFT: b _bar
+  tail call void @bar() noreturn nounwind
+  unreachable
+}
+
+define void @t2() noreturn nounwind ssp {
+entry:
+; ARM: t2:
+; ARM: mov lr, pc
+; ARM: b _t1
+
+; SWIFT: t2:
+; SWIFT: mov lr, pc
+; SWIFT: b _t1
+  tail call void @t1() noreturn nounwind
+  unreachable
+}
+
+declare void @bar() noreturn
diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll
index f84774d9b6159..bf51cd627b3c8 100644
--- a/test/CodeGen/ARM/carry.ll
+++ b/test/CodeGen/ARM/carry.ll
@@ -45,3 +45,16 @@ entry:
   %0 = sub nsw i64 0, %x
   ret i64 %0
 }
+
+; rdar://12559385
+define i64 @f5(i32 %vi) {
+entry:
+; CHECK: f5:
+; CHECK: movw [[REG:r[0-9]+]], #36102
+; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]]
+    %v0 = zext i32 %vi to i64
+    %v1 = xor i64 %v0, -155057456198619
+    %v4 = add i64 %v1, 155057456198619
+    %v5 = add i64 %v4, %v1
+    ret i64 %v5
+}
diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll
index fb0f4c67c9271..3ba947579a3a2 100644
--- a/test/CodeGen/ARM/coalesce-subregs.ll
+++ b/test/CodeGen/ARM/coalesce-subregs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a9 -verify-coalescing -verify-machineinstrs | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios0.0.0"
 
@@ -66,3 +66,295 @@ do.end:                                           ; preds = %do.body
 
 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
 declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
+
+; CHECK: f3
+; This function has lane insertions that span basic blocks.
+; The trivial REG_SEQUENCE lowering can't handle that, but the coalescer can.
+;
+; void f3(float *p, float *q) {
+;   float32x2_t x;
+;   x[1] = p[3];
+;   if (q)
+;     x[0] = q[0] + q[1];
+;   else
+;     x[0] = p[2];
+;   vst1_f32(p+4, x);
+; }
+;
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+define void @f3(float* %p, float* %q) nounwind ssp {
+entry:
+  %arrayidx = getelementptr inbounds float* %p, i32 3
+  %0 = load float* %arrayidx, align 4
+  %vecins = insertelement <2 x float> undef, float %0, i32 1
+  %tobool = icmp eq float* %q, null
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load float* %q, align 4
+  %arrayidx2 = getelementptr inbounds float* %q, i32 1
+  %2 = load float* %arrayidx2, align 4
+  %add = fadd float %1, %2
+  %vecins3 = insertelement <2 x float> %vecins, float %add, i32 0
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx4 = getelementptr inbounds float* %p, i32 2
+  %3 = load float* %arrayidx4, align 4
+  %vecins5 = insertelement <2 x float> %vecins, float %3, i32 0
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi <2 x float> [ %vecins3, %if.then ], [ %vecins5, %if.else ]
+  %add.ptr = getelementptr inbounds float* %p, i32 4
+  %4 = bitcast float* %add.ptr to i8*
+  tail call void @llvm.arm.neon.vst1.v2f32(i8* %4, <2 x float> %x.0, i32 4)
+  ret void
+}
+
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind
+declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly
+
+; CHECK: f4
+; This function inserts a lane into a fully defined vector.
+; The destination lane isn't read, so the subregs can coalesce.
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+define void @f4(float* %p, float* %q) nounwind ssp {
+entry:
+  %0 = bitcast float* %p to i8*
+  %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %0, i32 4)
+  %tobool = icmp eq float* %q, null
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load float* %q, align 4
+  %arrayidx1 = getelementptr inbounds float* %q, i32 1
+  %2 = load float* %arrayidx1, align 4
+  %add = fadd float %1, %2
+  %vecins = insertelement <2 x float> %vld1, float %add, i32 1
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %x.0 = phi <2 x float> [ %vecins, %if.then ], [ %vld1, %entry ]
+  tail call void @llvm.arm.neon.vst1.v2f32(i8* %0, <2 x float> %x.0, i32 4)
+  ret void
+}
+
+; CHECK: f5
+; Coalesce vector lanes through phis.
+; CHECK: vmov.f32 {{.*}}, #1.0
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+; CHECK: %if.end
+; We may leave the last insertelement in the if.end block.
+; It is inserting the %add value into a dead lane, but %add causes interference
+; in the entry block, and we don't do dead lane checks across basic blocks.
+define void @f5(float* %p, float* %q) nounwind ssp {
+entry:
+  %0 = bitcast float* %p to i8*
+  %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %0, i32 4)
+  %vecext = extractelement <4 x float> %vld1, i32 0
+  %vecext1 = extractelement <4 x float> %vld1, i32 1
+  %vecext2 = extractelement <4 x float> %vld1, i32 2
+  %vecext3 = extractelement <4 x float> %vld1, i32 3
+  %add = fadd float %vecext3, 1.000000e+00
+  %tobool = icmp eq float* %q, null
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds float* %q, i32 1
+  %1 = load float* %arrayidx, align 4
+  %add4 = fadd float %vecext, %1
+  %2 = load float* %q, align 4
+  %add6 = fadd float %vecext1, %2
+  %arrayidx7 = getelementptr inbounds float* %q, i32 2
+  %3 = load float* %arrayidx7, align 4
+  %add8 = fadd float %vecext2, %3
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %a.0 = phi float [ %add4, %if.then ], [ %vecext, %entry ]
+  %b.0 = phi float [ %add6, %if.then ], [ %vecext1, %entry ]
+  %c.0 = phi float [ %add8, %if.then ], [ %vecext2, %entry ]
+  %vecinit = insertelement <4 x float> undef, float %a.0, i32 0
+  %vecinit9 = insertelement <4 x float> %vecinit, float %b.0, i32 1
+  %vecinit10 = insertelement <4 x float> %vecinit9, float %c.0, i32 2
+  %vecinit11 = insertelement <4 x float> %vecinit10, float %add, i32 3
+  tail call void @llvm.arm.neon.vst1.v4f32(i8* %0, <4 x float> %vecinit11, i32 4)
+  ret void
+}
+
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
+
+; CHECK: pr13999
+define void @pr13999() nounwind readonly {
+entry:
+ br i1 true, label %outer_loop, label %loop.end
+
+outer_loop:
+ %d = phi double [ 0.0, %entry ], [ %add, %after_inner_loop ]
+ %0 = insertelement <2 x double> <double 0.0, double 0.0>, double %d, i32 0
+ br i1 undef, label %after_inner_loop, label %inner_loop
+
+inner_loop:
+ br i1 true, label %after_inner_loop, label %inner_loop
+
+after_inner_loop:
+ %1 = phi <2 x double> [ %0, %outer_loop ], [ <double 0.0, double 0.0>,
+%inner_loop ]
+ %2 = extractelement <2 x double> %1, i32 1
+ %add = fadd double 1.0, %2
+ br i1 false, label %loop.end, label %outer_loop
+
+loop.end:
+ %d.end = phi double [ 0.0, %entry ], [ %add, %after_inner_loop ]
+ ret void
+}
+
+; CHECK: pr14078
+define arm_aapcs_vfpcc i32 @pr14078(i8* nocapture %arg, i8* nocapture %arg1, i32 %arg2) nounwind uwtable readonly {
+bb:
+  br i1 undef, label %bb31, label %bb3
+
+bb3:                                              ; preds = %bb12, %bb
+  %tmp = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+  %tmp4 = bitcast <1 x i64> %tmp to <2 x float>
+  %tmp5 = shufflevector <2 x float> %tmp4, <2 x float> undef, <4 x i32> zeroinitializer
+  %tmp6 = bitcast <4 x float> %tmp5 to <2 x i64>
+  %tmp7 = shufflevector <2 x i64> %tmp6, <2 x i64> undef, <1 x i32> zeroinitializer
+  %tmp8 = bitcast <1 x i64> %tmp7 to <2 x float>
+  %tmp9 = tail call <2 x float> @baz(<2 x float> <float 0xFFFFFFFFE0000000, float 0.000000e+00>, <2 x float> %tmp8, <2 x float> zeroinitializer) nounwind
+  br i1 undef, label %bb10, label %bb12
+
+bb10:                                             ; preds = %bb3
+  %tmp11 = load <4 x float>* undef, align 8
+  br label %bb12
+
+bb12:                                             ; preds = %bb10, %bb3
+  %tmp13 = shufflevector <2 x float> %tmp9, <2 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %tmp14 = bitcast <2 x float> %tmp13 to <1 x i64>
+  %tmp15 = shufflevector <1 x i64> %tmp14, <1 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  %tmp16 = bitcast <2 x i64> %tmp15 to <4 x float>
+  %tmp17 = fmul <4 x float> zeroinitializer, %tmp16
+  %tmp18 = bitcast <4 x float> %tmp17 to <2 x i64>
+  %tmp19 = shufflevector <2 x i64> %tmp18, <2 x i64> undef, <1 x i32> zeroinitializer
+  %tmp20 = bitcast <1 x i64> %tmp19 to <2 x float>
+  %tmp21 = tail call <2 x float> @baz67(<2 x float> %tmp20, <2 x float> undef) nounwind
+  %tmp22 = tail call <2 x float> @baz67(<2 x float> %tmp21, <2 x float> %tmp21) nounwind
+  %tmp23 = shufflevector <2 x float> %tmp22, <2 x float> undef, <4 x i32> zeroinitializer
+  %tmp24 = bitcast <4 x float> %tmp23 to <2 x i64>
+  %tmp25 = shufflevector <2 x i64> %tmp24, <2 x i64> undef, <1 x i32> zeroinitializer
+  %tmp26 = bitcast <1 x i64> %tmp25 to <2 x float>
+  %tmp27 = extractelement <2 x float> %tmp26, i32 0
+  %tmp28 = fcmp olt float %tmp27, 0.000000e+00
+  %tmp29 = select i1 %tmp28, i32 0, i32 undef
+  %tmp30 = icmp ult i32 undef, %arg2
+  br i1 %tmp30, label %bb3, label %bb31
+
+bb31:                                             ; preds = %bb12, %bb
+  %tmp32 = phi i32 [ 1, %bb ], [ %tmp29, %bb12 ]
+  ret i32 %tmp32
+}
+
+declare <2 x float> @baz(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+
+declare <2 x float> @baz67(<2 x float>, <2 x float>) nounwind readnone
+
+%struct.wombat.5 = type { %struct.quux, %struct.quux, %struct.quux, %struct.quux }
+%struct.quux = type { <4 x float> }
+
+; CHECK: pr14079
+define linkonce_odr arm_aapcs_vfpcc %struct.wombat.5 @pr14079(i8* nocapture %arg, i8* nocapture %arg1, i8* nocapture %arg2) nounwind uwtable inlinehint {
+bb:
+  %tmp = shufflevector <2 x i64> zeroinitializer, <2 x i64> undef, <1 x i32> zeroinitializer
+  %tmp3 = bitcast <1 x i64> %tmp to <2 x float>
+  %tmp4 = shufflevector <2 x float> %tmp3, <2 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %tmp5 = shufflevector <2 x float> %tmp4, <2 x float> undef, <2 x i32> <i32 1, i32 3>
+  %tmp6 = bitcast <2 x float> %tmp5 to <1 x i64>
+  %tmp7 = shufflevector <1 x i64> undef, <1 x i64> %tmp6, <2 x i32> <i32 0, i32 1>
+  %tmp8 = bitcast <2 x i64> %tmp7 to <4 x float>
+  %tmp9 = shufflevector <2 x i64> zeroinitializer, <2 x i64> undef, <1 x i32> <i32 1>
+  %tmp10 = bitcast <1 x i64> %tmp9 to <2 x float>
+  %tmp11 = shufflevector <2 x float> %tmp10, <2 x float> undef, <2 x i32> <i32 0, i32 2>
+  %tmp12 = shufflevector <2 x float> %tmp11, <2 x float> undef, <2 x i32> <i32 0, i32 2>
+  %tmp13 = bitcast <2 x float> %tmp12 to <1 x i64>
+  %tmp14 = shufflevector <1 x i64> %tmp13, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %tmp15 = bitcast <2 x i64> %tmp14 to <4 x float>
+  %tmp16 = insertvalue %struct.wombat.5 undef, <4 x float> %tmp8, 1, 0
+  %tmp17 = insertvalue %struct.wombat.5 %tmp16, <4 x float> %tmp15, 2, 0
+  %tmp18 = insertvalue %struct.wombat.5 %tmp17, <4 x float> undef, 3, 0
+  ret %struct.wombat.5 %tmp18
+}
+
+; CHECK: adjustCopiesBackFrom
+; The shuffle in if.else3 must be preserved even though adjustCopiesBackFrom
+; is tempted to remove it.
+; CHECK: %if.else3
+; CHECK: vorr d
+define internal void @adjustCopiesBackFrom(<2 x i64>* noalias nocapture sret %agg.result, <2 x i64> %in) {
+entry:
+  %0 = extractelement <2 x i64> %in, i32 0
+  %cmp = icmp slt i64 %0, 1
+  %.in = select i1 %cmp, <2 x i64> <i64 0, i64 undef>, <2 x i64> %in
+  %1 = extractelement <2 x i64> %in, i32 1
+  %cmp1 = icmp slt i64 %1, 1
+  br i1 %cmp1, label %if.then2, label %if.else3
+
+if.then2:                                         ; preds = %entry
+  %2 = insertelement <2 x i64> %.in, i64 0, i32 1
+  br label %if.end4
+
+if.else3:                                         ; preds = %entry
+  %3 = shufflevector <2 x i64> %.in, <2 x i64> %in, <2 x i32> <i32 0, i32 3>
+  br label %if.end4
+
+if.end4:                                          ; preds = %if.else3, %if.then2
+  %result.2 = phi <2 x i64> [ %2, %if.then2 ], [ %3, %if.else3 ]
+  store <2 x i64> %result.2, <2 x i64>* %agg.result, align 128
+  ret void
+}
+
+; <rdar://problem/12758887>
+; RegisterCoalescer::updateRegDefsUses() could visit an instruction more than
+; once under rare circumstances. When widening a register from QPR to DTriple
+; with the original virtual register in dsub_1_dsub_2, the double rewrite would
+; produce an invalid sub-register.
+;
+; This is because dsub_1_dsub_2 is not an idempotent sub-register index.
+; It will translate %vr:dsub_0 -> %vr:dsub_1.
+define hidden fastcc void @radar12758887() nounwind optsize ssp {
+entry:
+  br i1 undef, label %for.body, label %for.end70
+
+for.body:                                         ; preds = %for.end, %entry
+  br i1 undef, label %for.body29, label %for.end
+
+for.body29:                                       ; preds = %for.body29, %for.body
+  %0 = load <2 x double>* null, align 1
+  %splat40 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> zeroinitializer
+  %mul41 = fmul <2 x double> undef, %splat40
+  %add42 = fadd <2 x double> undef, %mul41
+  %splat44 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %mul45 = fmul <2 x double> undef, %splat44
+  %add46 = fadd <2 x double> undef, %mul45
+  br i1 undef, label %for.end, label %for.body29
+
+for.end:                                          ; preds = %for.body29, %for.body
+  %accumR2.0.lcssa = phi <2 x double> [ zeroinitializer, %for.body ], [ %add42, %for.body29 ]
+  %accumI2.0.lcssa = phi <2 x double> [ zeroinitializer, %for.body ], [ %add46, %for.body29 ]
+  %1 = shufflevector <2 x double> %accumI2.0.lcssa, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  %add58 = fadd <2 x double> undef, %1
+  %mul61 = fmul <2 x double> %add58, undef
+  %add63 = fadd <2 x double> undef, %mul61
+  %add64 = fadd <2 x double> undef, %add63
+  %add67 = fadd <2 x double> undef, %add64
+  store <2 x double> %add67, <2 x double>* undef, align 1
+  br i1 undef, label %for.end70, label %for.body
+
+for.end70:                                        ; preds = %for.end, %entry
+  ret void
+}
diff --git a/test/CodeGen/ARM/constants.ll b/test/CodeGen/ARM/constants.ll
index f4c1b5acef91e..3baa103e3d5df 100644
--- a/test/CodeGen/ARM/constants.ll
+++ b/test/CodeGen/ARM/constants.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv4t-unknown-linux-gnueabi -disable-cgp-branch-opts | FileCheck %s
+; RUN: llc < %s -mtriple=armv4t-unknown-linux-gnueabi -disable-cgp-branch-opts -verify-machineinstrs | FileCheck %s
 
 define i32 @f1() {
 ; CHECK: f1
@@ -45,6 +45,16 @@ r:
         ret void
 }
 
+define i32 @f8() nounwind {
+; Check that constant propagation through (i32)-1 => (float)NaN => (i32)-1
+; gives the expected result
+; CHECK: f8
+; CHECK: mvn r0, #0
+        %tmp0 = bitcast i32 -1 to float
+        %tmp1 = bitcast float %tmp0 to i32
+        ret i32 %tmp1
+}
+
 %t1 = type { <3 x float>, <3 x float> }
 
 @const1 = global %t1 { <3 x float> zeroinitializer,
diff --git a/test/CodeGen/ARM/crash-shufflevector.ll b/test/CodeGen/ARM/crash-shufflevector.ll
new file mode 100644
index 0000000000000..bdc0e0ea4db07
--- /dev/null
+++ b/test/CodeGen/ARM/crash-shufflevector.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=armv7
+
+declare void @g(<16 x i8>)
+define void @f(<4 x i8> %param1, <4 x i8> %param2) {
+   %y1 = shufflevector <4 x i8> %param1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+   %y2 = shufflevector <4 x i8> %param2, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+   %z = shufflevector <16 x i8> %y1, <16 x i8> %y2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+   call void @g(<16 x i8> %z)
+   ret void
+}
+\ No newline at end of file
diff --git a/test/CodeGen/ARM/darwin-section-order.ll b/test/CodeGen/ARM/darwin-section-order.ll
new file mode 100644
index 0000000000000..701028c0a5373
--- /dev/null
+++ b/test/CodeGen/ARM/darwin-section-order.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
+
+; CHECK: .section	__TEXT,__text,regular,pure_instructions
+; CHECK: .section	__TEXT,myprecious
+; CHECK: .section	__TEXT,__textcoal_nt,coalesced,pure_instructions
+; CHECK: .section	__TEXT,__const_coal,coalesced
+; CHECK: .section	__TEXT,__picsymbolstub4,symbol_stubs,none,16
+; CHECK: .section	__TEXT,__StaticInit,regular,pure_instructions
+
+
+define void @normal() nounwind readnone {
+; CHECK: .section	__TEXT,__text,regular,pure_instructions
+; CHECK: _normal:
+  ret void
+}
+
+define void @special() nounwind readnone section "__TEXT,myprecious" {
+; CHECK: .section	__TEXT,myprecious
+; CHECK: _special:
+  ret void
+}
diff --git a/test/CodeGen/ARM/deps-fix.ll b/test/CodeGen/ARM/deps-fix.ll
new file mode 100644
index 0000000000000..288697a4dc7fc
--- /dev/null
+++ b/test/CodeGen/ARM/deps-fix.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard -mtriple armv7-linux-gnueabi | FileCheck %s
+
+;; This test checks that the ExecutionDepsFix pass performs the domain changes
+;; even when some dependencies are propagated through implicit definitions.
+
+; CHECK: fun_a
+define <4 x float> @fun_a(<4 x float> %in, <4 x float> %x, float %y) nounwind {
+; CHECK: vext
+; CHECK: vext
+; CHECK: vadd.f32
+  %1 = insertelement <4 x float> %in, float %y, i32 0
+  %2 = fadd <4 x float> %1, %x  
+  ret <4 x float> %2
+}
+; CHECK: fun_b
+define <4 x i32> @fun_b(<4 x i32> %in, <4 x i32> %x, i32 %y) nounwind {
+; CHECK: vmov.32
+; CHECK: vadd.i32
+  %1 = insertelement <4 x i32> %in, i32 %y, i32 0
+  %2 = add <4 x i32> %1, %x  
+  ret <4 x i32> %2
+}
diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll
index 3d29e05a0ccf2..82cfca182b80f 100644
--- a/test/CodeGen/ARM/div.ll
+++ b/test/CodeGen/ARM/div.ll
@@ -1,9 +1,13 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=swift     | FileCheck %s -check-prefix=CHECK-SWIFT
 
 define i32 @f1(i32 %a, i32 %b) {
 entry:
 ; CHECK-ARM: f1
 ; CHECK-ARM: __divsi3
+
+; CHECK-SWIFT: f1
+; CHECK-SWIFT: sdiv
         %tmp1 = sdiv i32 %a, %b         ; <i32> [#uses=1]
         ret i32 %tmp1
 }
@@ -12,6 +16,9 @@ define i32 @f2(i32 %a, i32 %b) {
 entry:
 ; CHECK-ARM: f2
 ; CHECK-ARM: __udivsi3
+
+; CHECK-SWIFT: f2
+; CHECK-SWIFT: udiv
         %tmp1 = udiv i32 %a, %b         ; <i32> [#uses=1]
         ret i32 %tmp1
 }
@@ -20,6 +27,10 @@ define i32 @f3(i32 %a, i32 %b) {
 entry:
 ; CHECK-ARM: f3
 ; CHECK-ARM: __modsi3
+
+; CHECK-SWIFT: f3
+; CHECK-SWIFT: sdiv
+; CHECK-SWIFT: mls
         %tmp1 = srem i32 %a, %b         ; <i32> [#uses=1]
         ret i32 %tmp1
 }
@@ -28,6 +39,10 @@ define i32 @f4(i32 %a, i32 %b) {
 entry:
 ; CHECK-ARM: f4
 ; CHECK-ARM: __umodsi3
+
+; CHECK-SWIFT: f4
+; CHECK-SWIFT: udiv
+; CHECK-SWIFT: mls
         %tmp1 = urem i32 %a, %b         ; <i32> [#uses=1]
         ret i32 %tmp1
 }
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
index 7fbf8f4090366..577f8aa7d39b3 100644
--- a/test/CodeGen/ARM/divmod.ll
+++ b/test/CodeGen/ARM/divmod.ll
@@ -1,10 +1,18 @@
-; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=swift     | FileCheck %s -check-prefix=SWIFT
+
+; rdar://12481395
 
 define void @foo(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
 entry:
-; CHECK: foo:
-; CHECK: bl ___divmodsi4
-; CHECK-NOT: bl ___divmodsi4
+; A8: foo:
+; A8: bl ___divmodsi4
+; A8-NOT: bl ___divmodsi4
+
+; SWIFT: foo:
+; SWIFT: sdiv
+; SWIFT: mls
+; SWIFT-NOT: bl ___divmodsi4
   %div = sdiv i32 %x, %y
   store i32 %div, i32* %P, align 4
   %rem = srem i32 %x, %y
@@ -15,9 +23,14 @@ entry:
 
 define void @bar(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
 entry:
-; CHECK: bar:
-; CHECK: bl ___udivmodsi4
-; CHECK-NOT: bl ___udivmodsi4
+; A8: bar:
+; A8: bl ___udivmodsi4
+; A8-NOT: bl ___udivmodsi4
+
+; SWIFT: bar:
+; SWIFT: udiv
+; SWIFT: mls
+; SWIFT-NOT: bl ___udivmodsi4
   %div = udiv i32 %x, %y
   store i32 %div, i32* %P, align 4
   %rem = urem i32 %x, %y
@@ -32,14 +45,18 @@ entry:
 
 define void @do_indent(i32 %cols) nounwind {
 entry:
-; CHECK: do_indent:
+; A8: do_indent:
+; SWIFT: do_indent:
   %0 = load i32* @flags, align 4
   %1 = and i32 %0, 67108864
   %2 = icmp eq i32 %1, 0
   br i1 %2, label %bb1, label %bb
 
 bb:
-; CHECK: bl ___divmodsi4
+; A8: bl ___divmodsi4
+; SWIFT: sdiv
+; SWIFT: mls
+; SWIFT-NOT: bl ___divmodsi4
   %3 = load i32* @tabsize, align 4
   %4 = srem i32 %cols, %3
   %5 = sdiv i32 %cols, %3
@@ -60,9 +77,14 @@ declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind
 ; rdar://11714607
 define i32 @howmany(i32 %x, i32 %y) nounwind {
 entry:
-; CHECK: howmany:
-; CHECK: bl ___udivmodsi4
-; CHECK-NOT: ___udivsi3
+; A8: howmany:
+; A8: bl ___udivmodsi4
+; A8-NOT: ___udivsi3
+
+; SWIFT: howmany:
+; SWIFT: udiv
+; SWIFT: mls
+; SWIFT-NOT: bl ___udivmodsi4
   %rem = urem i32 %x, %y
   %div = udiv i32 %x, %y
   %not.cmp = icmp ne i32 %rem, 0
diff --git a/test/CodeGen/ARM/domain-conv-vmovs.ll b/test/CodeGen/ARM/domain-conv-vmovs.ll
new file mode 100644
index 0000000000000..a5c41144584c4
--- /dev/null
+++ b/test/CodeGen/ARM/domain-conv-vmovs.ll
@@ -0,0 +1,100 @@
+; RUN: llc -verify-machineinstrs -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard < %s | FileCheck %s
+
+define <2 x float> @test_vmovs_via_vext_lane0to0(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to0:
+  %vec = insertelement <2 x float> %in, float %arg, i32 0
+  %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+  ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane0to1(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to1:
+  %vec = insertelement <2 x float> %in, float %arg, i32 1
+  %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+  ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to0(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to0:
+  %vec = insertelement <2 x float> %in, float %arg, i32 0
+  %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+  ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to1(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to1:
+  %vec = insertelement <2 x float> %in, float %arg, i32 1
+  %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+  ret <2 x float> %res
+}
+
+
+define float @test_vmovs_via_vdup(float, float %ret, float %lhs, float %rhs) {
+; CHECK: test_vmovs_via_vdup:
+
+  ; Do an operation (which will end up NEON because of +neonfp) to convince the
+  ; execution-domain pass that NEON is a good thing to use.
+  %res = fadd float %ret, %ret
+  ;  It makes sense for LLVM to do the addition in d0 here, because it's going
+  ;  to be returned. This means it will want a "vmov s0, s1":
+; CHECK: vdup.32 d0, d0[1]
+
+  ret float %res
+}
+
+declare float @llvm.sqrt.f32(float)
+
+declare void @bar()
+
+; This is primarily a compile test: it used to cause an internal fault.
+define float @test_ineligible(float, float %in) {
+; CHECK: test_ineligible:
+
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %val = fadd float %sqrt, %sqrt
+
+  ; This call forces a move from a callee-saved register to the return-reg. That
+  ; move is not eligible for conversion to a d-register instruction because the
+  ; use-def chains would be messed up. Primarily a compile-test (we used to
+  ; hit an internal fault).
+  call void @bar()
+; CHECK: bl bar
+; CHECK: vext.32
+; CHECK: vext.32
+  ret float %val
+}
+
+define i32 @test_vmovs_no_sreg(i32 %in) {
+; CHECK: test_vmovs_no_sreg:
+
+  ; Check that the movement to and from GPRs takes place in the NEON domain.
+; CHECK: vmov.32 d
+  %x = bitcast i32 %in to float
+
+  %res = fadd float %x, %x
+
+; CHECK: vmov.32 r{{[0-9]+}}, d
+  %resi = bitcast float %res to i32
+
+  ret i32 %resi
+}
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
index bcb4ee745234a..46c2f1c65fe5c 100644
--- a/test/CodeGen/ARM/fabss.ll
+++ b/test/CodeGen/ARM/fabss.ll
@@ -14,12 +14,12 @@ entry:
 declare float @fabsf(float)
 
 ; VFP2: test:
-; VFP2: 	vabs.f32	s1, s1
+; VFP2: 	vabs.f32	s2, s2
 
 ; NFP1: test:
 ; NFP1: 	vabs.f32	d1, d1
 ; NFP0: test:
-; NFP0: 	vabs.f32	s1, s1
+; NFP0: 	vabs.f32	s2, s2
 
 ; CORTEXA8: test:
 ; CORTEXA8:     vadd.f32        [[D1:d[0-9]+]]
diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll
index e35103c045eb8..48ef5ed88fb00 100644
--- a/test/CodeGen/ARM/fadds.ll
+++ b/test/CodeGen/ARM/fadds.ll
@@ -10,14 +10,14 @@ entry:
 }
 
 ; VFP2: test:
-; VFP2: 	vadd.f32	s0, s1, s0
+; VFP2: 	vadd.f32	s
 
 ; NFP1: test:
-; NFP1: 	vadd.f32	d0, d1, d0
+; NFP1: 	vadd.f32	d
 ; NFP0: test:
-; NFP0: 	vadd.f32	s0, s1, s0
+; NFP0: 	vadd.f32	s
 
 ; CORTEXA8: test:
-; CORTEXA8: 	vadd.f32	d0, d1, d0
+; CORTEXA8: 	vadd.f32	d
 ; CORTEXA9: test:
 ; CORTEXA9: 	vadd.f32	s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fast-isel-pic.ll b/test/CodeGen/ARM/fast-isel-pic.ll
new file mode 100644
index 0000000000000..867d53f973db4
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-pic.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=arm-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARMv7
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-none-linux-gnueabi | FileCheck %s --check-prefix=THUMB-ELF
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-none-linux-gnueabi | FileCheck %s --check-prefix=ARMv7-ELF
+
+@g = global i32 0, align 4
+
+define i32 @LoadGV() {
+entry:
+; THUMB: LoadGV
+; THUMB: movw [[reg0:r[0-9]+]],
+; THUMB: movt [[reg0]],
+; THUMB: add  [[reg0]], pc
+; THUMB-ELF: LoadGV
+; THUMB-ELF: ldr.n r[[reg0:[0-9]+]],
+; THUMB-ELF: ldr.n r[[reg1:[0-9]+]],
+; THUMB-ELF: ldr r[[reg0]], [r[[reg1]], r[[reg0]]]
+; ARM: LoadGV
+; ARM: ldr [[reg1:r[0-9]+]],
+; ARM: add [[reg1]], pc, [[reg1]]
+; ARMv7: LoadGV
+; ARMv7: movw [[reg2:r[0-9]+]],
+; ARMv7: movt [[reg2]],
+; ARMv7: add  [[reg2]], pc, [[reg2]]
+; ARMv7-ELF: LoadGV
+; ARMv7-ELF: ldr r[[reg2:[0-9]+]],
+; ARMv7-ELF: ldr r[[reg3:[0-9]+]],
+; ARMv7-ELF: ldr r[[reg2]], [r[[reg3]], r[[reg2]]]
+  %tmp = load i32* @g
+  ret i32 %tmp
+}
+
+@i = external global i32
+
+define i32 @LoadIndirectSymbol() {
+entry:
+; THUMB: LoadIndirectSymbol
+; THUMB: movw r[[reg3:[0-9]+]],
+; THUMB: movt r[[reg3]],
+; THUMB: add  r[[reg3]], pc
+; THUMB: ldr  r[[reg3]], [r[[reg3]]]
+; THUMB-ELF: LoadIndirectSymbol
+; THUMB-ELF: ldr.n r[[reg3:[0-9]+]],
+; THUMB-ELF: ldr.n r[[reg4:[0-9]+]],
+; THUMB-ELF: ldr r[[reg3]], [r[[reg4]], r[[reg3]]]
+; ARM: LoadIndirectSymbol
+; ARM: ldr [[reg4:r[0-9]+]],
+; ARM: ldr [[reg4]], [pc, [[reg4]]]
+; ARMv7: LoadIndirectSymbol
+; ARMv7: movw r[[reg5:[0-9]+]],
+; ARMv7: movt r[[reg5]],
+; ARMv7: add  r[[reg5]], pc, r[[reg5]]
+; ARMv7: ldr  r[[reg5]], [r[[reg5]]]
+; ARMv7-ELF: LoadIndirectSymbol
+; ARMv7-ELF: ldr r[[reg5:[0-9]+]],
+; ARMv7-ELF: ldr r[[reg6:[0-9]+]],
+; ARMv7-ELF: ldr r[[reg5]], [r[[reg6]], r[[reg5]]]
+  %tmp = load i32* @i
+  ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index ecd5fe27a4b7a..41fda41326326 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
 
 ; Very basic fast-isel functionality.
 define i32 @add(i32 %a, i32 %b) nounwind {
@@ -238,3 +240,67 @@ entry:
 }
 
 declare void @llvm.trap() nounwind
+
+define void @unaligned_i16_store(i16 %x, i16* %y) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i16_store
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+
+; THUMB-STRICT-ALIGN: @unaligned_i16_store
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+
+  store i16 %x, i16* %y, align 1
+  ret void
+}
+
+define i16 @unaligned_i16_load(i16* %x) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i16_load
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+
+; THUMB-STRICT-ALIGN: @unaligned_i16_load
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+
+  %0 = load i16* %x, align 1
+  ret i16 %0
+}
+
+define void @unaligned_i32_store(i32 %x, i32* %y) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i32_store
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+
+; THUMB-STRICT-ALIGN: @unaligned_i32_store
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+
+  store i32 %x, i32* %y, align 1
+  ret void
+}
+
+define i32 @unaligned_i32_load(i32* %x) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i32_load
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+
+; THUMB-STRICT-ALIGN: @unaligned_i32_load
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+
+  %0 = load i32* %x, align 1
+  ret i32 %0
+}
diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll
index 31c1ca940502f..8fab00213585f 100644
--- a/test/CodeGen/ARM/fdivs.ll
+++ b/test/CodeGen/ARM/fdivs.ll
@@ -10,14 +10,14 @@ entry:
 }
 
 ; VFP2: test:
-; VFP2: 	vdiv.f32	s0, s1, s0
+; VFP2: 	vdiv.f32	s0, s2, s0
 
 ; NFP1: test:
-; NFP1: 	vdiv.f32	s0, s1, s0
+; NFP1: 	vdiv.f32	s0, s2, s0
 ; NFP0: test:
-; NFP0: 	vdiv.f32	s0, s1, s0
+; NFP0: 	vdiv.f32	s0, s2, s0
 
 ; CORTEXA8: test:
-; CORTEXA8: 	vdiv.f32	s0, s1, s0
+; CORTEXA8: 	vdiv.f32	s0, s2, s0
 ; CORTEXA9: test:
 ; CORTEXA9: 	vdiv.f32	s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index 3c3182bc6341d..1566a9272db15 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -10,15 +10,15 @@ entry:
 }
 
 ; VFP2: test:
-; VFP2: 	vmul.f32	s0, s1, s0
+; VFP2: 	vmul.f32	s
 
 ; NFP1: test:
-; NFP1: 	vmul.f32	d0, d1, d0
+; NFP1: 	vmul.f32	d
 ; NFP0: test:
-; NFP0: 	vmul.f32	s0, s1, s0
+; NFP0: 	vmul.f32	s
 
 ; CORTEXA8: test:
-; CORTEXA8: 	vmul.f32	d0, d1, d0
+; CORTEXA8: 	vmul.f32	d
 ; CORTEXA9: test:
 ; CORTEXA9: 	vmul.f32	s{{.}}, s{{.}}, s{{.}}
 
diff --git a/test/CodeGen/ARM/fp-fast.ll b/test/CodeGen/ARM/fp-fast.ll
new file mode 100644
index 0000000000000..ec57187381776
--- /dev/null
+++ b/test/CodeGen/ARM/fp-fast.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=arm -mcpu=cortex-a9 -mattr=+vfp4 -enable-unsafe-fp-math < %s | FileCheck %s
+
+; CHECK: test1
+define float @test1(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+  %t1 = fmul float %x, 3.0
+  %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %t1)
+  ret float %t2
+}
+
+; CHECK: test2
+define float @test2(float %x, float %y) {
+; CHECK-NOT: vmul
+; CHECK: vfma.f32
+; CHECK-NOT: vmul
+  %t1 = fmul float %x, 3.0
+  %t2 = call float @llvm.fma.f32(float %t1, float 2.0, float %y)
+  ret float %t2
+}
+
+; CHECK: test3
+define float @test3(float %x, float %y) {
+; CHECK-NOT: vfma
+; CHECK: vadd.f32
+; CHECK-NOT: vfma
+  %t2 = call float @llvm.fma.f32(float %x, float 1.0, float %y)
+  ret float %t2
+}
+
+; CHECK: test4
+define float @test4(float %x, float %y) {
+; CHECK-NOT: vfma
+; CHECK: vsub.f32
+; CHECK-NOT: vfma
+  %t2 = call float @llvm.fma.f32(float %x, float -1.0, float %y)
+  ret float %t2
+}
+
+; CHECK: test5
+define float @test5(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+  %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %x)
+  ret float %t2
+}
+
+; CHECK: test6
+define float @test6(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+  %t1 = fsub float -0.0, %x
+  %t2 = call float @llvm.fma.f32(float %x, float 5.0, float %t1)
+  ret float %t2
+}
+
+declare float @llvm.fma.f32(float, float, float)
diff --git a/test/CodeGen/ARM/fp_convert.ll b/test/CodeGen/ARM/fp_convert.ll
index 7002cecf36401..44298b9c5d8da 100644
--- a/test/CodeGen/ARM/fp_convert.ll
+++ b/test/CodeGen/ARM/fp_convert.ll
@@ -31,7 +31,7 @@ define float @test3(i32 %a, i32 %b) {
 ; VFP2: test3:
 ; VFP2: vcvt.f32.u32 s{{.}}, s{{.}}
 ; NEON: test3:
-; NEON: vcvt.f32.u32 d0, d0
+; NEON: vcvt.f32.u32 d
 entry:
         %0 = add i32 %a, %b
         %1 = uitofp i32 %0 to float
@@ -42,7 +42,7 @@ define float @test4(i32 %a, i32 %b) {
 ; VFP2: test4:
 ; VFP2: vcvt.f32.s32 s{{.}}, s{{.}}
 ; NEON: test4:
-; NEON: vcvt.f32.s32 d0, d0
+; NEON: vcvt.f32.s32 d
 entry:
         %0 = add i32 %a, %b
         %1 = sitofp i32 %0 to float
diff --git a/test/CodeGen/ARM/fsubs.ll b/test/CodeGen/ARM/fsubs.ll
index bea8d5f4f30bc..f039e74c8ee6c 100644
--- a/test/CodeGen/ARM/fsubs.ll
+++ b/test/CodeGen/ARM/fsubs.ll
@@ -8,6 +8,6 @@ entry:
 	ret float %0
 }
 
-; VFP2: vsub.f32	s0, s1, s0
-; NFP1: vsub.f32	d0, d1, d0
-; NFP0: vsub.f32	s0, s1, s0
+; VFP2: vsub.f32	s
+; NFP1: vsub.f32	d
+; NFP0: vsub.f32	s
diff --git a/test/CodeGen/ARM/ifcvt1.ll b/test/CodeGen/ARM/ifcvt1.ll
index cd870bb5d4b28..fd831442c14bf 100644
--- a/test/CodeGen/ARM/ifcvt1.ll
+++ b/test/CodeGen/ARM/ifcvt1.ll
@@ -1,17 +1,21 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -march=arm -mcpu=swift     | FileCheck %s -check-prefix=SWIFT
 
 define i32 @t1(i32 %a, i32 %b) {
-; CHECK: t1:
+; A8: t1:
+; SWIFT: t1:
 	%tmp2 = icmp eq i32 %a, 0
 	br i1 %tmp2, label %cond_false, label %cond_true
 
 cond_true:
-; CHECK: subeq r0, r1, #1
+; A8: subeq r0, r1, #1
+; SWIFT: sub r0, r1, #1
 	%tmp5 = add i32 %b, 1
 	ret i32 %tmp5
 
 cond_false:
-; CHECK: addne r0, r1, #1
+; A8: addne r0, r1, #1
+; SWIFT: addne r0, r1, #1
 	%tmp7 = add i32 %b, -1
 	ret i32 %tmp7
 }
diff --git a/test/CodeGen/ARM/ifcvt12.ll b/test/CodeGen/ARM/ifcvt12.ll
new file mode 100644
index 0000000000000..77bdca57e555e
--- /dev/null
+++ b/test/CodeGen/ARM/ifcvt12.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin -mcpu=cortex-a8 | FileCheck %s
+define i32 @f1(i32 %a, i32 %b, i32 %c) {
+; CHECK: f1:
+; CHECK: mlsne r0, r0, r1, r2
+    %tmp1 = icmp eq i32 %a, 0
+    br i1 %tmp1, label %cond_false, label %cond_true
+
+cond_true:
+    %tmp2 = mul i32 %a, %b
+    %tmp3 = sub i32 %c, %tmp2
+    ret i32 %tmp3
+
+cond_false:
+    ret i32 %a
+}
diff --git a/test/CodeGen/ARM/ifcvt5.ll b/test/CodeGen/ARM/ifcvt5.ll
index 95f5c97f2a9ae..5081791bc257f 100644
--- a/test/CodeGen/ARM/ifcvt5.ll
+++ b/test/CodeGen/ARM/ifcvt5.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift     | FileCheck %s -check-prefix=SWIFT
+; rdar://8402126
 
 @x = external global i32*		; <i32**> [#uses=1]
 
@@ -10,8 +12,12 @@ entry:
 }
 
 define i32 @t1(i32 %a, i32 %b) {
-; CHECK: t1:
-; CHECK: poplt {r7, pc}
+; A8: t1:
+; A8: poplt {r7, pc}
+
+; SWIFT: t1:
+; SWIFT: pop {r7, pc}
+; SWIFT: pop {r7, pc}
 entry:
 	%tmp1 = icmp sgt i32 %a, 10		; <i1> [#uses=1]
 	br i1 %tmp1, label %cond_true, label %UnifiedReturnBlock
diff --git a/test/CodeGen/ARM/indirectbr-2.ll b/test/CodeGen/ARM/indirectbr-2.ll
new file mode 100644
index 0000000000000..084f520a8ee57
--- /dev/null
+++ b/test/CodeGen/ARM/indirectbr-2.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -O0 -relocation-model=pic -mtriple=thumbv7-apple-ios | FileCheck %s
+; <rdar://problem/12529625>
+
+@foo = global i32 34879, align 4
+@DWJumpTable2808 = global [2 x i32] [i32 sub (i32 ptrtoint (i8* blockaddress(@func, %14) to i32), i32 ptrtoint (i8* blockaddress(@func, %4) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@func, %13) to i32), i32 ptrtoint (i8* blockaddress(@func, %4) to i32))]
+@0 = internal constant [45 x i8] c"func XXXXXXXXXXX :: bb xxxxxxxxxxxxxxxxxxxx\0A\00"
+
+; The indirect branch has the two destinations as successors. The lone PHI
+; statement shouldn't be implicitly defined.
+
+; CHECK:      func:
+; CHECK:      Ltmp1:    @ Block address taken
+; CHECK-NOT:            @ implicit-def: R0
+; CHECK:                @ 4-byte Reload
+
+define i32 @func() nounwind ssp {
+  %1 = alloca i32, align 4
+  %2 = load i32* @foo, align 4
+  %3 = icmp eq i32 %2, 34879
+  br label %4
+
+; <label>:4                                       ; preds = %0
+  %5 = zext i1 %3 to i32
+  %6 = mul i32 %5, 287
+  %7 = add i32 %6, 2
+  %8 = getelementptr [2 x i32]* @DWJumpTable2808, i32 0, i32 %5
+  %9 = load i32* %8
+  %10 = add i32 %9, ptrtoint (i8* blockaddress(@func, %4) to i32)
+  %11 = inttoptr i32 %10 to i8*
+  %12 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @0, i32 0, i32 0))
+  indirectbr i8* %11, [label %13, label %14]
+
+; <label>:13                                      ; preds = %4
+  %tmp14 = phi i32 [ %7, %4 ]
+  store i32 23958, i32* @foo, align 4
+  %tmp15 = load i32* %1, align 4
+  %tmp16 = icmp eq i32 %tmp15, 0
+  %tmp17 = zext i1 %tmp16 to i32
+  %tmp21 = add i32 %tmp17, %tmp14
+  ret i32 %tmp21
+
+; <label>:14                                      ; preds = %4
+  ret i32 42
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll
new file mode 100644
index 0000000000000..1d72afefb5b8d
--- /dev/null
+++ b/test/CodeGen/ARM/integer_insertelement.ll
@@ -0,0 +1,35 @@
+; RUN: llc %s -o - -march=arm -mattr=+neon | FileCheck %s
+
+; This test checks that when inserting one (integer) element into a vector,
+; the vector is not spuriously copied. "vorr dX, dY, dY" is the way of moving
+; one DPR to another that we check for.
+
+; CHECK: @f
+; CHECK-NOT: vorr d
+; CHECK: vmov.32 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <4 x i32> @f(<4 x i32> %in) {
+  %1 = insertelement <4 x i32> %in, i32 255, i32 3
+  ret <4 x i32> %1
+}
+
+; CHECK: @g
+; CHECK-NOT: vorr d
+; CHECK: vmov.16 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <8 x i16> @g(<8 x i16> %in) {
+  %1 = insertelement <8 x i16> %in, i16 255, i32 7
+  ret <8 x i16> %1
+}
+
+; CHECK: @h
+; CHECK-NOT: vorr d
+; CHECK: vmov.8 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <16 x i8> @h(<16 x i8> %in) {
+  %1 = insertelement <16 x i8> %in, i8 255, i32 15
+  ret <16 x i8> %1
+}
diff --git a/test/CodeGen/ARM/ldr_post.ll b/test/CodeGen/ARM/ldr_post.ll
index 8ddf025dbf1b5..a6ca434483801 100644
--- a/test/CodeGen/ARM/ldr_post.ll
+++ b/test/CodeGen/ARM/ldr_post.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
 
 ; CHECK: test1:
 ; CHECK: ldr {{.*, \[.*]}}, -r2
diff --git a/test/CodeGen/ARM/ldr_pre.ll b/test/CodeGen/ARM/ldr_pre.ll
index e904e5fd2cdb5..6c40ad7326b69 100644
--- a/test/CodeGen/ARM/ldr_pre.ll
+++ b/test/CodeGen/ARM/ldr_pre.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
 
 ; CHECK: test1:
 ; CHECK: ldr {{.*!}}
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
new file mode 100644
index 0000000000000..e4a00e9ac303d
--- /dev/null
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+; Check generated signed and unsigned multiply accumulate long.
+
+define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest1:
+;CHECK: umlal
+  %conv = zext i32 %a to i64
+  %conv1 = zext i32 %b to i64
+  %mul = mul i64 %conv1, %conv
+  %add = add i64 %mul, %c
+  ret i64 %add
+}
+
+define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c)  {
+;CHECK: MACLongTest2:
+;CHECK: smlal
+  %conv = sext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, %c
+  ret i64 %add
+}
+
+define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest3:
+;CHECK: umlal
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %a to i64
+  %mul = mul i64 %conv, %conv1
+  %conv2 = zext i32 %c to i64
+  %add = add i64 %mul, %conv2
+  ret i64 %add
+}
+
+define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest4:
+;CHECK: smlal
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %a to i64
+  %mul = mul nsw i64 %conv, %conv1
+  %conv2 = sext i32 %c to i64
+  %add = add nsw i64 %mul, %conv2
+  ret i64 %add
+}
diff --git a/test/CodeGen/ARM/mls.ll b/test/CodeGen/ARM/mls.ll
index a6cdba4454516..066bf98de651a 100644
--- a/test/CodeGen/ARM/mls.ll
+++ b/test/CodeGen/ARM/mls.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+v6t2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
 
 define i32 @f1(i32 %a, i32 %b, i32 %c) {
     %tmp1 = mul i32 %a, %b
@@ -13,4 +14,15 @@ define i32 @f2(i32 %a, i32 %b, i32 %c) {
     ret i32 %tmp2
 }
 
+; CHECK: f1:
 ; CHECK: mls	r0, r0, r1, r2
+; NO_MULOPS: f1:
+; NO_MULOPS: mul r0, r0, r1
+; NO_MULOPS-NEXT: sub r0, r2, r0
+
+; CHECK: f2:
+; CHECK: mul r0, r0, r1
+; CHECK-NEXT: sub r0, r0, r2
+; NO_MULOPS: f2:
+; NO_MULOPS: mul r0, r0, r1
+; NO_MULOPS-NEXT: sub r0, r0, r2
diff --git a/test/CodeGen/ARM/neon-fma.ll b/test/CodeGen/ARM/neon-fma.ll
new file mode 100644
index 0000000000000..d2cca5009d6b7
--- /dev/null
+++ b/test/CodeGen/ARM/neon-fma.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -mcpu=swift | FileCheck %s
+
+; CHECK: test_v2f32
+; CHECK: vfma.f32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+
+define <2 x float> @test_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+entry:
+  %call = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone
+  ret <2 x float> %call
+}
+
+; CHECK: test_v4f32
+; CHECK: vfma.f32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
+entry:
+  %call = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone
+  ret <4 x float> %call
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll
index 944bfe060298f..497619ed746a6 100644
--- a/test/CodeGen/ARM/neon_ld2.ll
+++ b/test/CodeGen/ARM/neon_ld2.ll
@@ -1,10 +1,16 @@
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s --check-prefix=SWIFT
 
 ; CHECK: t1
-; CHECK: vldmia
-; CHECK: vldmia
+; CHECK: vld1.64
+; CHECK: vld1.64
 ; CHECK: vadd.i64 q
-; CHECK: vstmia
+; CHECK: vst1.64
+; SWIFT: t1
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vadd.i64 q
+; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
 define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
 entry:
 	%0 = load <2 x i64>* %a, align 16		; <<2 x i64>> [#uses=1]
@@ -16,11 +22,17 @@ entry:
 }
 
 ; CHECK: t2
-; CHECK: vldmia
-; CHECK: vldmia
+; CHECK: vld1.64
+; CHECK: vld1.64
 ; CHECK: vsub.i64 q
 ; CHECK: vmov r0, r1, d
 ; CHECK: vmov r2, r3, d
+; SWIFT: t2
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vsub.i64 q
+; SWIFT: vmov r0, r1, d
+; SWIFT: vmov r2, r3, d
 define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly {
 entry:
 	%0 = load <2 x i64>* %a, align 16		; <<2 x i64>> [#uses=1]
@@ -30,3 +42,18 @@ entry:
 	ret <4 x i32> %3
 }
 
+; Limited alignment.
+; SWIFT: t3
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+; SWIFT: vadd.i64 q
+; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+define void @t3(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+entry:
+	%0 = load <2 x i64>* %a, align 8
+	%1 = load <2 x i64>* %b, align 8
+	%2 = add <2 x i64> %0, %1
+	%3 = bitcast <2 x i64> %2 to <4 x i32>
+	store <4 x i32> %3, <4 x i32>* %r, align 8
+	ret void
+}
diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll
index df98e231ccfd2..74c9a21355d71 100644
--- a/test/CodeGen/ARM/opt-shuff-tstore.ll
+++ b/test/CodeGen/ARM/opt-shuff-tstore.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: func_4_8
 ; CHECK: vst1.32
-; CHECK-NEXT: bx lr
+; CHECK: bx lr
 define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
   %r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
   store <4 x i8> %r, <4 x i8>* %p
@@ -11,7 +11,7 @@ define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
 
 ; CHECK: func_2_16
 ; CHECK: vst1.32
-; CHECK-NEXT: bx lr
+; CHECK: bx lr
 define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) {
   %r = add <2 x i16> %param, <i16 1, i16 2>
   store <2 x i16> %r, <2 x i16>* %p
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 05794e4ebddb1..6d6586e4f2839 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s
 ; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's.
 
 %struct.int16x8_t = type { <8 x i16> }
@@ -124,7 +124,6 @@ return1:
 return2:
 ; CHECK:        %return2
 ; CHECK:        vadd.i32
-; CHECK:        vorr {{q[0-9]+}}, {{q[0-9]+}}
 ; CHECK-NOT:    vmov
 ; CHECK:        vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
   %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
@@ -137,7 +136,7 @@ return2:
 
 define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
 ; CHECK:        t5:
-; CHECK:        vldmia
+; CHECK:        vld1.32
 ; How can FileCheck match Q and D registers? We need a lisp interpreter.
 ; CHECK:        vorr {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
 ; CHECK-NOT:    vmov
@@ -243,8 +242,8 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
 ; CHECK:        vldr
 ; CHECK-NOT:    vmov d{{.*}}, d16
 ; CHECK:        vmov.i32 d17
-; CHECK-NEXT:   vstmia r0, {d16, d17}
-; CHECK-NEXT:   vstmia r0, {d16, d17}
+; CHECK-NEXT:   vst1.64 {d16, d17}, [r0, :128]
+; CHECK-NEXT:   vst1.64 {d16, d17}, [r0, :128]
   %3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
   %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
   store <4 x float> %4, <4 x float>* undef, align 16
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 5575566628921..62708ed53d059 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -80,7 +80,7 @@ define double @f7(double %a, double %b) {
 ; CHECK-NEON:      adr     [[R2:r[0-9]+]], LCPI7_0
 ; CHECK-NEON-NEXT: cmp     r0, [[R3]]
 ; CHECK-NEON-NEXT: it      eq
-; CHECK-NEON-NEXT: addeq.w {{r.*}}, [[R2]]
+; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4
 ; CHECK-NEON-NEXT: ldr
 ; CHECK-NEON:      bx
 
diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll
index 26f7cb68901fb..7507808912611 100644
--- a/test/CodeGen/ARM/select_xform.ll
+++ b/test/CodeGen/ARM/select_xform.ll
@@ -9,7 +9,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
 
 ; T2: t1:
 ; T2: mvn r0, #-2147483648
-; T2: addle.w r1, r1
+; T2: addle r1, r0
 ; T2: mov r0, r1
   %tmp1 = icmp sgt i32 %c, 10
   %tmp2 = select i1 %tmp1, i32 0, i32 2147483647
@@ -23,7 +23,7 @@ define i32 @t2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
 ; ARM: mov r0, r1
 
 ; T2: t2:
-; T2: suble.w r1, r1, #10
+; T2: suble r1, #10
 ; T2: mov r0, r1
   %tmp1 = icmp sgt i32 %c, 10
   %tmp2 = select i1 %tmp1, i32 0, i32 10
@@ -33,12 +33,12 @@ define i32 @t2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
 
 define i32 @t3(i32 %a, i32 %b, i32 %x, i32 %y) nounwind {
 ; ARM: t3:
-; ARM: mvnlt r2, #0
-; ARM: and r0, r2, r3
+; ARM: andge r3, r3, r2
+; ARM: mov r0, r3
 
 ; T2: t3:
-; T2: movlt.w r2, #-1
-; T2: and.w r0, r2, r3
+; T2: andge r3, r2
+; T2: mov r0, r3
   %cond = icmp slt i32 %a, %b
   %z = select i1 %cond, i32 -1, i32 %x
   %s = and i32 %z, %y
@@ -47,12 +47,12 @@ define i32 @t3(i32 %a, i32 %b, i32 %x, i32 %y) nounwind {
 
 define i32 @t4(i32 %a, i32 %b, i32 %x, i32 %y) nounwind {
 ; ARM: t4:
-; ARM: movlt r2, #0
-; ARM: orr r0, r2, r3
+; ARM: orrge r3, r3, r2
+; ARM: mov r0, r3
 
 ; T2: t4:
-; T2: movlt r2, #0
-; T2: orr.w r0, r2, r3
+; T2: orrge r3, r2
+; T2: mov r0, r3
   %cond = icmp slt i32 %a, %b
   %z = select i1 %cond, i32 0, i32 %x
   %s = or i32 %z, %y
@@ -81,7 +81,7 @@ define i32 @t6(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
 
 ; T2: t6:
 ; T2-NOT: movge
-; T2: eorlt.w r3, r3, r2
+; T2: eorlt r3, r2
   %cond = icmp slt i32 %a, %b
   %tmp1 = select i1 %cond, i32 %c, i32 0
   %tmp2 = xor i32 %tmp1, %d
@@ -179,3 +179,46 @@ define i32 @t12(i32 %a, i32 %b) nounwind {
   %tmp1 = select i1 %cond, i32 %a, i32 %x
   ret i32 %tmp1
 }
+
+; Handle frame index operands.
+define void @pr13628() nounwind uwtable align 2 {
+  %x3 = alloca i8, i32 256, align 8
+  %x4 = load i8* undef, align 1
+  %x5 = icmp ne i8 %x4, 0
+  %x6 = select i1 %x5, i8* %x3, i8* null
+  call void @bar(i8* %x6) nounwind
+  ret void
+}
+declare void @bar(i8*)
+
+; Fold zext i1 into predicated add
+define i32 @t13(i32 %c, i32 %a) nounwind readnone ssp {
+entry:
+; ARM: t13
+; ARM: cmp r1, #10
+; ARM: addgt r0, r0, #1
+
+; T2: t13
+; T2: cmp r1, #10
+; T2: addgt r0, #1
+  %cmp = icmp sgt i32 %a, 10
+  %conv = zext i1 %cmp to i32
+  %add = add i32 %conv, %c
+  ret i32 %add
+}
+
+; Fold sext i1 into predicated sub
+define i32 @t14(i32 %c, i32 %a) nounwind readnone ssp {
+entry:
+; ARM: t14
+; ARM: cmp r1, #10
+; ARM: subgt r0, r0, #1
+
+; T2: t14
+; T2: cmp r1, #10
+; T2: subgt r0, #1
+  %cmp = icmp sgt i32 %a, 10
+  %conv = sext i1 %cmp to i32
+  %add = add i32 %conv, %c
+  ret i32 %add
+}
diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll
index 99ba475ad7b1a..e9541c2788039 100644
--- a/test/CodeGen/ARM/struct_byval.ll
+++ b/test/CodeGen/ARM/struct_byval.ll
@@ -44,3 +44,47 @@ entry:
 declare i32 @e1(%struct.SmallStruct* nocapture byval %in) nounwind
 declare i32 @e2(%struct.LargeStruct* nocapture byval %in) nounwind
 declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind
+
+; rdar://12442472
+; We can't do a tail call: the address of s is passed to the callee and part of
+; s is in the caller's local frame.
+define void @f3(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
+; CHECK: f3
+; CHECK: bl _consumestruct
+entry:
+  %0 = bitcast %struct.SmallStruct* %s to i8*
+  tail call void @consumestruct(i8* %0, i32 80) optsize
+  ret void
+}
+
+define void @f4(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
+; CHECK: f4
+; CHECK: bl _consumestruct
+entry:
+  %addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0
+  %0 = bitcast i32* %addr to i8*
+  tail call void @consumestruct(i8* %0, i32 80) optsize
+  ret void
+}
+
+; We can do a tail call here since s is in the incoming argument area.
+define void @f5(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
+; CHECK: f5
+; CHECK: b _consumestruct
+entry:
+  %0 = bitcast %struct.SmallStruct* %s to i8*
+  tail call void @consumestruct(i8* %0, i32 80) optsize
+  ret void
+}
+
+define void @f6(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
+; CHECK: f6
+; CHECK: b _consumestruct
+entry:
+  %addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0
+  %0 = bitcast i32* %addr to i8*
+  tail call void @consumestruct(i8* %0, i32 80) optsize
+  ret void
+}
+
+declare void @consumestruct(i8* nocapture %structp, i32 %structsize) nounwind
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
index 6fcbdee30d340..2961b94d2c1e6 100644
--- a/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -63,3 +63,24 @@ if.then:
 if.else:
   ret i32 %sub
 }
+
+; If the sub/rsb instruction is predicated, we can't use the flags.
+; <rdar://problem/12263428>
+; Test case from MultiSource/Benchmarks/Ptrdist/bc/number.s
+; CHECK: bc_raise
+; CHECK: rsbeq
+; CHECK: cmp
+define i32 @bc_raise() nounwind ssp {
+entry:
+  %val.2.i = select i1 undef, i32 0, i32 undef
+  %sub.i = sub nsw i32 0, %val.2.i
+  %retval.0.i = select i1 undef, i32 %val.2.i, i32 %sub.i
+  %cmp1 = icmp eq i32 %retval.0.i, 0
+  br i1 %cmp1, label %land.lhs.true, label %if.end11
+
+land.lhs.true:                                    ; preds = %entry
+  ret i32 17
+
+if.end11:                                         ; preds = %entry
+  ret i32 23
+}
diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll
index 474043afc11d4..7f82ca7012619 100644
--- a/test/CodeGen/ARM/sub.ll
+++ b/test/CodeGen/ARM/sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm < %s | FileCheck %s
+; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
 
 ; 171 = 0x000000ab
 define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/ARM/subreg-remat.ll b/test/CodeGen/ARM/subreg-remat.ll
index 03ae12c6dea03..455bfce0f2e5f 100644
--- a/test/CodeGen/ARM/subreg-remat.ll
+++ b/test/CodeGen/ARM/subreg-remat.ll
@@ -4,14 +4,14 @@ target triple = "thumbv7-apple-ios"
 ;
 ; The vector %v2 is built like this:
 ;
-;   %vreg6:ssub_1<def> = VMOVSR %vreg0<kill>, pred:14, pred:%noreg, %vreg6<imp-def>; DPR_VFP2:%vreg6 GPR:%vreg0
+;   %vreg6:ssub_1<def> = ...
 ;   %vreg6:ssub_0<def> = VLDRS <cp#0>, 0, pred:14, pred:%noreg; mem:LD4[ConstantPool] DPR_VFP2:%vreg6
 ;
 ; When %vreg6 spills, the VLDRS constant pool load cannot be rematerialized
 ; since it implicitly reads the ssub_1 sub-register.
 ;
 ; CHECK: f1
-; CHECK: vmov    s1, r0
+; CHECK: vmov    d0, r0, r0
 ; CHECK: vldr s0, LCPI
 ; The vector must be spilled:
 ; CHECK: vstr d0,
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index 38842a9646ffc..21865f8e4aedc 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -14,4 +14,16 @@ entry:
   unreachable
 }
 
+define void @t2() nounwind {
+entry:
+; INSTR: t2:
+; INSTR: trap
+
+; FUNC: t2:
+; FUNC: bl __trap
+  call void @llvm.debugtrap()
+  unreachable
+}
+
 declare void @llvm.trap() nounwind
+declare void @llvm.debugtrap() nounwind
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
index 4e227dd5be368..fc2aa1e568e2e 100644
--- a/test/CodeGen/ARM/twoaddrinstr.ll
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -4,18 +4,18 @@
 define void @PR13378() nounwind {
 ; This was orriginally a crasher trying to schedule the instructions.
 ; CHECK:      PR13378:
-; CHECK:        vldmia
+; CHECK:        vld1.32
+; CHECK-NEXT:   vst1.32
+; CHECK-NEXT:   vst1.32
 ; CHECK-NEXT:   vmov.f32
-; CHECK-NEXT:   vstmia
-; CHECK-NEXT:   vstmia
 ; CHECK-NEXT:   vmov.f32
-; CHECK-NEXT:   vstmia
+; CHECK-NEXT:   vst1.32
 
 entry:
-  %0 = load <4 x float>* undef
-  store <4 x float> zeroinitializer, <4 x float>* undef
-  store <4 x float> %0, <4 x float>* undef
+  %0 = load <4 x float>* undef, align 4
+  store <4 x float> zeroinitializer, <4 x float>* undef, align 4
+  store <4 x float> %0, <4 x float>* undef, align 4
   %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 3
-  store <4 x float> %1, <4 x float>* undef
+  store <4 x float> %1, <4 x float>* undef, align 4
   unreachable
 }
diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll
index 869b92675def6..3064202eb3fe5 100644
--- a/test/CodeGen/ARM/unaligned_load_store.ll
+++ b/test/CodeGen/ARM/unaligned_load_store.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=arm -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
-; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
+; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -mattr=-neon -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
 ; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=UNALIGNED
 
 ; rdar://7113725
@@ -59,3 +59,19 @@ entry:
   store double %tmp, double* %b, align 1
   ret void
 }
+
+define void @byte_word_ops(i32* %a, i32* %b) nounwind {
+entry:
+; EXPANDED: byte_word_ops:
+; EXPANDED: ldrb
+; EXPANDED: strb
+
+; UNALIGNED: byte_word_ops:
+; UNALIGNED-NOT: ldrb
+; UNALIGNED: ldr
+; UNALIGNED-NOT: strb
+; UNALIGNED: str
+  %tmp = load i32* %a, align 1
+  store i32 %tmp, i32* %b, align 1
+  ret void
+}
diff --git a/test/CodeGen/ARM/unaligned_load_store_vector.ll b/test/CodeGen/ARM/unaligned_load_store_vector.ll
new file mode 100644
index 0000000000000..25ae6517937b3
--- /dev/null
+++ b/test/CodeGen/ARM/unaligned_load_store_vector.ll
@@ -0,0 +1,487 @@
+;RUN: llc < %s -march=arm -mattr=+v7 -mattr=+neon | FileCheck %s
+
+;ALIGN = 1
+;SIZE  = 64
+;TYPE  = <8 x i8>
+define void @v64_v8i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <8 x i8>*
+  %vo  = bitcast i8* %po to <8 x i8>*
+;CHECK: vld1.8
+  %v1 = load  <8 x i8>* %vi, align 1
+;CHECK: vst1.8
+  store <8 x i8> %v1, <8 x i8>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 1
+;SIZE  = 64
+;TYPE  = <4 x i16>
+define void @v64_v4i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x i16>*
+  %vo  = bitcast i8* %po to <4 x i16>*
+;CHECK: vld1.8
+  %v1 = load  <4 x i16>* %vi, align 1
+;CHECK: vst1.8
+  store <4 x i16> %v1, <4 x i16>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 1
+;SIZE  = 64
+;TYPE  = <2 x i32>
+define void @v64_v2i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x i32>*
+  %vo  = bitcast i8* %po to <2 x i32>*
+;CHECK: vld1.8
+  %v1 = load  <2 x i32>* %vi, align 1
+;CHECK: vst1.8
+  store <2 x i32> %v1, <2 x i32>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 1
+;SIZE  = 64
+;TYPE  = <2 x float>
+define void @v64_v2f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x float>*
+  %vo  = bitcast i8* %po to <2 x float>*
+;CHECK: vld1.8
+  %v1 = load  <2 x float>* %vi, align 1
+;CHECK: vst1.8
+  store <2 x float> %v1, <2 x float>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 1
+;SIZE  = 128
+;TYPE  = <16 x i8>
+define void @v128_v16i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <16 x i8>*
+  %vo  = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.8
+  %v1 = load  <16 x i8>* %vi, align 1
+;CHECK: vst1.8
+  store <16 x i8> %v1, <16 x i8>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 1
+;SIZE  = 128
+;TYPE  = <8 x i16>
+define void @v128_v8i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <8 x i16>*
+  %vo  = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.8
+  %v1 = load  <8 x i16>* %vi, align 1
+;CHECK: vst1.8
+  store <8 x i16> %v1, <8 x i16>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 1
+;SIZE  = 128
+;TYPE  = <4 x i32>
+define void @v128_v4i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x i32>*
+  %vo  = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.8
+  %v1 = load  <4 x i32>* %vi, align 1
+;CHECK: vst1.8
+  store <4 x i32> %v1, <4 x i32>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 1
+;SIZE  = 128
+;TYPE  = <2 x i64>
+define void @v128_v2i64_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x i64>*
+  %vo  = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.8
+  %v1 = load  <2 x i64>* %vi, align 1
+;CHECK: vst1.8
+  store <2 x i64> %v1, <2 x i64>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 1
+;SIZE  = 128
+;TYPE  = <4 x float>
+define void @v128_v4f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_1:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x float>*
+  %vo  = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.8
+  %v1 = load  <4 x float>* %vi, align 1
+;CHECK: vst1.8
+  store <4 x float> %v1, <4 x float>* %vo, align 1
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 64
+;TYPE  = <8 x i8>
+define void @v64_v8i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <8 x i8>*
+  %vo  = bitcast i8* %po to <8 x i8>*
+;CHECK: vld1.16
+  %v1 = load  <8 x i8>* %vi, align 2
+;CHECK: vst1.16
+  store <8 x i8> %v1, <8 x i8>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 64
+;TYPE  = <4 x i16>
+define void @v64_v4i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x i16>*
+  %vo  = bitcast i8* %po to <4 x i16>*
+;CHECK: vld1.16
+  %v1 = load  <4 x i16>* %vi, align 2
+;CHECK: vst1.16
+  store <4 x i16> %v1, <4 x i16>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 64
+;TYPE  = <2 x i32>
+define void @v64_v2i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x i32>*
+  %vo  = bitcast i8* %po to <2 x i32>*
+;CHECK: vld1.16
+  %v1 = load  <2 x i32>* %vi, align 2
+;CHECK: vst1.16
+  store <2 x i32> %v1, <2 x i32>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 64
+;TYPE  = <2 x float>
+define void @v64_v2f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x float>*
+  %vo  = bitcast i8* %po to <2 x float>*
+;CHECK: vld1.16
+  %v1 = load  <2 x float>* %vi, align 2
+;CHECK: vst1.16
+  store <2 x float> %v1, <2 x float>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 128
+;TYPE  = <16 x i8>
+define void @v128_v16i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <16 x i8>*
+  %vo  = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.16
+  %v1 = load  <16 x i8>* %vi, align 2
+;CHECK: vst1.16
+  store <16 x i8> %v1, <16 x i8>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 128
+;TYPE  = <8 x i16>
+define void @v128_v8i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <8 x i16>*
+  %vo  = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.16
+  %v1 = load  <8 x i16>* %vi, align 2
+;CHECK: vst1.16
+  store <8 x i16> %v1, <8 x i16>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 128
+;TYPE  = <4 x i32>
+define void @v128_v4i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x i32>*
+  %vo  = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.16
+  %v1 = load  <4 x i32>* %vi, align 2
+;CHECK: vst1.16
+  store <4 x i32> %v1, <4 x i32>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 128
+;TYPE  = <2 x i64>
+define void @v128_v2i64_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x i64>*
+  %vo  = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.16
+  %v1 = load  <2 x i64>* %vi, align 2
+;CHECK: vst1.16
+  store <2 x i64> %v1, <2 x i64>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 2
+;SIZE  = 128
+;TYPE  = <4 x float>
+define void @v128_v4f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_2:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x float>*
+  %vo  = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.16
+  %v1 = load  <4 x float>* %vi, align 2
+;CHECK: vst1.16
+  store <4 x float> %v1, <4 x float>* %vo, align 2
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 64
+;TYPE  = <8 x i8>
+define void @v64_v8i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <8 x i8>*
+  %vo  = bitcast i8* %po to <8 x i8>*
+;CHECK: vldr
+  %v1 = load  <8 x i8>* %vi, align 4
+;CHECK: vstr
+  store <8 x i8> %v1, <8 x i8>* %vo, align 4
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 64
+;TYPE  = <4 x i16>
+define void @v64_v4i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x i16>*
+  %vo  = bitcast i8* %po to <4 x i16>*
+;CHECK: vldr
+  %v1 = load  <4 x i16>* %vi, align 4
+;CHECK: vstr
+  store <4 x i16> %v1, <4 x i16>* %vo, align 4
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 64
+;TYPE  = <2 x i32>
+define void @v64_v2i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x i32>*
+  %vo  = bitcast i8* %po to <2 x i32>*
+;CHECK: vldr
+  %v1 = load  <2 x i32>* %vi, align 4
+;CHECK: vstr
+  store <2 x i32> %v1, <2 x i32>* %vo, align 4
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 64
+;TYPE  = <2 x float>
+define void @v64_v2f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x float>*
+  %vo  = bitcast i8* %po to <2 x float>*
+;CHECK: vldr
+  %v1 = load  <2 x float>* %vi, align 4
+;CHECK: vstr
+  store <2 x float> %v1, <2 x float>* %vo, align 4
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 128
+;TYPE  = <16 x i8>
+define void @v128_v16i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <16 x i8>*
+  %vo  = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.32
+  %v1 = load  <16 x i8>* %vi, align 4
+;CHECK: vst1.32
+  store <16 x i8> %v1, <16 x i8>* %vo, align 4
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 128
+;TYPE  = <8 x i16>
+define void @v128_v8i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <8 x i16>*
+  %vo  = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.32
+  %v1 = load  <8 x i16>* %vi, align 4
+;CHECK: vst1.32
+  store <8 x i16> %v1, <8 x i16>* %vo, align 4
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 128
+;TYPE  = <4 x i32>
+define void @v128_v4i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x i32>*
+  %vo  = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.32
+  %v1 = load  <4 x i32>* %vi, align 4
+;CHECK: vst1.32
+  store <4 x i32> %v1, <4 x i32>* %vo, align 4
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 128
+;TYPE  = <2 x i64>
+define void @v128_v2i64_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <2 x i64>*
+  %vo  = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.32
+  %v1 = load  <2 x i64>* %vi, align 4
+;CHECK: vst1.32
+  store <2 x i64> %v1, <2 x i64>* %vo, align 4
+  ret void
+}
+
+
+;ALIGN = 4
+;SIZE  = 128
+;TYPE  = <4 x float>
+define void @v128_v4f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_4:
+entry:
+  %po = getelementptr i8* %out, i32 0
+  %pi = getelementptr i8* %in,  i32 0
+  %vi  = bitcast i8* %pi to <4 x float>*
+  %vo  = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.32
+  %v1 = load  <4 x float>* %vi, align 4
+;CHECK: vst1.32
+  store <4 x float> %v1, <4 x float>* %vo, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/ARM/vbsl-constant.ll b/test/CodeGen/ARM/vbsl-constant.ll
index f157dbdb970c5..ffda0a51bdd00 100644
--- a/test/CodeGen/ARM/vbsl-constant.ll
+++ b/test/CodeGen/ARM/vbsl-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+neon | FileCheck %s
 
 define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ;CHECK: v_bsli8:
@@ -59,8 +59,8 @@ define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind
 
 define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
 ;CHECK: v_bslQi8:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
 ;CHECK: vbsl
 	%tmp1 = load <16 x i8>* %A
 	%tmp2 = load <16 x i8>* %B
@@ -73,8 +73,8 @@ define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
 
 define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
 ;CHECK: v_bslQi16:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
 ;CHECK: vbsl
 	%tmp1 = load <8 x i16>* %A
 	%tmp2 = load <8 x i16>* %B
@@ -87,8 +87,8 @@ define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwin
 
 define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
 ;CHECK: v_bslQi32:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
 ;CHECK: vbsl
 	%tmp1 = load <4 x i32>* %A
 	%tmp2 = load <4 x i32>* %B
@@ -101,9 +101,9 @@ define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwin
 
 define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
 ;CHECK: v_bslQi64:
-;CHECK: vldmia
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
+;CHECK: vld1.64
 ;CHECK: vbsl
 	%tmp1 = load <2 x i64>* %A
 	%tmp2 = load <2 x i64>* %B
diff --git a/test/CodeGen/ARM/vbsl.ll b/test/CodeGen/ARM/vbsl.ll
index 9f3bb4e1030c7..750fb0de5383c 100644
--- a/test/CodeGen/ARM/vbsl.ll
+++ b/test/CodeGen/ARM/vbsl.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
 
+; rdar://12471808
+
 define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ;CHECK: v_bsli8:
 ;CHECK: vbsl
@@ -103,3 +105,98 @@ define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwin
 	%tmp7 = or <2 x i64> %tmp4, %tmp6
 	ret <2 x i64> %tmp7
 }
+
+define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
+; CHECK: f1:
+; CHECK: vbsl
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
+  ret <8 x i8> %vbsl.i  ; direct use of the v8i8 vbsl intrinsic on function arguments
+}
+
+define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK: f2:
+; CHECK: vbsl
+  %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
+  ret <4 x i16> %vbsl3.i  ; direct use of the v4i16 vbsl intrinsic on function arguments
+}
+
+define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK: f3:
+; CHECK: vbsl
+  %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
+  ret <2 x i32> %vbsl3.i  ; direct use of the v2i32 vbsl intrinsic on function arguments
+}
+
+define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone optsize ssp {
+; CHECK: f4:
+; CHECK: vbsl
+  %vbsl4.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind
+  ret <2 x float> %vbsl4.i  ; direct use of the v2f32 vbsl intrinsic on function arguments
+}
+
+define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
+; CHECK: g1:
+; CHECK: vbsl
+  %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
+  ret <16 x i8> %vbsl.i  ; 128-bit counterpart of f1: v16i8 vbsl intrinsic
+}
+
+define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
+; CHECK: g2:
+; CHECK: vbsl
+  %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
+  ret <8 x i16> %vbsl3.i  ; 128-bit counterpart of f2: v8i16 vbsl intrinsic
+}
+
+define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK: g3:
+; CHECK: vbsl
+  %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
+  ret <4 x i32> %vbsl3.i  ; 128-bit counterpart of f3: v4i32 vbsl intrinsic
+}
+
+define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone optsize ssp {
+; CHECK: g4:
+; CHECK: vbsl
+  %vbsl4.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind
+  ret <4 x float> %vbsl4.i  ; 128-bit counterpart of f4: v4f32 vbsl intrinsic
+}
+
+define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
+; CHECK: test_vbsl_s64:
+; CHECK: vbsl d
+  %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
+  ret <1 x i64> %vbsl3.i  ; v1i64 intrinsic; the d-register form of vbsl is expected
+}
+
+define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
+; CHECK: test_vbsl_u64:
+; CHECK: vbsl d
+  %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
+  ret <1 x i64> %vbsl3.i  ; same IR body as test_vbsl_s64; the d-register form of vbsl is expected
+}
+
+define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK: test_vbslq_s64:
+; CHECK: vbsl q
+  %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
+  ret <2 x i64> %vbsl3.i  ; v2i64 intrinsic; the q-register form of vbsl is expected
+}
+
+define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK: test_vbslq_u64:
+; CHECK: vbsl q
+  %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
+  ret <2 x i64> %vbsl3.i  ; same IR body as test_vbslq_s64; the q-register form of vbsl is expected
+}
+
+declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index 05332e4d8c5ba..2cf94d63ca143 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -261,3 +261,73 @@ define void @redundantVdup(<8 x i8>* %ptr) nounwind {
   store <8 x i8> %2, <8 x i8>* %ptr, align 8
   ret void
 }
+
+define <4 x i32> @tdupi(i32 %x, i32 %y) {
+;CHECK: tdupi:
+;CHECK: vdup.32
+  %1 = insertelement <4 x i32> undef, i32 %x, i32 0  ; lanes 0..2 all receive %x,
+  %2 = insertelement <4 x i32> %1, i32 %x, i32 1     ; so a vdup.32 splat is expected
+  %3 = insertelement <4 x i32> %2, i32 %x, i32 2
+  %4 = insertelement <4 x i32> %3, i32 %y, i32 3     ; only lane 3 differs (%y)
+  ret <4 x i32> %4
+}
+
+define <4 x float> @tdupf(float %x, float %y) {
+;CHECK: tdupf:
+;CHECK: vdup.32
+  %1 = insertelement <4 x float> undef, float %x, i32 0  ; lanes 0..2 all receive %x,
+  %2 = insertelement <4 x float> %1, float %x, i32 1     ; so a vdup.32 splat is expected
+  %3 = insertelement <4 x float> %2, float %x, i32 2
+  %4 = insertelement <4 x float> %3, float %y, i32 3     ; only lane 3 differs (%y)
+  ret <4 x float> %4
+}
+
+; This test checks that when splatting an element from a vector into another,
+; the value isn't moved out to GPRs first.
+define <4 x i32> @tduplane(<4 x i32> %invec) {
+;CHECK: tduplane:
+;CHECK-NOT: vmov {{.*}}, d16[1]
+;CHECK: vdup.32 {{.*}}, d16[1]
+  %in = extractelement <4 x i32> %invec, i32 1       ; source element: lane 1 of %invec
+  %1 = insertelement <4 x i32> undef, i32 %in, i32 0  ; lanes 0..2 all receive that element
+  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
+  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
+  %4 = insertelement <4 x i32> %3, i32 255, i32 3     ; lane 3 is the constant 255
+  ret <4 x i32> %4
+}
+
+define <2 x float> @check_f32(<4 x float> %v) nounwind {
+;CHECK: check_f32:
+;CHECK: vdup.32 {{.*}}, d{{..}}[1]
+  %x = extractelement <4 x float> %v, i32 3  ; lane 3 of the 4-lane input; expected as lane 1 of a d register (presumably the upper half of %v)
+  %1 = insertelement  <2 x float> undef, float %x, i32 0  ; both lanes of the 2-lane result
+  %2 = insertelement  <2 x float> %1, float %x, i32 1     ; receive the same element
+  ret <2 x float> %2
+}
+
+define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
+;CHECK: check_i32:
+;CHECK: vdup.32 {{.*}}, d{{..}}[1]
+  %x = extractelement <4 x i32> %v, i32 3  ; lane 3 of the 4-lane input; expected as lane 1 of a d register (presumably the upper half of %v)
+  %1 = insertelement  <2 x i32> undef, i32 %x, i32 0  ; both lanes of the 2-lane result
+  %2 = insertelement  <2 x i32> %1, i32 %x, i32 1     ; receive the same element
+  ret <2 x i32> %2
+}
+
+define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
+;CHECK: check_i16:
+;CHECK: vdup.16 {{.*}}, d{{..}}[3]
+  %x = extractelement <8 x i16> %v, i32 3  ; lane 3 of the 8-lane input
+  %1 = insertelement  <4 x i16> undef, i16 %x, i32 0  ; only lanes 0 and 1 are written;
+  %2 = insertelement  <4 x i16> %1, i16 %x, i32 1     ; lanes 2 and 3 remain undef
+  ret <4 x i16> %2
+}
+
+define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
+;CHECK: check_i8:
+;CHECK: vdup.8 {{.*}}, d{{..}}[3]
+  %x = extractelement <16 x i8> %v, i32 3  ; lane 3 of the 16-lane input
+  %1 = insertelement  <8  x i8> undef, i8 %x, i32 0  ; only lanes 0 and 1 are written;
+  %2 = insertelement  <8  x i8> %1, i8 %x, i32 1     ; lanes 2..7 remain undef
+  ret <8 x i8> %2
+}
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
index 8fd3db29197e8..22af797621280 100644
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -62,3 +62,14 @@ define <4 x i8> @i(<4 x i8>* %x) {
   %2 = sdiv <4 x i8> zeroinitializer, %1
   ret <4 x i8> %2
 }
+; CHECK: j:
+define <4 x i32> @j(<4 x i8>* %in) nounwind {
+  ; CHECK: vld1
+  ; CHECK: vmovl.u8
+  ; CHECK: vmovl.u16
+  ; CHECK-NOT: vand
+  %1 = load <4 x i8>* %in, align 4  ; 4 bytes loaded as a <4 x i8> vector
+  %2 = zext <4 x i8> %1 to <4 x i32>  ; zero-extend each lane i8 -> i32; expected as two widening moves with no masking vand
+  ret <4 x i32> %2
+}
+
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index e224bdfe25a52..f404eb8be5b78 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -74,6 +74,39 @@ define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 	ret <16 x i8> %tmp3
 }
 
+define <16 x i8> @test_vextq_undef_op2(<16 x i8> %a) nounwind {
+;CHECK: test_vextq_undef_op2:
+;CHECK: vext
+entry:  ; second shuffle operand is undef; the mask takes lanes 2..15 then 0,1 of %a (rotation by 2 lanes)
+  %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
+  ret <16 x i8> %tmp1
+}
+
+define <8 x i8> @test_vextd_undef_op2(<8 x i8> %a) nounwind {
+;CHECK: test_vextd_undef_op2:
+;CHECK: vext
+entry:  ; second shuffle operand is undef; the mask takes lanes 2..7 then 0,1 of %a (rotation by 2 lanes)
+  %tmp1 = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
+  ret <8 x i8> %tmp1
+}
+
+
+define <16 x i8> @test_vextq_undef_op2_undef(<16 x i8> %a) nounwind {
+;CHECK: test_vextq_undef_op2_undef:
+;CHECK: vext
+entry:  ; same rotate-by-2 mask as test_vextq_undef_op2, but result lanes 3 and 4 are undef in the mask
+  %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
+  ret <16 x i8> %tmp1
+}
+
+define <8 x i8> @test_vextd_undef_op2_undef(<8 x i8> %a) nounwind {
+;CHECK: test_vextd_undef_op2_undef:
+;CHECK: vext
+entry:  ; same rotate-by-2 mask as test_vextd_undef_op2, but result lanes 2, 3 and 6 are undef in the mask
+  %tmp1 = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 1>
+  ret <8 x i8> %tmp1
+}
+
 ; Tests for ReconstructShuffle function. Indices have to be carefully
 ; chosen to reach lowering phase as a BUILD_VECTOR.
 
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
index 1fc885d61372d..c9ce3b7450b64 100644
--- a/test/CodeGen/ARM/vget_lane.ll
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -200,7 +200,7 @@ define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
 
 define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
 ;CHECK: vsetQ_lane32:
-;CHECK: vmov.32
+;CHECK: vmov.32 d{{.*}}[1], r1
 	%tmp1 = load <4 x i32>* %A
 	%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
 	ret <4 x i32> %tmp2
diff --git a/test/CodeGen/ARM/vselect_imax.ll b/test/CodeGen/ARM/vselect_imax.ll
new file mode 100644
index 0000000000000..f5994046de4bf
--- /dev/null
+++ b/test/CodeGen/ARM/vselect_imax.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; Make sure that the ARM backend with NEON handles vselect.
+
+define void @vmax_v4i32(<4 x i32>* %m, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: vcgt.s32 [[QR:q[0-9]+]], [[Q1:q[0-9]+]], [[Q2:q[0-9]+]]
+; CHECK: vbsl [[QR]], [[Q1]], [[Q2]]
+    %cmpres = icmp sgt <4 x i32> %a, %b  ; per-lane signed a > b, yielding a <4 x i1> mask
+    %maxres = select <4 x i1> %cmpres, <4 x i32> %a,  <4 x i32> %b  ; vector select: lane of %a where the mask is set, else lane of %b (signed max)
+    store <4 x i32> %maxres, <4 x i32>* %m  ; result is written through %m
+    ret void
+}
+
diff --git a/test/CodeGen/CellSPU/icmp16.ll b/test/CodeGen/CellSPU/icmp16.ll
index 2f9b091faea3a..853ae1db160fc 100644
--- a/test/CodeGen/CellSPU/icmp16.ll
+++ b/test/CodeGen/CellSPU/icmp16.ll
@@ -534,7 +534,7 @@ entry:
 define i16 @icmp_slt_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
 ; CHECK:      icmp_slt_immed04_i16:
 ; CHECK:        lr
-; CHECK-NETX:   bi
+; CHECK-NEXT:   bi
 
 entry:
        %A = icmp slt i16 %arg1, 32768
@@ -559,7 +559,7 @@ define i1 @icmp_sle_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwi
 ; CHECK:        ilhu
 ; CHECK:        xorhi
 ; CHECK:        iohl
-; CHECK-NETX:   bi
+; CHECK:   bi
 
 entry:
        %A = icmp sle i16 %arg1, %arg2
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
new file mode 100644
index 0000000000000..802ee2cb05580
--- /dev/null
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s
+
+; Make sure we have the correct weight attached to each successor.
+define i32 @test2(i32 %x) nounwind uwtable readnone ssp {
+; CHECK: Machine code for function test2:
+entry:
+  %conv = sext i32 %x to i64  ; the switch operates on the sign-extended i64 value
+  switch i64 %conv, label %return [
+    i64 0, label %sw.bb
+    i64 1, label %sw.bb
+    i64 4, label %sw.bb
+    i64 5, label %sw.bb1
+  ], !prof !0  ; branch_weights from !0, one per successor (default destination first, then cases in order)
+; CHECK: BB#0: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#2(64) BB#4(14)
+; CHECK: BB#4: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#1(10) BB#5(4)
+; CHECK: BB#5: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#1(4) BB#3(7)
+
+sw.bb:  ; shared target of cases 0, 1 and 4
+  br label %return
+
+sw.bb1:  ; target of case 5
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ]  ; 5 via sw.bb1, 1 via sw.bb, 0 on the default edge
+  ret i32 %retval.0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll
index e9ac8b67493ec..8a6efb620ec0f 100644
--- a/test/CodeGen/Hexagon/args.ll
+++ b/test/CodeGen/Hexagon/args.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
 ; CHECK: r[[T0:[0-9]+]] = #7
 ; CHECK: memw(r29 + #0) = r[[T0]]
+; CHECK: r5 = #6
 ; CHECK: r0 = #1
 ; CHECK: r1 = #2
 ; CHECK: r2 = #3
 ; CHECK: r3 = #4
 ; CHECK: r4 = #5
-; CHECK: r5 = #6
 
 
 define void @foo() nounwind {
diff --git a/test/CodeGen/Hexagon/newvaluestore.ll b/test/CodeGen/Hexagon/newvaluestore.ll
index ab69b22df57c3..186e39378854f 100644
--- a/test/CodeGen/Hexagon/newvaluestore.ll
+++ b/test/CodeGen/Hexagon/newvaluestore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4  < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
 ; Check that we generate new value store packet in V4
 
 @i = global i32 0, align 4
diff --git a/test/CodeGen/Hexagon/remove_lsr.ll b/test/CodeGen/Hexagon/remove_lsr.ll
new file mode 100644
index 0000000000000..79b5f4ae7c43d
--- /dev/null
+++ b/test/CodeGen/Hexagon/remove_lsr.ll
@@ -0,0 +1,80 @@
+; Test fix for PR-13709.
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; CHECK: foo
+; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
+; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
+
+; Convert the sequence
+; r17:16 = lsr(r11:10, #32)
+; .. = r16
+; into
+; r17:16 = lsr(r11:10, #32)
+; .. = r11
+; This makes the lsr instruction dead and it gets removed subsequently
+; by a dead code removal pass.
+
+%union.vect64 = type { i64 }
+%union.vect32 = type { i32 }
+
+define void @foo(%union.vect64* nocapture %sss_extracted_bit_rx_data_ptr,
+ %union.vect32* nocapture %s_even, %union.vect32* nocapture %s_odd,
+ i8* nocapture %scr_s_even_code_ptr, i8* nocapture %scr_s_odd_code_ptr)
+ nounwind {
+entry:  ; every pointer argument is advanced by one element before entering the loop
+  %scevgep = getelementptr %union.vect64* %sss_extracted_bit_rx_data_ptr, i32 1
+  %scevgep28 = getelementptr %union.vect32* %s_odd, i32 1
+  %scevgep32 = getelementptr %union.vect32* %s_even, i32 1
+  %scevgep36 = getelementptr i8* %scr_s_odd_code_ptr, i32 1
+  %scevgep39 = getelementptr i8* %scr_s_even_code_ptr, i32 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %lsr.iv42 = phi i32 [ %lsr.iv.next, %for.body ], [ 2, %entry ]
+  %lsr.iv40 = phi i8* [ %scevgep41, %for.body ], [ %scevgep39, %entry ]
+  %lsr.iv37 = phi i8* [ %scevgep38, %for.body ], [ %scevgep36, %entry ]
+  %lsr.iv33 = phi %union.vect32* [ %scevgep34, %for.body ], [ %scevgep32, %entry ]
+  %lsr.iv29 = phi %union.vect32* [ %scevgep30, %for.body ], [ %scevgep28, %entry ]
+  %lsr.iv = phi %union.vect64* [ %scevgep26, %for.body ], [ %scevgep, %entry ]
+  %predicate_1.023 = phi i8 [ undef, %entry ], [ %10, %for.body ]
+  %predicate.022 = phi i8 [ undef, %entry ], [ %9, %for.body ]
+  %val.021 = phi i64 [ undef, %entry ], [ %srcval, %for.body ]  ; 64-bit value loaded in the previous iteration (undef on entry)
+  %lsr.iv3335 = bitcast %union.vect32* %lsr.iv33 to i32*
+  %lsr.iv2931 = bitcast %union.vect32* %lsr.iv29 to i32*
+  %lsr.iv27 = bitcast %union.vect64* %lsr.iv to i64*
+  %0 = tail call i64 @llvm.hexagon.A2.vsubhs(i64 0, i64 %val.021)
+  %conv3 = sext i8 %predicate.022 to i32
+  %1 = trunc i64 %val.021 to i32  ; low 32 bits of %val.021
+  %2 = trunc i64 %0 to i32  ; low 32 bits of the vsubhs result
+  %3 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv3, i32 %1, i32 %2)
+  store i32 %3, i32* %lsr.iv3335, align 4, !tbaa !0
+  %conv8 = sext i8 %predicate_1.023 to i32
+  %4 = lshr i64 %val.021, 32  ; shift-by-32 followed by trunc: only the high 32 bits
+  %5 = trunc i64 %4 to i32  ; of %val.021 are used, so no 64-bit lsr should be needed
+  %6 = lshr i64 %0, 32  ; same high-word extraction for the vsubhs result
+  %7 = trunc i64 %6 to i32
+  %8 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv8, i32 %5, i32 %7)
+  store i32 %8, i32* %lsr.iv2931, align 4, !tbaa !0
+  %srcval = load i64* %lsr.iv27, align 8  ; value consumed by the next iteration through %val.021
+  %9 = load i8* %lsr.iv40, align 1, !tbaa !1
+  %10 = load i8* %lsr.iv37, align 1, !tbaa !1
+  %lftr.wideiv = trunc i32 %lsr.iv42 to i8
+  %exitcond = icmp eq i8 %lftr.wideiv, 32  ; leave the loop once the counter (which starts at 2) reaches 32
+  %scevgep26 = getelementptr %union.vect64* %lsr.iv, i32 1
+  %scevgep30 = getelementptr %union.vect32* %lsr.iv29, i32 1
+  %scevgep34 = getelementptr %union.vect32* %lsr.iv33, i32 1
+  %scevgep38 = getelementptr i8* %lsr.iv37, i32 1
+  %scevgep41 = getelementptr i8* %lsr.iv40, i32 1
+  %lsr.iv.next = add i32 %lsr.iv42, 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) nounwind readnone
+
+declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) nounwind readnone
+
+!0 = metadata !{metadata !"long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Hexagon/static.ll b/test/CodeGen/Hexagon/static.ll
index 2e4ab633e4157..683a4c21bcb86 100644
--- a/test/CodeGen/Hexagon/static.ll
+++ b/test/CodeGen/Hexagon/static.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched -disable-hexagon-misched < %s | FileCheck %s
 
 @num = external global i32
 @acc = external global i32
diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll
new file mode 100644
index 0000000000000..c3273eff05cb9
--- /dev/null
+++ b/test/CodeGen/MSP430/fp.ll
@@ -0,0 +1,17 @@
+; RUN: llc -O0 -disable-fp-elim < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
+target triple = "msp430---elf"
+
+define void @fp() nounwind {
+entry:
+; CHECK: fp:
+; CHECK: push.w r4
+; CHECK: mov.w r1, r4
+; CHECK: sub.w #2, r1
+  %i = alloca i16, align 2  ; single 2-byte local; expected at offset -2 from r4
+; CHECK: mov.w #0, -2(r4)
+  store i16 0, i16* %i, align 2  ; zero-initialise the local
+; CHECK: pop.w r4
+  ret void
+}
diff --git a/test/CodeGen/Mips/alloca16.ll b/test/CodeGen/Mips/alloca16.ll
new file mode 100644
index 0000000000000..731edae43cbb3
--- /dev/null
+++ b/test/CodeGen/Mips/alloca16.ll
@@ -0,0 +1,75 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 25, align 4
+@jjjj = global i32 35, align 4
+@kkkk = global i32 100, align 4
+@t = global i32 25, align 4
+@riii = common global i32 0, align 4
+@rjjj = common global i32 0, align 4
+@rkkk = common global i32 0, align 4
+
+define void @temp(i32 %foo) nounwind {
+entry:
+  %foo.addr = alloca i32, align 4
+  store i32 %foo, i32* %foo.addr, align 4  ; spill the argument to its stack slot
+  %0 = load i32* %foo.addr, align 4
+  store i32 %0, i32* @t, align 4  ; copy the argument into global @t
+  ret void
+}
+
+define void @test() nounwind {
+entry:
+; 16: 	.frame	$16,24,$ra
+; 16: 	save 	$ra, $s0, $s1, 24
+; 16: 	move	$16, $sp
+; 16:	move	${{[0-9]+}}, $sp
+; 16:	subu	$[[REGISTER:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	$sp, $[[REGISTER]]
+  %sssi = alloca i32, align 4
+  %ip = alloca i32*, align 4
+  %sssj = alloca i32, align 4
+  %0 = load i32* @iiii, align 4
+  store i32 %0, i32* %sssi, align 4
+  %1 = load i32* @kkkk, align 4
+  %mul = mul nsw i32 %1, 100
+  %2 = alloca i8, i32 %mul  ; variable-sized alloca: byte count is @kkkk * 100, computed at run time
+  %3 = bitcast i8* %2 to i32*
+  store i32* %3, i32** %ip, align 4  ; %ip holds the dynamic buffer viewed as i32*
+  %4 = load i32* @jjjj, align 4
+  store i32 %4, i32* %sssj, align 4
+  %5 = load i32* @jjjj, align 4
+  %6 = load i32* @iiii, align 4
+  %7 = load i32** %ip, align 4
+  %arrayidx = getelementptr inbounds i32* %7, i32 %6
+  store i32 %5, i32* %arrayidx, align 4  ; ip[iiii] = jjjj
+  %8 = load i32* @kkkk, align 4
+  %9 = load i32* @jjjj, align 4
+  %10 = load i32** %ip, align 4
+  %arrayidx1 = getelementptr inbounds i32* %10, i32 %9
+  store i32 %8, i32* %arrayidx1, align 4  ; ip[jjjj] = kkkk
+  %11 = load i32* @iiii, align 4
+  %12 = load i32* @kkkk, align 4
+  %13 = load i32** %ip, align 4
+  %arrayidx2 = getelementptr inbounds i32* %13, i32 %12
+  store i32 %11, i32* %arrayidx2, align 4  ; ip[kkkk] = iiii
+  %14 = load i32** %ip, align 4
+  %arrayidx3 = getelementptr inbounds i32* %14, i32 25
+  %15 = load i32* %arrayidx3, align 4
+  store i32 %15, i32* @riii, align 4  ; riii = ip[25]
+  %16 = load i32** %ip, align 4
+  %arrayidx4 = getelementptr inbounds i32* %16, i32 35
+  %17 = load i32* %arrayidx4, align 4
+  store i32 %17, i32* @rjjj, align 4  ; rjjj = ip[35]
+  %18 = load i32** %ip, align 4
+  %arrayidx5 = getelementptr inbounds i32* %18, i32 100
+  %19 = load i32* %arrayidx5, align 4
+  store i32 %19, i32* @rkkk, align 4  ; rkkk = ip[100]
+  %20 = load i32* @t, align 4
+  %21 = load i32** %ip, align 4
+  %arrayidx6 = getelementptr inbounds i32* %21, i32 %20
+  %22 = load i32* %arrayidx6, align 4
+; 16: 	save	16
+  call void @temp(i32 %22)  ; call made while the dynamic allocation is live; argument is ip[t]
+; 16: 	restore	16
+  ret void
+}
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 050689dcea6c3..819f258c2a404 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel --disable-machine-licm < %s | FileCheck %s
 
 @x = common global i32 0, align 4
 
@@ -181,8 +181,9 @@ entry:
 
 ; CHECK:   $[[BB0:[A-Z_0-9]+]]:
 ; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   and     $[[R18:[0-9]+]], $[[R9]], $[[R6]]
 ; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R9]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R18]]
 ; CHECK:   sc      $[[R14]], 0($[[R2]])
 ; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
 
diff --git a/test/CodeGen/Mips/atomicops.ll b/test/CodeGen/Mips/atomicops.ll
new file mode 100644
index 0000000000000..b9c3804e0d720
--- /dev/null
+++ b/test/CodeGen/Mips/atomicops.ll
@@ -0,0 +1,40 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@.str = private unnamed_addr constant [8 x i8] c"%d, %d\0A\00", align 1
+
+define i32 @foo(i32* %mem, i32 %val, i32 %c) nounwind {
+entry:
+  %0 = atomicrmw add i32* %mem, i32 %val seq_cst  ; atomic fetch-and-add; %0 is the value read from %mem
+  %add = add nsw i32 %0, %c
+  ret i32 %add
+; 16: foo:
+; 16:	lw	${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}})
+; 16: 	lw	${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}})
+}
+
+define i32 @main() nounwind {
+entry:
+  %x = alloca i32, align 4
+  store volatile i32 0, i32* %x, align 4
+  %0 = atomicrmw add i32* %x, i32 1 seq_cst  ; expected to become a __sync_fetch_and_add_4 call
+  %add.i = add nsw i32 %0, 2
+  %1 = load volatile i32* %x, align 4
+  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind
+  %2 = cmpxchg i32* %x, i32 1, i32 2 seq_cst  ; expected to become a __sync_val_compare_and_swap_4 call
+  %3 = load volatile i32* %x, align 4
+  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind
+  %4 = atomicrmw xchg i32* %x, i32 1 seq_cst  ; expected to become a __sync_lock_test_and_set_4 call
+  %5 = load volatile i32* %x, align 4
+  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %4, i32 %5) nounwind
+; 16: main:
+; 16:	lw	${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}})
+; 16: 	lw	${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}})
+; 16:	lw	${{[0-9]+}}, %call16(__sync_val_compare_and_swap_4)(${{[0-9]+}})
+; 16:	lw	${{[0-9]+}}, %call16(__sync_lock_test_and_set_4)(${{[0-9]+}})
+
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+
diff --git a/test/CodeGen/Mips/brconeq.ll b/test/CodeGen/Mips/brconeq.ll
new file mode 100644
index 0000000000000..613391557efd6
--- /dev/null
+++ b/test/CodeGen/Mips/brconeq.ll
@@ -0,0 +1,38 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 10, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:  ; the label capture below accepts any alphanumeric/underscore label name
+  %0 = load i32* @i, align 4
+  %1 = load i32* @j, align 4
+  %cmp = icmp eq i32 %0, %1  ; true when @i equals @j
+; 16:	cmp	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	$[[LABEL:[0-9A-Za-z_]+]]
+; 16: $[[LABEL]]:
+  br i1 %cmp, label %if.end, label %if.then  ; equal: skip the store
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test/CodeGen/Mips/brconeqk.ll b/test/CodeGen/Mips/brconeqk.ll
new file mode 100644
index 0000000000000..2c0e72dabd292
--- /dev/null
+++ b/test/CodeGen/Mips/brconeqk.ll
@@ -0,0 +1,22 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:  ; the label capture below accepts any alphanumeric/underscore label name
+  %0 = load i32* @i, align 4
+  %cmp = icmp eq i32 %0, 10  ; compare @i against the immediate 10
+  br i1 %cmp, label %if.end, label %if.then  ; equal: skip the store
+; 16:	cmpi	${{[0-9]+}}, {{[0-9]+}}
+; 16:	bteqz	$[[LABEL:[0-9A-Za-z_]+]]
+; 16: $[[LABEL]]:
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconeqz.ll b/test/CodeGen/Mips/brconeqz.ll
new file mode 100644
index 0000000000000..5586e7b976da4
--- /dev/null
+++ b/test/CodeGen/Mips/brconeqz.ll
@@ -0,0 +1,20 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:  ; the label capture below accepts any alphanumeric/underscore label name
+  %0 = load i32* @i, align 4
+  %cmp = icmp eq i32 %0, 0  ; compare @i against zero
+  br i1 %cmp, label %if.end, label %if.then  ; zero: skip the store
+; 16:	beqz	${{[0-9]+}}, $[[LABEL:[0-9A-Za-z_]+]]
+; 16: $[[LABEL]]:
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
diff --git a/test/CodeGen/Mips/brconge.ll b/test/CodeGen/Mips/brconge.ll
new file mode 100644
index 0000000000000..02f0a633b3132
--- /dev/null
+++ b/test/CodeGen/Mips/brconge.ll
@@ -0,0 +1,37 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 10, align 4
+@k = global i32 5, align 4
+@result1 = global i32 0, align 4
+@result2 = global i32 1, align 4
+
+define void @test() nounwind {
+entry:  ; the label capture below accepts any alphanumeric/underscore label name
+  %0 = load i32* @i, align 4
+  %1 = load i32* @j, align 4
+  %cmp = icmp slt i32 %0, %1  ; signed @i < @j
+  br i1 %cmp, label %if.then, label %if.end
+
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	$[[LABEL:[0-9A-Za-z_]+]]
+; 16: $[[LABEL]]:
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result1, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %2 = load i32* @k, align 4
+  %cmp1 = icmp slt i32 %0, %2  ; signed @i < @k, reusing the earlier load of @i
+  br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2:                                         ; preds = %if.end
+  store i32 1, i32* @result1, align 4
+  br label %if.end3
+
+if.end3:                                          ; preds = %if.then2, %if.end
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brcongt.ll b/test/CodeGen/Mips/brcongt.ll
new file mode 100644
index 0000000000000..767b51b21b918
--- /dev/null
+++ b/test/CodeGen/Mips/brcongt.ll
@@ -0,0 +1,25 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 10, align 4
+@k = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:  ; the label capture below accepts any alphanumeric/underscore label name
+  %0 = load i32* @i, align 4
+  %1 = load i32* @j, align 4
+  %cmp = icmp sgt i32 %0, %1  ; signed @i > @j
+  br i1 %cmp, label %if.end, label %if.then  ; greater: skip the store
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	$[[LABEL:[0-9A-Za-z_]+]]
+; 16: $[[LABEL]]:
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconle.ll b/test/CodeGen/Mips/brconle.ll
new file mode 100644
index 0000000000000..854b2481c6e6f
--- /dev/null
+++ b/test/CodeGen/Mips/brconle.ll
@@ -0,0 +1,37 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 -5, align 4
+@j = global i32 10, align 4
+@k = global i32 -5, align 4
+@result1 = global i32 0, align 4
+@result2 = global i32 1, align 4
+
+define void @test() nounwind {
+entry:  ; the label capture below accepts any alphanumeric/underscore label name
+  %0 = load i32* @j, align 4
+  %1 = load i32* @i, align 4
+  %cmp = icmp sgt i32 %0, %1  ; signed @j > @i
+  br i1 %cmp, label %if.then, label %if.end
+
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	$[[LABEL:[0-9A-Za-z_]+]]
+; 16: $[[LABEL]]:
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result1, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %2 = load i32* @k, align 4
+  %cmp1 = icmp sgt i32 %1, %2  ; signed @i > @k, reusing the earlier load of @i
+  br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2:                                         ; preds = %if.end
+  store i32 0, i32* @result1, align 4
+  br label %if.end3
+
+if.end3:                                          ; preds = %if.then2, %if.end
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconlt.ll b/test/CodeGen/Mips/brconlt.ll
new file mode 100644
index 0000000000000..931a3e8c7ba49
--- /dev/null
+++ b/test/CodeGen/Mips/brconlt.ll
@@ -0,0 +1,27 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 10, align 4
+@k = global i32 5, align 4 ; never referenced by @test
+@result = global i32 0, align 4
+
+define void @test() nounwind { ; stores 1 to @result unless @j < @i (signed compare)
+entry: ; the label capture below accepts any alphanumeric/underscore block name
+  %0 = load i32* @j, align 4
+  %1 = load i32* @i, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %if.end, label %if.then ; the true edge skips the store
+
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	$[[LABEL:[0-9A-Za-z_]+]]
+; 16: $[[LABEL]]:
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconne.ll b/test/CodeGen/Mips/brconne.ll
new file mode 100644
index 0000000000000..5d5bde3fcf915
--- /dev/null
+++ b/test/CodeGen/Mips/brconne.ll
@@ -0,0 +1,26 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind { ; stores 1 to @result when @j equals @i
+entry: ; the label capture below accepts any alphanumeric/underscore block name
+  %0 = load i32* @j, align 4
+  %1 = load i32* @i, align 4
+  %cmp = icmp eq i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.end ; the store is reached only on equality
+; 16:	cmp	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	$[[LABEL:[0-9A-Za-z_]+]]
+; 16:   lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
+; 16: $[[LABEL]]:
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconnek.ll b/test/CodeGen/Mips/brconnek.ll
new file mode 100644
index 0000000000000..6208d7c5a04b5
--- /dev/null
+++ b/test/CodeGen/Mips/brconnek.ll
@@ -0,0 +1,25 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind { ; stores 1 to @result when @j equals the constant 5
+entry: ; the label capture below accepts any alphanumeric/underscore block name
+  %0 = load i32* @j, align 4
+  %cmp = icmp eq i32 %0, 5 ; compare against an immediate, hence the cmpi pattern
+  br i1 %cmp, label %if.then, label %if.end
+
+; 16:	cmpi	${{[0-9]+}}, {{[0-9]+}}
+; 16:	btnez	$[[LABEL:[0-9A-Za-z_]+]]
+; 16:   lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
+; 16: $[[LABEL]]:
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconnez.ll b/test/CodeGen/Mips/brconnez.ll
new file mode 100644
index 0000000000000..47db7901b5172
--- /dev/null
+++ b/test/CodeGen/Mips/brconnez.ll
@@ -0,0 +1,24 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 0, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind { ; stores 1 to @result when @j equals zero
+entry: ; the label capture below accepts any alphanumeric/underscore block name
+  %0 = load i32* @j, align 4
+  %cmp = icmp eq i32 %0, 0 ; compare against zero, hence the single bnez pattern
+  br i1 %cmp, label %if.then, label %if.end
+
+; 16:	bnez	${{[0-9]+}}, $[[LABEL:[0-9A-Za-z_]+]]
+; 16:   lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
+; 16: $[[LABEL]]:
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @result, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll
index b266ce61a8d16..2fdb736dc8862 100644
--- a/test/CodeGen/Mips/brdelayslot.ll
+++ b/test/CodeGen/Mips/brdelayslot.ll
@@ -1,15 +1,37 @@
-; RUN: llc -march=mipsel  -enable-mips-delay-filler < %s | FileCheck %s
+; RUN: llc -march=mipsel -O0 < %s | FileCheck %s -check-prefix=None
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=Default
 
 define void @foo1() nounwind {
 entry:
-; CHECK:      jalr 
-; CHECK-NOT:  nop 
-; CHECK:      jr 
-; CHECK-NOT:  nop
-; CHECK:      .end
+; Default:     jalr 
+; Default-NOT: nop 
+; Default:     jr 
+; Default-NOT: nop
+; Default:     .end
+; None: jalr 
+; None: nop 
+; None: jr 
+; None: nop
+; None: .end
 
   tail call void @foo2(i32 3) nounwind
   ret void
 }
 
 declare void @foo2(i32)
+
+; Check that cvt.d.w goes into jalr's delay slot.
+;
+define void @foo3(i32 %a) nounwind {
+entry: ; the patterns below are ordered but not adjacency-anchored (cvt.d.w need only appear after jalr)
+; Default:     foo3:
+; Default:     jalr
+; Default:     cvt.d.w
+
+  %conv = sitofp i32 %a to double ; signed int-to-double conversion whose result feeds the call
+  tail call void @foo4(double %conv) nounwind
+  ret void
+}
+
+declare void @foo4(double)
+
diff --git a/test/CodeGen/Mips/brind.ll b/test/CodeGen/Mips/brind.ll
new file mode 100644
index 0000000000000..4c591fa1bba1b
--- /dev/null
+++ b/test/CodeGen/Mips/brind.ll
@@ -0,0 +1,40 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@main.L = internal unnamed_addr constant [5 x i8*] [i8* blockaddress(@main, %L1), i8* blockaddress(@main, %L2), i8* blockaddress(@main, %L3), i8* blockaddress(@main, %L4), i8* null], align 4 ; jump table: four block addresses plus a null fifth entry
+@str = private unnamed_addr constant [2 x i8] c"A\00"
+@str5 = private unnamed_addr constant [2 x i8] c"B\00"
+@str6 = private unnamed_addr constant [2 x i8] c"C\00"
+@str7 = private unnamed_addr constant [2 x i8] c"D\00"
+@str8 = private unnamed_addr constant [2 x i8] c"E\00"
+
+define i32 @main() nounwind { ; prints via puts in each block and dispatches through @main.L with indirectbr
+entry:
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str, i32 0, i32 0))
+  br label %L1
+
+L1:                                               ; preds = %entry, %L3
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %L3 ]
+  %puts5 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str5, i32 0, i32 0))
+  br label %L2
+
+L2:                                               ; preds = %L1, %L3
+  %i.1 = phi i32 [ %i.0, %L1 ], [ %inc, %L3 ]
+  %puts6 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str6, i32 0, i32 0))
+  br label %L3
+
+L3:                                               ; preds = %L2, %L3
+  %i.2 = phi i32 [ %i.1, %L2 ], [ %inc, %L3 ]
+  %puts7 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str7, i32 0, i32 0))
+  %inc = add i32 %i.2, 1
+  %arrayidx = getelementptr inbounds [5 x i8*]* @main.L, i32 0, i32 %i.2 ; indexes with the pre-increment counter
+  %0 = load i8** %arrayidx, align 4
+  indirectbr i8* %0, [label %L1, label %L2, label %L3, label %L4] ; the only pattern in this file expects a jrc for this jump
+; 16: 	jrc	 ${{[0-9]+}}
+L4:                                               ; preds = %L3
+  %puts8 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str8, i32 0, i32 0))
+  ret i32 0
+}
+
+declare i32 @puts(i8* nocapture) nounwind
+
+
diff --git a/test/CodeGen/Mips/check-noat.ll b/test/CodeGen/Mips/check-noat.ll
new file mode 100644
index 0000000000000..bfeff677b34d0
--- /dev/null
+++ b/test/CodeGen/Mips/check-noat.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s 
+
+define void @f() nounwind readnone { ; body is a bare return; the patterns look for the f label, then .set noat, then .set at
+entry:
+; CHECK: f:
+; CHECK: .set  noat
+; CHECK: .set  at
+
+  ret void
+}
+
diff --git a/test/CodeGen/Mips/div.ll b/test/CodeGen/Mips/div.ll
new file mode 100644
index 0000000000000..00e2c1927459b
--- /dev/null
+++ b/test/CodeGen/Mips/div.ll
@@ -0,0 +1,18 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 100, align 4
+@jjjj = global i32 -4, align 4
+@kkkk = common global i32 0, align 4
+
+define void @test() nounwind { ; signed quotient of @iiii by @jjjj, stored to @kkkk
+entry:
+  %0 = load i32* @iiii, align 4
+  %1 = load i32* @jjjj, align 4
+  %div = sdiv i32 %0, %1 ; expected as a div followed by an mflo read
+; 16:	div	$zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+  store i32 %div, i32* @kkkk, align 4
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/div_rem.ll b/test/CodeGen/Mips/div_rem.ll
new file mode 100644
index 0000000000000..950192eee1694
--- /dev/null
+++ b/test/CodeGen/Mips/div_rem.ll
@@ -0,0 +1,21 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 103, align 4
+@jjjj = global i32 -4, align 4
+@kkkk = common global i32 0, align 4
+@llll = common global i32 0, align 4
+
+define void @test() nounwind { ; signed quotient goes to @kkkk and signed remainder to @llll, same operands
+entry:
+  %0 = load i32* @iiii, align 4
+  %1 = load i32* @jjjj, align 4
+  %div = sdiv i32 %0, %1
+  store i32 %div, i32* @kkkk, align 4
+  %rem = srem i32 %0, %1 ; the patterns list one div, then mflo and mfhi
+; 16:	div	$zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+; 16: 	mfhi	${{[0-9]+}}
+  store i32 %rem, i32* @llll, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/Mips/divu.ll b/test/CodeGen/Mips/divu.ll
new file mode 100644
index 0000000000000..b96a439390ca6
--- /dev/null
+++ b/test/CodeGen/Mips/divu.ll
@@ -0,0 +1,18 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 100, align 4
+@jjjj = global i32 4, align 4
+@kkkk = common global i32 0, align 4
+
+define void @test() nounwind { ; unsigned quotient of @iiii by @jjjj, stored to @kkkk
+entry:
+  %0 = load i32* @iiii, align 4
+  %1 = load i32* @jjjj, align 4
+  %div = udiv i32 %0, %1 ; expected as a divu followed by an mflo read
+; 16:	divu	$zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+  store i32 %div, i32* @kkkk, align 4
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/divu_remu.ll b/test/CodeGen/Mips/divu_remu.ll
new file mode 100644
index 0000000000000..a6c1563ac195f
--- /dev/null
+++ b/test/CodeGen/Mips/divu_remu.ll
@@ -0,0 +1,23 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 103, align 4
+@jjjj = global i32 4, align 4
+@kkkk = common global i32 0, align 4
+@llll = common global i32 0, align 4
+
+
+define void @test() nounwind { ; unsigned quotient goes to @kkkk and unsigned remainder to @llll, same operands
+entry:
+  %0 = load i32* @iiii, align 4
+  %1 = load i32* @jjjj, align 4
+  %div = udiv i32 %0, %1
+  store i32 %div, i32* @kkkk, align 4
+  %rem = urem i32 %0, %1 ; the patterns list one divu, then mflo and mfhi
+; 16:	divu	$zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+; 16: 	mfhi	${{[0-9]+}}
+  store i32 %rem, i32* @llll, align 4
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/dsp-r1.ll b/test/CodeGen/Mips/dsp-r1.ll
new file mode 100644
index 0000000000000..c9dc8cfd0be07
--- /dev/null
+++ b/test/CodeGen/Mips/dsp-r1.ll
@@ -0,0 +1,1241 @@
+; RUN: llc -march=mipsel -mattr=+dsp < %s | FileCheck %s
+
+define i32 @test__builtin_mips_extr_w1(i32 %i0, i32, i64 %a0) nounwind {
+entry: ; second intrinsic operand is the constant 15; the pattern expects extr.w rather than the extrv.w form
+; CHECK: extr.w
+
+  %1 = tail call i32 @llvm.mips.extr.w(i64 %a0, i32 15) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.extr.w(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extr_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry: ; second intrinsic operand is the argument %a1; the pattern expects the extrv.w form
+; CHECK: extrv.w
+
+  %1 = tail call i32 @llvm.mips.extr.w(i64 %a0, i32 %a1) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+define i32 @test__builtin_mips_extr_r_w1(i32 %i0, i32, i64 %a0) nounwind {
+entry: ; second intrinsic operand is the constant 15; the pattern expects extr_r.w
+; CHECK: extr_r.w
+
+  %1 = tail call i32 @llvm.mips.extr.r.w(i64 %a0, i32 15) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.extr.r.w(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extr_s_h1(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry: ; second intrinsic operand is the argument %a1; the pattern expects the extrv_s.h form
+; CHECK: extrv_s.h
+
+  %1 = tail call i32 @llvm.mips.extr.s.h(i64 %a0, i32 %a1) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.extr.s.h(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extr_rs_w1(i32 %i0, i32, i64 %a0) nounwind {
+entry: ; second intrinsic operand is the constant 15; the pattern expects extr_rs.w
+; CHECK: extr_rs.w
+
+  %1 = tail call i32 @llvm.mips.extr.rs.w(i64 %a0, i32 15) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.extr.rs.w(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extr_rs_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry: ; second intrinsic operand is the argument %a1; the pattern expects the extrv_rs.w form
+; CHECK: extrv_rs.w
+
+  %1 = tail call i32 @llvm.mips.extr.rs.w(i64 %a0, i32 %a1) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+define i32 @test__builtin_mips_extr_s_h2(i32 %i0, i32, i64 %a0) nounwind {
+entry: ; second intrinsic operand is the constant 15; the pattern expects extr_s.h
+; CHECK: extr_s.h
+
+  %1 = tail call i32 @llvm.mips.extr.s.h(i64 %a0, i32 15) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+define i32 @test__builtin_mips_extr_r_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry: ; second intrinsic operand is the argument %a1; the pattern expects the extrv_r.w form
+; CHECK: extrv_r.w
+
+  %1 = tail call i32 @llvm.mips.extr.r.w(i64 %a0, i32 %a1) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+define i32 @test__builtin_mips_extp1(i32 %i0, i32, i64 %a0) nounwind {
+entry: ; constant second operand (15); the pattern requires "extp " followed by a numbered register, which excludes extpv
+; CHECK: extp ${{[0-9]+}}
+
+  %1 = tail call i32 @llvm.mips.extp(i64 %a0, i32 15) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.extp(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extp2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry: ; second intrinsic operand is the argument %a1; the pattern expects the extpv form
+; CHECK: extpv
+
+  %1 = tail call i32 @llvm.mips.extp(i64 %a0, i32 %a1) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+define i32 @test__builtin_mips_extpdp1(i32 %i0, i32, i64 %a0) nounwind {
+entry: ; constant second operand (15); the pattern requires "extpdp " followed by a numbered register, which excludes extpdpv
+; CHECK: extpdp ${{[0-9]+}}
+
+  %1 = tail call i32 @llvm.mips.extpdp(i64 %a0, i32 15) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.extpdp(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extpdp2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry: ; second intrinsic operand is the argument %a1; the pattern expects the extpdpv form
+; CHECK: extpdpv
+
+  %1 = tail call i32 @llvm.mips.extpdp(i64 %a0, i32 %a1) ; %1 because the unnamed i32 parameter takes %0
+  ret i32 %1
+}
+
+define i64 @test__builtin_mips_dpau_h_qbl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry: ; both i32 operands are reinterpreted as <4 x i8>; %a0 is passed through as the i64 operand
+; CHECK: dpau.h.qbl
+
+  %1 = bitcast i32 %a1.coerce to <4 x i8> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <4 x i8>
+  %3 = tail call i64 @llvm.mips.dpau.h.qbl(i64 %a0, <4 x i8> %1, <4 x i8> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpau.h.qbl(i64, <4 x i8>, <4 x i8>) nounwind readnone
+
+define i64 @test__builtin_mips_dpau_h_qbr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry: ; both i32 operands are reinterpreted as <4 x i8>; %a0 is passed through as the i64 operand
+; CHECK: dpau.h.qbr
+
+  %1 = bitcast i32 %a1.coerce to <4 x i8> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <4 x i8>
+  %3 = tail call i64 @llvm.mips.dpau.h.qbr(i64 %a0, <4 x i8> %1, <4 x i8> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpau.h.qbr(i64, <4 x i8>, <4 x i8>) nounwind readnone
+
+define i64 @test__builtin_mips_dpsu_h_qbl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry: ; both i32 operands are reinterpreted as <4 x i8>; %a0 is passed through as the i64 operand
+; CHECK: dpsu.h.qbl
+
+  %1 = bitcast i32 %a1.coerce to <4 x i8> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <4 x i8>
+  %3 = tail call i64 @llvm.mips.dpsu.h.qbl(i64 %a0, <4 x i8> %1, <4 x i8> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsu.h.qbl(i64, <4 x i8>, <4 x i8>) nounwind readnone
+
+define i64 @test__builtin_mips_dpsu_h_qbr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry: ; both i32 operands are reinterpreted as <4 x i8>; %a0 is passed through as the i64 operand
+; CHECK: dpsu.h.qbr
+
+  %1 = bitcast i32 %a1.coerce to <4 x i8> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <4 x i8>
+  %3 = tail call i64 @llvm.mips.dpsu.h.qbr(i64 %a0, <4 x i8> %1, <4 x i8> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsu.h.qbr(i64, <4 x i8>, <4 x i8>) nounwind readnone
+
+define i64 @test__builtin_mips_dpaq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry: ; both i32 operands are reinterpreted as <2 x i16>; %a0 is passed through as the i64 operand
+; CHECK: dpaq_s.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpaq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpaq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_dpaq_sa_l_w1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind {
+entry: ; the i64 and both i32 arguments go to the intrinsic unchanged
+; CHECK: dpaq_sa.l.w
+
+  %1 = tail call i64 @llvm.mips.dpaq.sa.l.w(i64 %a0, i32 %a1, i32 %a2) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+declare i64 @llvm.mips.dpaq.sa.l.w(i64, i32, i32) nounwind
+
+define i64 @test__builtin_mips_dpsq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry: ; both i32 operands are reinterpreted as <2 x i16>; %a0 is passed through as the i64 operand
+; CHECK: dpsq_s.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpsq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_dpsq_sa_l_w1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind {
+entry: ; the i64 and both i32 arguments go to the intrinsic unchanged
+; CHECK: dpsq_sa.l.w
+
+  %1 = tail call i64 @llvm.mips.dpsq.sa.l.w(i64 %a0, i32 %a1, i32 %a2) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+declare i64 @llvm.mips.dpsq.sa.l.w(i64, i32, i32) nounwind
+
+define i64 @test__builtin_mips_mulsaq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry: ; both i32 operands are reinterpreted as <2 x i16>; %a0 is passed through as the i64 operand
+; CHECK: mulsaq_s.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.mulsaq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.mulsaq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_maq_s_w_phl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry: ; both i32 operands are reinterpreted as <2 x i16>; %a0 is passed through as the i64 operand
+; CHECK: maq_s.w.phl
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.maq.s.w.phl(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.maq.s.w.phl(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_maq_s_w_phr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry: ; both i32 operands are reinterpreted as <2 x i16>; %a0 is passed through as the i64 operand
+; CHECK: maq_s.w.phr
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.maq.s.w.phr(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.maq.s.w.phr(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_maq_sa_w_phl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry: ; both i32 operands are reinterpreted as <2 x i16>; %a0 is passed through as the i64 operand
+; CHECK: maq_sa.w.phl
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.maq.sa.w.phl(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.maq.sa.w.phl(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_maq_sa_w_phr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry: ; both i32 operands are reinterpreted as <2 x i16>; %a0 is passed through as the i64 operand
+; CHECK: maq_sa.w.phr
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16> ; %1 because the unnamed i32 parameter takes %0
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.maq.sa.w.phr(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.maq.sa.w.phr(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_shilo1(i32 %i0, i32, i64 %a0) nounwind readnone {
+entry: ; constant second operand (0); the pattern requires "shilo " followed by a one-digit $ac register, which excludes shilov
+; CHECK: shilo $ac{{[0-9]}}
+
+  %1 = tail call i64 @llvm.mips.shilo(i64 %a0, i32 0) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+declare i64 @llvm.mips.shilo(i64, i32) nounwind readnone
+
+define i64 @test__builtin_mips_shilo2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind readnone {
+entry: ; second intrinsic operand is the argument %a1; the pattern expects the shilov form
+; CHECK: shilov
+
+  %1 = tail call i64 @llvm.mips.shilo(i64 %a0, i32 %a1) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+define i64 @test__builtin_mips_mthlip1(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry: ; passes %a0 and %a1 straight to the intrinsic; the pattern expects mthlip with a numbered register operand
+; CHECK: mthlip ${{[0-9]+}}
+
+  %1 = tail call i64 @llvm.mips.mthlip(i64 %a0, i32 %a1) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+declare i64 @llvm.mips.mthlip(i64, i32) nounwind
+
+define i32 @test__builtin_mips_bposge321(i32 %i0) nounwind readonly {
+entry: ; the intrinsic takes no operands (%i0 is unused); the pattern expects bposge32 targeting a $BB label
+; CHECK: bposge32 $BB{{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.bposge32()
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.bposge32() nounwind readonly
+
+define i64 @test__builtin_mips_madd1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone {
+entry: ; the pattern requires "madd " with a one-digit $ac register, so maddu cannot satisfy it
+; CHECK: madd $ac{{[0-9]}}
+
+  %1 = tail call i64 @llvm.mips.madd(i64 %a0, i32 %a1, i32 %a2) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+declare i64 @llvm.mips.madd(i64, i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_maddu1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone {
+entry: ; the pattern expects maddu with a one-digit $ac register operand
+; CHECK: maddu $ac{{[0-9]}}
+
+  %1 = tail call i64 @llvm.mips.maddu(i64 %a0, i32 %a1, i32 %a2) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+declare i64 @llvm.mips.maddu(i64, i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_msub1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone {
+entry: ; the pattern requires "msub " with a one-digit $ac register, so msubu cannot satisfy it
+; CHECK: msub $ac{{[0-9]}}
+
+  %1 = tail call i64 @llvm.mips.msub(i64 %a0, i32 %a1, i32 %a2) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+declare i64 @llvm.mips.msub(i64, i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_msubu1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone {
+entry: ; the pattern expects msubu with a one-digit $ac register operand
+; CHECK: msubu $ac{{[0-9]}}
+
+  %1 = tail call i64 @llvm.mips.msubu(i64 %a0, i32 %a1, i32 %a2) ; %1 because the unnamed i32 parameter takes %0
+  ret i64 %1
+}
+
+declare i64 @llvm.mips.msubu(i64, i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_mult1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry: ; two i32 arguments in, i64 out (%i0 is unused); the pattern requires "mult " with a one-digit $ac register
+; CHECK: mult $ac{{[0-9]}}
+
+  %0 = tail call i64 @llvm.mips.mult(i32 %a0, i32 %a1)
+  ret i64 %0
+}
+
+declare i64 @llvm.mips.mult(i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_multu1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry: ; two i32 arguments in, i64 out (%i0 is unused); the pattern expects multu with a one-digit $ac register
+; CHECK: multu $ac{{[0-9]}}
+
+  %0 = tail call i64 @llvm.mips.multu(i32 %a0, i32 %a1)
+  ret i64 %0
+}
+
+declare i64 @llvm.mips.multu(i32, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_addq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <2 x i16>; %i0 is unused
+; CHECK: addq.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.addq.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addq.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_addq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <2 x i16>; %i0 is unused
+; CHECK: addq_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.addq.s.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addq.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_addq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry: ; plain i32 operands, no vector reinterpretation; %i0 is unused
+; CHECK: addq_s.w
+
+  %0 = tail call i32 @llvm.mips.addq.s.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.addq.s.w(i32, i32) nounwind
+
+define { i32 } @test__builtin_mips_addu_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <4 x i8>; %i0 is unused
+; CHECK: addu.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.addu.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.addu.qb(<4 x i8>, <4 x i8>) nounwind
+
+define { i32 } @test__builtin_mips_addu_s_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <4 x i8>; %i0 is unused
+; CHECK: addu_s.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.addu.s.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.addu.s.qb(<4 x i8>, <4 x i8>) nounwind
+
+define { i32 } @test__builtin_mips_subq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <2 x i16>; %i0 is unused
+; CHECK: subq.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.subq.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subq.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_subq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <2 x i16>; %i0 is unused
+; CHECK: subq_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.subq.s.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subq.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_subq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry: ; plain i32 operands, no vector reinterpretation; %i0 is unused
+; CHECK: subq_s.w
+
+  %0 = tail call i32 @llvm.mips.subq.s.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.subq.s.w(i32, i32) nounwind
+
+define { i32 } @test__builtin_mips_subu_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <4 x i8>; %i0 is unused
+; CHECK: subu.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.subu.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.subu.qb(<4 x i8>, <4 x i8>) nounwind
+
+define { i32 } @test__builtin_mips_subu_s_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <4 x i8>; %i0 is unused
+; CHECK: subu_s.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.subu.s.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.subu.s.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_addsc1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry: ; plain i32 operands (%i0 is unused); the pattern expects addsc with a numbered register operand
+; CHECK: addsc ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.addsc(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.addsc(i32, i32) nounwind
+
+define i32 @test__builtin_mips_addwc1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry: ; plain i32 operands (%i0 is unused); the pattern expects addwc with a numbered register operand
+; CHECK: addwc ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.addwc(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.addwc(i32, i32) nounwind
+
+define i32 @test__builtin_mips_modsub1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry: ; plain i32 operands (%i0 is unused); the pattern expects modsub with a numbered register operand
+; CHECK: modsub ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.modsub(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.modsub(i32, i32) nounwind readnone
+
+define i32 @test__builtin_mips_raddu_w_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry: ; single operand, viewed as <4 x i8>; the intrinsic returns a plain i32 (%i0 is unused)
+; CHECK: raddu.w.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call i32 @llvm.mips.raddu.w.qb(<4 x i8> %0)
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.raddu.w.qb(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_muleu_s_ph_qbl1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; mixed operand views: first is <4 x i8>, second is <2 x i16>; %i0 is unused
+; CHECK: muleu_s.ph.qbl
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.muleu.s.ph.qbl(<4 x i8> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.muleu.s.ph.qbl(<4 x i8>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_muleu_s_ph_qbr1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; mixed operand views: first is <4 x i8>, second is <2 x i16>; %i0 is unused
+; CHECK: muleu_s.ph.qbr
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.muleu.s.ph.qbr(<4 x i8> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.muleu.s.ph.qbr(<4 x i8>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_mulq_rs_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; i32 operands are viewed as <2 x i16>; %i0 is unused
+; CHECK: mulq_rs.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.mulq.rs.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.mulq.rs.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_muleq_s_w_phl1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; operands are viewed as <2 x i16>, but the intrinsic returns a plain i32 (%i0 is unused)
+; CHECK: muleq_s.w.phl
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call i32 @llvm.mips.muleq.s.w.phl(<2 x i16> %0, <2 x i16> %1)
+  ret i32 %2
+}
+
+declare i32 @llvm.mips.muleq.s.w.phl(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_muleq_s_w_phr1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; operands are viewed as <2 x i16>, but the intrinsic returns a plain i32 (%i0 is unused)
+; CHECK: muleq_s.w.phr
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call i32 @llvm.mips.muleq.s.w.phr(<2 x i16> %0, <2 x i16> %1)
+  ret i32 %2
+}
+
+declare i32 @llvm.mips.muleq.s.w.phr(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_precrq_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry: ; operands are viewed as <2 x i16> while the intrinsic result is <4 x i8>; %i0 is unused
+; CHECK: precrq.qb.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <4 x i8> @llvm.mips.precrq.qb.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <4 x i8> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.precrq.qb.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+define { i32 } @test__builtin_mips_precrq_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry: ; plain i32 operands in, <2 x i16> out; %i0 is unused
+; CHECK: precrq.ph.w
+
+  %0 = tail call <2 x i16> @llvm.mips.precrq.ph.w(i32 %a0, i32 %a1)
+  %1 = bitcast <2 x i16> %0 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precrq.ph.w(i32, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_precrq_rs_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry: ; plain i32 operands in, <2 x i16> out; %i0 is unused
+; CHECK: precrq_rs.ph.w
+
+  %0 = tail call <2 x i16> @llvm.mips.precrq.rs.ph.w(i32 %a0, i32 %a1)
+  %1 = bitcast <2 x i16> %0 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precrq.rs.ph.w(i32, i32) nounwind
+
+define { i32 } @test__builtin_mips_precrqu_s_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; operands are viewed as <2 x i16> while the intrinsic result is <4 x i8>; %i0 is unused
+; CHECK: precrqu_s.qb.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <4 x i8> @llvm.mips.precrqu.s.qb.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <4 x i8> %2 to i32 ; vector result goes back to i32 ...
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 ; ... and is wrapped in the one-field return struct
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.precrqu.s.qb.ph(<2 x i16>, <2 x i16>) nounwind
+
+
+define i32 @test__builtin_mips_cmpu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; the compare intrinsic returns void, so the function's result is read back with rddsp (%i0 is unused)
+; CHECK: cmpu.eq.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  tail call void @llvm.mips.cmpu.eq.qb(<4 x i8> %0, <4 x i8> %1)
+  %2 = tail call i32 @llvm.mips.rddsp(i32 31) ; NOTE(review): 31 is presumably a field mask - confirm against the intrinsic docs
+  ret i32 %2
+}
+
+declare void @llvm.mips.cmpu.eq.qb(<4 x i8>, <4 x i8>) nounwind
+
+declare i32 @llvm.mips.rddsp(i32) nounwind readonly
+
+define i32 @test__builtin_mips_cmpu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; the compare intrinsic returns void, so the function's result is read back with rddsp (%i0 is unused)
+; CHECK: cmpu.lt.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  tail call void @llvm.mips.cmpu.lt.qb(<4 x i8> %0, <4 x i8> %1)
+  %2 = tail call i32 @llvm.mips.rddsp(i32 31) ; NOTE(review): 31 is presumably a field mask - confirm against the intrinsic docs
+  ret i32 %2
+}
+
+declare void @llvm.mips.cmpu.lt.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; the compare intrinsic returns void, so the function's result is read back with rddsp (%i0 is unused)
+; CHECK: cmpu.le.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  tail call void @llvm.mips.cmpu.le.qb(<4 x i8> %0, <4 x i8> %1)
+  %2 = tail call i32 @llvm.mips.rddsp(i32 31) ; NOTE(review): 31 is presumably a field mask - confirm against the intrinsic docs
+  ret i32 %2
+}
+
+declare void @llvm.mips.cmpu.le.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpgu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; unlike the cmpu.* tests, this compare intrinsic returns its i32 result directly (%i0 is unused)
+; CHECK: cmpgu.eq.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call i32 @llvm.mips.cmpgu.eq.qb(<4 x i8> %0, <4 x i8> %1)
+  ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgu.eq.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpgu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; unlike the cmpu.* tests, this compare intrinsic returns its i32 result directly (%i0 is unused)
+; CHECK: cmpgu.lt.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call i32 @llvm.mips.cmpgu.lt.qb(<4 x i8> %0, <4 x i8> %1)
+  ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgu.lt.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpgu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry: ; unlike the cmpu.* tests, this compare intrinsic returns its i32 result directly (%i0 is unused)
+; CHECK: cmpgu.le.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call i32 @llvm.mips.cmpgu.le.qb(<4 x i8> %0, <4 x i8> %1)
+  ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgu.le.qb(<4 x i8>, <4 x i8>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.cmp.eq.ph and returns @llvm.mips.rddsp(i32 31); the check line expects "cmp.eq.ph".
+define i32 @test__builtin_mips_cmp_eq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmp.eq.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  tail call void @llvm.mips.cmp.eq.ph(<2 x i16> %0, <2 x i16> %1)
+  %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+  ret i32 %2
+}
+
+declare void @llvm.mips.cmp.eq.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.cmp.lt.ph and returns @llvm.mips.rddsp(i32 31); the check line expects "cmp.lt.ph".
+define i32 @test__builtin_mips_cmp_lt_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmp.lt.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  tail call void @llvm.mips.cmp.lt.ph(<2 x i16> %0, <2 x i16> %1)
+  %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+  ret i32 %2
+}
+
+declare void @llvm.mips.cmp.lt.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.cmp.le.ph and returns @llvm.mips.rddsp(i32 31); the check line expects "cmp.le.ph".
+define i32 @test__builtin_mips_cmp_le_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmp.le.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  tail call void @llvm.mips.cmp.le.ph(<2 x i16> %0, <2 x i16> %1)
+  %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+  ret i32 %2
+}
+
+declare void @llvm.mips.cmp.le.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <4 x i8> values (bitcast from i32) to @llvm.mips.pick.qb and returns the result as i32 wrapped in { i32 }; the check line expects "pick.qb".
+define { i32 } @test__builtin_mips_pick_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readonly {
+entry:
+; CHECK: pick.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.pick.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.pick.qb(<4 x i8>, <4 x i8>) nounwind readonly
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.pick.ph and returns the result as i32 wrapped in { i32 }; the check line expects "pick.ph".
+define { i32 } @test__builtin_mips_pick_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readonly {
+entry:
+; CHECK: pick.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.pick.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.pick.ph(<2 x i16>, <2 x i16>) nounwind readonly
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.packrl.ph and returns the result as i32 wrapped in { i32 }; the check line expects "packrl.ph".
+define { i32 } @test__builtin_mips_packrl_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: packrl.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.packrl.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.packrl.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+; Returns @llvm.mips.rddsp(i32 31) directly; the check line expects "rddsp" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_rddsp1(i32 %i0) nounwind readonly {
+entry:
+; CHECK: rddsp ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.rddsp(i32 31)
+  ret i32 %0
+}
+
+; Calls @llvm.mips.shll.qb on a <4 x i8> value with the constant second operand 3; the check line expects "shll.qb".
+define { i32 } @test__builtin_mips_shll_qb1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: shll.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.shll.qb(<4 x i8> %0, i32 3)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.shll.qb(<4 x i8>, i32) nounwind
+
+; Calls @llvm.mips.shll.qb on a <4 x i8> value with the non-constant second operand %a1; the check line expects "shllv.qb".
+define { i32 } @test__builtin_mips_shll_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind {
+entry:
+; CHECK: shllv.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.shll.qb(<4 x i8> %0, i32 %a1)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.shll.ph on a <2 x i16> value with the constant second operand 7; the check line expects "shll.ph".
+define { i32 } @test__builtin_mips_shll_ph1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: shll.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shll.ph(<2 x i16> %0, i32 7)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shll.ph(<2 x i16>, i32) nounwind
+
+; Calls @llvm.mips.shll.ph on a <2 x i16> value with the non-constant second operand %a1; the check line expects "shllv.ph".
+define { i32 } @test__builtin_mips_shll_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind {
+entry:
+; CHECK: shllv.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shll.ph(<2 x i16> %0, i32 %a1)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.shll.s.ph on a <2 x i16> value with the constant second operand 7; the check line expects "shll_s.ph".
+define { i32 } @test__builtin_mips_shll_s_ph1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: shll_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shll.s.ph(<2 x i16> %0, i32 7)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shll.s.ph(<2 x i16>, i32) nounwind
+
+; Calls @llvm.mips.shll.s.ph on a <2 x i16> value with the non-constant second operand %a1; the check line expects "shllv_s.ph".
+define { i32 } @test__builtin_mips_shll_s_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind {
+entry:
+; CHECK: shllv_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shll.s.ph(<2 x i16> %0, i32 %a1)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.shll.s.w on %a0 with the constant second operand 15; the check line expects "shll_s.w".
+define i32 @test__builtin_mips_shll_s_w1(i32 %i0, i32 %a0) nounwind {
+entry:
+; CHECK: shll_s.w
+
+  %0 = tail call i32 @llvm.mips.shll.s.w(i32 %a0, i32 15)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.shll.s.w(i32, i32) nounwind
+
+; Calls @llvm.mips.shll.s.w on %a0 with the non-constant second operand %a1; the check line expects "shllv_s.w".
+define i32 @test__builtin_mips_shll_s_w2(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: shllv_s.w
+
+  %0 = tail call i32 @llvm.mips.shll.s.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+; Calls @llvm.mips.shrl.qb on a <4 x i8> value with the constant second operand 3; the check line expects "shrl.qb".
+define { i32 } @test__builtin_mips_shrl_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shrl.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.shrl.qb(<4 x i8> %0, i32 3)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.shrl.qb(<4 x i8>, i32) nounwind readnone
+
+; Calls @llvm.mips.shrl.qb on a <4 x i8> value with the non-constant second operand %a1; the check line expects "shrlv.qb".
+define { i32 } @test__builtin_mips_shrl_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrlv.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.shrl.qb(<4 x i8> %0, i32 %a1)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.shra.ph on a <2 x i16> value with the constant second operand 7; the check line expects "shra.ph".
+define { i32 } @test__builtin_mips_shra_ph1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shra.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shra.ph(<2 x i16> %0, i32 7)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shra.ph(<2 x i16>, i32) nounwind readnone
+
+; Calls @llvm.mips.shra.ph on a <2 x i16> value with the non-constant second operand %a1; the check line expects "shrav.ph".
+define { i32 } @test__builtin_mips_shra_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shra.ph(<2 x i16> %0, i32 %a1)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.shra.r.ph on a <2 x i16> value with the constant second operand 7; the check line expects "shra_r.ph".
+define { i32 } @test__builtin_mips_shra_r_ph1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shra_r.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shra.r.ph(<2 x i16> %0, i32 7)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shra.r.ph(<2 x i16>, i32) nounwind readnone
+
+; Calls @llvm.mips.shra.r.ph on a <2 x i16> value with the non-constant second operand %a1; the check line expects "shrav_r.ph".
+define { i32 } @test__builtin_mips_shra_r_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav_r.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shra.r.ph(<2 x i16> %0, i32 %a1)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.shra.r.w on %a0 with the constant second operand 15; the check line expects "shra_r.w".
+define i32 @test__builtin_mips_shra_r_w1(i32 %i0, i32 %a0) nounwind readnone {
+entry:
+; CHECK: shra_r.w
+
+  %0 = tail call i32 @llvm.mips.shra.r.w(i32 %a0, i32 15)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.shra.r.w(i32, i32) nounwind readnone
+
+; Calls @llvm.mips.shra.r.w on %a0 with the non-constant second operand %a1; the check line expects "shrav_r.w".
+define i32 @test__builtin_mips_shra_r_w2(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav_r.w
+
+  %0 = tail call i32 @llvm.mips.shra.r.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+; Calls @llvm.mips.absq.s.ph on a <2 x i16> value (bitcast from i32) and returns the result as i32 wrapped in { i32 }; the check line expects "absq_s.ph".
+define { i32 } @test__builtin_mips_absq_s_ph1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: absq_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.absq.s.ph(<2 x i16> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.absq.s.ph(<2 x i16>) nounwind
+
+; Calls @llvm.mips.absq.s.w on %a0 and returns its i32 result; the check line expects "absq_s.w".
+define i32 @test__builtin_mips_absq_s_w1(i32 %i0, i32 %a0) nounwind {
+entry:
+; CHECK: absq_s.w
+
+  %0 = tail call i32 @llvm.mips.absq.s.w(i32 %a0)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.absq.s.w(i32) nounwind
+
+; Calls @llvm.mips.preceq.w.phl on a <2 x i16> value (bitcast from i32) and returns its i32 result; the check line expects "preceq.w.phl".
+define i32 @test__builtin_mips_preceq_w_phl1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceq.w.phl
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call i32 @llvm.mips.preceq.w.phl(<2 x i16> %0)
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.preceq.w.phl(<2 x i16>) nounwind readnone
+
+; Calls @llvm.mips.preceq.w.phr on a <2 x i16> value (bitcast from i32) and returns its i32 result; the check line expects "preceq.w.phr".
+define i32 @test__builtin_mips_preceq_w_phr1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceq.w.phr
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call i32 @llvm.mips.preceq.w.phr(<2 x i16> %0)
+  ret i32 %1
+}
+
+declare i32 @llvm.mips.preceq.w.phr(<2 x i16>) nounwind readnone
+
+; Calls @llvm.mips.precequ.ph.qbl on a <4 x i8> value (bitcast from i32) and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "precequ.ph.qbl".
+define { i32 } @test__builtin_mips_precequ_ph_qbl1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: precequ.ph.qbl
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbl(<4 x i8> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precequ.ph.qbl(<4 x i8>) nounwind readnone
+
+; Calls @llvm.mips.precequ.ph.qbr on a <4 x i8> value (bitcast from i32) and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "precequ.ph.qbr".
+define { i32 } @test__builtin_mips_precequ_ph_qbr1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: precequ.ph.qbr
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbr(<4 x i8> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precequ.ph.qbr(<4 x i8>) nounwind readnone
+
+; Calls @llvm.mips.precequ.ph.qbla on a <4 x i8> value (bitcast from i32) and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "precequ.ph.qbla".
+define { i32 } @test__builtin_mips_precequ_ph_qbla1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: precequ.ph.qbla
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbla(<4 x i8> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precequ.ph.qbla(<4 x i8>) nounwind readnone
+
+; Calls @llvm.mips.precequ.ph.qbra on a <4 x i8> value (bitcast from i32) and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "precequ.ph.qbra".
+define { i32 } @test__builtin_mips_precequ_ph_qbra1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: precequ.ph.qbra
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbra(<4 x i8> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precequ.ph.qbra(<4 x i8>) nounwind readnone
+
+; Calls @llvm.mips.preceu.ph.qbl on a <4 x i8> value (bitcast from i32) and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "preceu.ph.qbl".
+define { i32 } @test__builtin_mips_preceu_ph_qbl1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceu.ph.qbl
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbl(<4 x i8> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.preceu.ph.qbl(<4 x i8>) nounwind readnone
+
+; Calls @llvm.mips.preceu.ph.qbr on a <4 x i8> value (bitcast from i32) and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "preceu.ph.qbr".
+define { i32 } @test__builtin_mips_preceu_ph_qbr1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceu.ph.qbr
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbr(<4 x i8> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.preceu.ph.qbr(<4 x i8>) nounwind readnone
+
+; Calls @llvm.mips.preceu.ph.qbla on a <4 x i8> value (bitcast from i32) and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "preceu.ph.qbla".
+define { i32 } @test__builtin_mips_preceu_ph_qbla1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceu.ph.qbla
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbla(<4 x i8> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.preceu.ph.qbla(<4 x i8>) nounwind readnone
+
+; Calls @llvm.mips.preceu.ph.qbra on a <4 x i8> value (bitcast from i32) and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "preceu.ph.qbra".
+define { i32 } @test__builtin_mips_preceu_ph_qbra1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceu.ph.qbra
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbra(<4 x i8> %0)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.preceu.ph.qbra(<4 x i8>) nounwind readnone
+
+; Calls @llvm.mips.repl.qb with the constant operand 127 and returns the <4 x i8> result as i32 wrapped in { i32 }; the check line expects "repl.qb".
+define { i32 } @test__builtin_mips_repl_qb1(i32 %i0) nounwind readnone {
+entry:
+; CHECK: repl.qb
+
+  %0 = tail call <4 x i8> @llvm.mips.repl.qb(i32 127)
+  %1 = bitcast <4 x i8> %0 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.repl.qb(i32) nounwind readnone
+
+; Calls @llvm.mips.repl.qb with the non-constant operand %a0 and returns the <4 x i8> result as i32 wrapped in { i32 }; the check line expects "replv.qb".
+define { i32 } @test__builtin_mips_repl_qb2(i32 %i0, i32 %a0) nounwind readnone {
+entry:
+; CHECK: replv.qb
+
+  %0 = tail call <4 x i8> @llvm.mips.repl.qb(i32 %a0)
+  %1 = bitcast <4 x i8> %0 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.repl.ph with the constant operand 0 and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "repl.ph".
+define { i32 } @test__builtin_mips_repl_ph1(i32 %i0) nounwind readnone {
+entry:
+; CHECK: repl.ph
+
+  %0 = tail call <2 x i16> @llvm.mips.repl.ph(i32 0)
+  %1 = bitcast <2 x i16> %0 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.repl.ph(i32) nounwind readnone
+
+; Calls @llvm.mips.repl.ph with the non-constant operand %a0 and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "replv.ph".
+define { i32 } @test__builtin_mips_repl_ph2(i32 %i0, i32 %a0) nounwind readnone {
+entry:
+; CHECK: replv.ph
+
+  %0 = tail call <2 x i16> @llvm.mips.repl.ph(i32 %a0)
+  %1 = bitcast <2 x i16> %0 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.bitrev on %a0 and returns its i32 result; the check line expects "bitrev" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_bitrev1(i32 %i0, i32 %a0) nounwind readnone {
+entry:
+; CHECK: bitrev ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.bitrev(i32 %a0)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.bitrev(i32) nounwind readnone
+
+; Calls @llvm.mips.lbux with the pointer %a0 and the i32 %a1 and returns its i32 result; the check line expects "lbux" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_lbux1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly {
+entry:
+; CHECK: lbux ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.lbux(i8* %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.lbux(i8*, i32) nounwind readonly
+
+; Calls @llvm.mips.lhx with the pointer %a0 and the i32 %a1 and returns its i32 result; the check line expects "lhx" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_lhx1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly {
+entry:
+; CHECK: lhx ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.lhx(i8* %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.lhx(i8*, i32) nounwind readonly
+
+; Calls @llvm.mips.lwx with the pointer %a0 and the i32 %a1 and returns its i32 result; the check line expects "lwx" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_lwx1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly {
+entry:
+; CHECK: lwx ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.lwx(i8* %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.lwx(i8*, i32) nounwind readonly
+
+; Calls @llvm.mips.wrdsp(%a0, 31) and then returns @llvm.mips.rddsp(i32 31); the check line expects "wrdsp" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_wrdsp1(i32 %i0, i32 %a0) nounwind {
+entry:
+; CHECK: wrdsp ${{[0-9]+}}
+
+  tail call void @llvm.mips.wrdsp(i32 %a0, i32 31)
+  %0 = tail call i32 @llvm.mips.rddsp(i32 31)
+  ret i32 %0
+}
+
+declare void @llvm.mips.wrdsp(i32, i32) nounwind
diff --git a/test/CodeGen/Mips/dsp-r2.ll b/test/CodeGen/Mips/dsp-r2.ll
new file mode 100644
index 0000000000000..631f9e43c23ac
--- /dev/null
+++ b/test/CodeGen/Mips/dsp-r2.ll
@@ -0,0 +1,568 @@
+; RUN: llc -march=mipsel -mattr=+dspr2 < %s | FileCheck %s
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.dpa.w.ph and returns its i64 result; the check line expects "dpa.w.ph".
+define i64 @test__builtin_mips_dpa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpa.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.dps.w.ph and returns its i64 result; the check line expects "dps.w.ph".
+define i64 @test__builtin_mips_dps_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dps.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dps.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dps.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.mulsa.w.ph and returns its i64 result; the check line expects "mulsa.w.ph".
+define i64 @test__builtin_mips_mulsa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: mulsa.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.mulsa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.mulsa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.dpax.w.ph and returns its i64 result; the check line expects "dpax.w.ph".
+define i64 @test__builtin_mips_dpax_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpax.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpax.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpax.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.dpsx.w.ph and returns its i64 result; the check line expects "dpsx.w.ph".
+define i64 @test__builtin_mips_dpsx_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpsx.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpsx.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsx.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.dpaqx.s.w.ph and returns its i64 result; the check line expects "dpaqx_s.w.ph".
+define i64 @test__builtin_mips_dpaqx_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpaqx_s.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpaqx.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpaqx.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.dpaqx.sa.w.ph and returns its i64 result; the check line expects "dpaqx_sa.w.ph".
+define i64 @test__builtin_mips_dpaqx_sa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpaqx_sa.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpaqx.sa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpaqx.sa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.dpsqx.s.w.ph and returns its i64 result; the check line expects "dpsqx_s.w.ph".
+define i64 @test__builtin_mips_dpsqx_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpsqx_s.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpsqx.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsqx.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+; Passes the i64 %a0 and two <2 x i16> values (bitcast from i32) to @llvm.mips.dpsqx.sa.w.ph and returns its i64 result; the check line expects "dpsqx_sa.w.ph".
+define i64 @test__builtin_mips_dpsqx_sa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpsqx_sa.w.ph
+
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = bitcast i32 %a2.coerce to <2 x i16>
+  %3 = tail call i64 @llvm.mips.dpsqx.sa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+  ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsqx.sa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.addu.ph and returns the result as i32 wrapped in { i32 }; the check line expects "addu.ph".
+define { i32 } @test__builtin_mips_addu_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: addu.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.addu.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addu.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.addu.s.ph and returns the result as i32 wrapped in { i32 }; the check line expects "addu_s.ph".
+define { i32 } @test__builtin_mips_addu_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: addu_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.addu.s.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addu.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.mulq.s.ph and returns the result as i32 wrapped in { i32 }; the check line expects "mulq_s.ph".
+define { i32 } @test__builtin_mips_mulq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: mulq_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.mulq.s.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.mulq.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.subu.ph and returns the result as i32 wrapped in { i32 }; the check line expects "subu.ph".
+define { i32 } @test__builtin_mips_subu_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: subu.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.subu.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subu.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.subu.s.ph and returns the result as i32 wrapped in { i32 }; the check line expects "subu_s.ph".
+define { i32 } @test__builtin_mips_subu_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: subu_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.subu.s.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subu.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <4 x i8> values (bitcast from i32) to @llvm.mips.cmpgdu.eq.qb and returns its i32 result; the check line expects "cmpgdu.eq.qb".
+define i32 @test__builtin_mips_cmpgdu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgdu.eq.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call i32 @llvm.mips.cmpgdu.eq.qb(<4 x i8> %0, <4 x i8> %1)
+  ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgdu.eq.qb(<4 x i8>, <4 x i8>) nounwind
+
+; Passes two <4 x i8> values (bitcast from i32) to @llvm.mips.cmpgdu.lt.qb and returns its i32 result; the check line expects "cmpgdu.lt.qb".
+define i32 @test__builtin_mips_cmpgdu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgdu.lt.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call i32 @llvm.mips.cmpgdu.lt.qb(<4 x i8> %0, <4 x i8> %1)
+  ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgdu.lt.qb(<4 x i8>, <4 x i8>) nounwind
+
+; Passes two <4 x i8> values (bitcast from i32) to @llvm.mips.cmpgdu.le.qb and returns its i32 result; the check line expects "cmpgdu.le.qb".
+define i32 @test__builtin_mips_cmpgdu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgdu.le.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call i32 @llvm.mips.cmpgdu.le.qb(<4 x i8> %0, <4 x i8> %1)
+  ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgdu.le.qb(<4 x i8>, <4 x i8>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.precr.qb.ph and returns the <4 x i8> result as i32 wrapped in { i32 }; the check line expects "precr.qb.ph".
+define { i32 } @test__builtin_mips_precr_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: precr.qb.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <4 x i8> @llvm.mips.precr.qb.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <4 x i8> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.precr.qb.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Calls @llvm.mips.precr.sra.ph.w with %a0, %a1 and the constant 15 and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "precr_sra.ph.w".
+define { i32 } @test__builtin_mips_precr_sra_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: precr_sra.ph.w
+
+  %0 = tail call <2 x i16> @llvm.mips.precr.sra.ph.w(i32 %a0, i32 %a1, i32 15)
+  %1 = bitcast <2 x i16> %0 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precr.sra.ph.w(i32, i32, i32) nounwind readnone
+
+; Calls @llvm.mips.precr.sra.r.ph.w with %a0, %a1 and the constant 15 and returns the <2 x i16> result as i32 wrapped in { i32 }; the check line expects "precr_sra_r.ph.w".
+define { i32 } @test__builtin_mips_precr_sra_r_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: precr_sra_r.ph.w
+
+  %0 = tail call <2 x i16> @llvm.mips.precr.sra.r.ph.w(i32 %a0, i32 %a1, i32 15)
+  %1 = bitcast <2 x i16> %0 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precr.sra.r.ph.w(i32, i32, i32) nounwind readnone
+
+; Calls @llvm.mips.shra.qb on a <4 x i8> value with the constant second operand 3; the check line expects "shra.qb".
+define { i32 } @test__builtin_mips_shra_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shra.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.shra.qb(<4 x i8> %0, i32 3)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.shra.qb(<4 x i8>, i32) nounwind readnone
+
+; Calls @llvm.mips.shra.r.qb on a <4 x i8> value with the constant second operand 3; the check line expects "shra_r.qb".
+define { i32 } @test__builtin_mips_shra_r_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shra_r.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.shra.r.qb(<4 x i8> %0, i32 3)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.shra.r.qb(<4 x i8>, i32) nounwind readnone
+
+; Calls @llvm.mips.shra.qb on a <4 x i8> value with the non-constant second operand %a1; the check line expects "shrav.qb".
+define { i32 } @test__builtin_mips_shra_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.shra.qb(<4 x i8> %0, i32 %a1)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.shra.r.qb on a <4 x i8> value with the non-constant second operand %a1; the check line expects "shrav_r.qb".
+define { i32 } @test__builtin_mips_shra_r_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav_r.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.shra.r.qb(<4 x i8> %0, i32 %a1)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.shrl.ph on a <2 x i16> value with the constant second operand 7; the check line expects "shrl.ph".
+define { i32 } @test__builtin_mips_shrl_ph1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shrl.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shrl.ph(<2 x i16> %0, i32 7)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shrl.ph(<2 x i16>, i32) nounwind readnone
+
+; Calls @llvm.mips.shrl.ph on a <2 x i16> value with the non-constant second operand %a1; the check line expects "shrlv.ph".
+define { i32 } @test__builtin_mips_shrl_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrlv.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = tail call <2 x i16> @llvm.mips.shrl.ph(<2 x i16> %0, i32 %a1)
+  %2 = bitcast <2 x i16> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+; Calls @llvm.mips.absq.s.qb on a <4 x i8> value (bitcast from i32) and returns the result as i32 wrapped in { i32 }; the check line expects "absq_s.qb".
+define { i32 } @test__builtin_mips_absq_s_qb1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: absq_s.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = tail call <4 x i8> @llvm.mips.absq.s.qb(<4 x i8> %0)
+  %2 = bitcast <4 x i8> %1 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.absq.s.qb(<4 x i8>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.mul.ph and returns the result as i32 wrapped in { i32 }; the check line expects "mul.ph".
+define { i32 } @test__builtin_mips_mul_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: mul.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.mul.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.mul.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.mul.s.ph and returns the result as i32 wrapped in { i32 }; the check line expects "mul_s.ph".
+define { i32 } @test__builtin_mips_mul_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: mul_s.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.mul.s.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.mul.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+; Calls @llvm.mips.mulq.rs.w on %a0 and %a1 and returns its i32 result; the check line expects "mulq_rs.w".
+define i32 @test__builtin_mips_mulq_rs_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: mulq_rs.w
+
+  %0 = tail call i32 @llvm.mips.mulq.rs.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.mulq.rs.w(i32, i32) nounwind
+
+; Calls @llvm.mips.mulq.s.w on %a0 and %a1 and returns its i32 result; the check line expects "mulq_s.w".
+define i32 @test__builtin_mips_mulq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: mulq_s.w
+
+  %0 = tail call i32 @llvm.mips.mulq.s.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.mulq.s.w(i32, i32) nounwind
+
+; Passes two <4 x i8> values (bitcast from i32) to @llvm.mips.adduh.qb and returns the result as i32 wrapped in { i32 }; the check line expects "adduh.qb".
+define { i32 } @test__builtin_mips_adduh_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: adduh.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.adduh.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.adduh.qb(<4 x i8>, <4 x i8>) nounwind readnone
+
+; Passes two <4 x i8> values (bitcast from i32) to @llvm.mips.adduh.r.qb and returns the result as i32 wrapped in { i32 }; the check line expects "adduh_r.qb".
+define { i32 } @test__builtin_mips_adduh_r_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: adduh_r.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.adduh.r.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.adduh.r.qb(<4 x i8>, <4 x i8>) nounwind readnone
+
+; Passes two <4 x i8> values (bitcast from i32) to @llvm.mips.subuh.qb and returns the result as i32 wrapped in { i32 }; the check line expects "subuh.qb".
+define { i32 } @test__builtin_mips_subuh_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: subuh.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.subuh.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.subuh.qb(<4 x i8>, <4 x i8>) nounwind readnone
+
+; Passes two <4 x i8> values (bitcast from i32) to @llvm.mips.subuh.r.qb and returns the result as i32 wrapped in { i32 }; the check line expects "subuh_r.qb".
+define { i32 } @test__builtin_mips_subuh_r_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: subuh_r.qb
+
+  %0 = bitcast i32 %a0.coerce to <4 x i8>
+  %1 = bitcast i32 %a1.coerce to <4 x i8>
+  %2 = tail call <4 x i8> @llvm.mips.subuh.r.qb(<4 x i8> %0, <4 x i8> %1)
+  %3 = bitcast <4 x i8> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.subuh.r.qb(<4 x i8>, <4 x i8>) nounwind readnone
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.addqh.ph and returns the result as i32 wrapped in { i32 }; the check line expects "addqh.ph".
+define { i32 } @test__builtin_mips_addqh_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: addqh.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.addqh.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addqh.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.addqh.r.ph and returns the result as i32 wrapped in { i32 }; the check line expects "addqh_r.ph".
+define { i32 } @test__builtin_mips_addqh_r_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: addqh_r.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.addqh.r.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addqh.r.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+; Calls @llvm.mips.addqh.w on %a0 and %a1 and returns its i32 result; the check line expects "addqh.w".
+define i32 @test__builtin_mips_addqh_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: addqh.w
+
+  %0 = tail call i32 @llvm.mips.addqh.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.addqh.w(i32, i32) nounwind readnone
+
+; Calls @llvm.mips.addqh.r.w on %a0 and %a1 and returns its i32 result; the check line expects "addqh_r.w".
+define i32 @test__builtin_mips_addqh_r_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: addqh_r.w
+
+  %0 = tail call i32 @llvm.mips.addqh.r.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.addqh.r.w(i32, i32) nounwind readnone
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.subqh.ph and returns the result as i32 wrapped in { i32 }; the check line expects "subqh.ph".
+define { i32 } @test__builtin_mips_subqh_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: subqh.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.subqh.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subqh.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+; Passes two <2 x i16> values (bitcast from i32) to @llvm.mips.subqh.r.ph and returns the result as i32 wrapped in { i32 }; the check line expects "subqh_r.ph".
+define { i32 } @test__builtin_mips_subqh_r_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: subqh_r.ph
+
+  %0 = bitcast i32 %a0.coerce to <2 x i16>
+  %1 = bitcast i32 %a1.coerce to <2 x i16>
+  %2 = tail call <2 x i16> @llvm.mips.subqh.r.ph(<2 x i16> %0, <2 x i16> %1)
+  %3 = bitcast <2 x i16> %2 to i32
+  %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+  ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subqh.r.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+; Calls @llvm.mips.subqh.w on %a0 and %a1 and returns its i32 result; the check line expects "subqh.w".
+define i32 @test__builtin_mips_subqh_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: subqh.w
+
+  %0 = tail call i32 @llvm.mips.subqh.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.subqh.w(i32, i32) nounwind readnone
+
+; Calls @llvm.mips.subqh.r.w on %a0 and %a1 and returns its i32 result; the check line expects "subqh_r.w".
+define i32 @test__builtin_mips_subqh_r_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: subqh_r.w
+
+  %0 = tail call i32 @llvm.mips.subqh.r.w(i32 %a0, i32 %a1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.subqh.r.w(i32, i32) nounwind readnone
+
+; Calls @llvm.mips.append with %a0, %a1 and the constant 15 and returns its i32 result; the check line expects "append" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_append1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: append ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.append(i32 %a0, i32 %a1, i32 15)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.append(i32, i32, i32) nounwind readnone
+
+; Calls @llvm.mips.balign with %a0, %a1 and the constant 1 and returns its i32 result; the check line expects "balign" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_balign1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: balign ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.balign(i32 %a0, i32 %a1, i32 1)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.balign(i32, i32, i32) nounwind readnone
+
+; Calls @llvm.mips.prepend with %a0, %a1 and the constant 15 and returns its i32 result; the check line expects "prepend" followed by a "$" and a decimal register number.
+define i32 @test__builtin_mips_prepend1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: prepend ${{[0-9]+}}
+
+  %0 = tail call i32 @llvm.mips.prepend(i32 %a0, i32 %a1, i32 15)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.prepend(i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/Mips/eh-dwarf-cfa.ll b/test/CodeGen/Mips/eh-dwarf-cfa.ll
new file mode 100644
index 0000000000000..3a21332b5c5a3
--- /dev/null
+++ b/test/CodeGen/Mips/eh-dwarf-cfa.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | \
+; RUN:      FileCheck %s -check-prefix=CHECK-MIPS64
+
+declare i8* @llvm.eh.dwarf.cfa(i32) nounwind
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+
+define i8* @f1() nounwind {
+entry:
+  %x = alloca [32 x i8], align 1           ; unused 32-byte local; the expected prologue below adjusts $sp by -32
+  %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)  ; value under test, returned unchanged; expected as $sp + 32 in $2
+  ret i8* %0
+
+; CHECK:        addiu   $sp, $sp, -32
+; CHECK:        addiu   $2, $sp, 32
+}
+
+
+define i8* @f2() nounwind {
+entry:
+  %x = alloca [65536 x i8], align 1        ; unused 64 KiB local
+  %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)  ; value under test, returned unchanged
+  ret i8* %0
+
+; Check the stack adjustment: (65535 << 16) - 8 is -(65536 + 8) in 32 bits.
+; CHECK:        lui     $[[R0:[a-z0-9]+]], 65535
+; CHECK:        addiu   $[[R0]], $[[R0]], -8
+; CHECK:        addu    $sp, $sp, $[[R0]]
+
+; Check the return value: $sp + (1 << 16) + 8, i.e. $sp + stack size.
+; CHECK:        lui     $[[R1:[a-z0-9]+]], 1
+; CHECK:        addu    $[[R1]], $sp, $[[R1]]
+; CHECK:        addiu   $2, $[[R1]], 8
+}
+
+
+define i32 @f3() nounwind {
+entry:
+  %x = alloca [32 x i8], align 1           ; unused 32-byte local
+  %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+  %1 = ptrtoint i8* %0 to i32
+  %2 = call i8* @llvm.frameaddress(i32 0)  ; also request the frame address of this function
+  %3 = ptrtoint i8* %2 to i32
+  %add = add i32 %1, %3                    ; the sum of both values is the return value
+  ret i32 %add
+
+; CHECK:        addiu   $sp, $sp, -40
+
+; Check the return value: ($fp + stack size) + $fp, with a 40-byte frame.
+; CHECK:        addiu   $[[R0:[a-z0-9]+]], $fp, 40
+; CHECK:        addu    $2, $[[R0]], $fp
+}
+
+
+define i8* @f4() nounwind {
+entry:
+  %x = alloca [32 x i8], align 1           ; same body as @f1; only the mips64el run has expectations for this function
+  %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)  ; expected as $sp + 32 in $2, using daddiu
+  ret i8* %0
+
+; CHECK-MIPS64:        daddiu   $sp, $sp, -32
+; CHECK-MIPS64:        daddiu   $2, $sp, 32
+}
diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll
index bee93accd4284..aee58b650e7ac 100644
--- a/test/CodeGen/Mips/helloworld.ll
+++ b/test/CodeGen/Mips/helloworld.ll
@@ -24,10 +24,10 @@ entry:
 ; C1:	addiu	${{[0-9]+}}, %lo($.str)
 ; C2:	move	$25, ${{[0-9]+}}
 ; C1:	move 	$gp, ${{[0-9]+}}
-; C1:	jalr 	${{[0-9]+}}
+; C1:	jalrc 	${{[0-9]+}}
 ; SR:	restore 	$ra, [[FS]]
 ; PE:	li	$2, 0
-; PE:	jr 	$ra
+; PE:	jrc 	$ra
 
 }
 
diff --git a/test/CodeGen/Mips/i32k.ll b/test/CodeGen/Mips/i32k.ll
new file mode 100644
index 0000000000000..c6da8b1ac9a03
--- /dev/null
+++ b/test/CodeGen/Mips/i32k.ll
@@ -0,0 +1,17 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16a
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16b
+
+@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 1075344593) nounwind ; 1075344593 = (16408 << 16) + 29905
+; 16a:	li	${{[0-9]+}}, 29905
+; 16b:	li	${{[0-9]+}}, 16408
+  %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 -1075344593) nounwind ; as an unsigned 32-bit value: (49127 << 16) + 35631
+; 16a:	li	${{[0-9]+}}, 49127
+; 16b:	li	${{[0-9]+}}, 35631
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/Mips/init-array.ll b/test/CodeGen/Mips/init-array.ll
new file mode 100644
index 0000000000000..f96ce2647289f
--- /dev/null
+++ b/test/CodeGen/Mips/init-array.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple mipsel-unknown-linux -use-init-array < %s | FileCheck  %s
+
+target triple = "mipsel-unknown-linux"
+
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @test }] ; single entry: i32 65535 paired with @test
+; CHECK: .section
+; CHECK: .init_array
+; CHECK-NOT: .ctors
+; CHECK: .4byte test
+
+define internal void @test() section ".text.startup" { ; the function listed in @llvm.global_ctors; body is empty
+entry:
+  ret void
+}
diff --git a/test/CodeGen/Mips/largeimm1.ll b/test/CodeGen/Mips/largeimm1.ll
index d65cc025d085b..1c0f69c590114 100644
--- a/test/CodeGen/Mips/largeimm1.ll
+++ b/test/CodeGen/Mips/largeimm1.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=mipsel < %s | FileCheck %s
 
-; CHECK: lui $at, 49152
-; CHECK: lui $at, 16384
+; CHECK: lui ${{[0-9]+}}, 49152
+; CHECK: lui ${{[0-9]+}}, 16384
 define void @f() nounwind {
 entry:
   %a1 = alloca [1073741824 x i8], align 1
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index 2e548790cd39d..1e96346d1dd73 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | \
+; RUN: FileCheck %s -check-prefix=64
 
 %struct.S1 = type { [65536 x i8] }
 
@@ -6,9 +8,21 @@
 
 define void @f() nounwind {
 entry:
-; CHECK:  lui $at, 65535
-; CHECK:  addiu $at, $at, -16
-; CHECK:  addu  $sp, $sp, $at
+; 32:  lui $[[R0:[0-9]+]], 65535
+; 32:  addiu $[[R0]], $[[R0]], -24
+; 32:  addu $sp, $sp, $[[R0]]
+; 32:  lui $[[R1:[0-9]+]], 1
+; 32:  addu $[[R1]], $sp, $[[R1]]
+; 32:  sw $ra, 20($[[R1]])
+; 64:  daddiu  $[[R0:[0-9]+]], $zero, 1
+; 64:  dsll  $[[R0]], $[[R0]], 48
+; 64:  daddiu  $[[R0]], $[[R0]], -1
+; 64:  dsll  $[[R0]], $[[R0]], 16
+; 64:  daddiu  $[[R0]], $[[R0]], -48
+; 64:  daddu $sp, $sp, $[[R0]]
+; 64:  lui $[[R1:[0-9]+]], 1
+; 64:  daddu $[[R1]], $sp, $[[R1]]
+; 64:  sd  $ra, 40($[[R1]])
 
   %agg.tmp = alloca %struct.S1, align 1
   %tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/Mips/llcarry.ll b/test/CodeGen/Mips/llcarry.ll
new file mode 100644
index 0000000000000..7763daec3b32b
--- /dev/null
+++ b/test/CodeGen/Mips/llcarry.ll
@@ -0,0 +1,51 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i64 4294967295, align 8
+@j = global i64 15, align 8
+@ii = global i64 4294967295, align 8
+@k = common global i64 0, align 8
+@l = common global i64 0, align 8
+@m = common global i64 0, align 8
+
+define void @test1() nounwind {
+entry:
+  %0 = load i64* @i, align 8
+  %1 = load i64* @j, align 8
+  %add = add nsw i64 %1, %0
+  store i64 %add, i64* @k, align 8
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	${{[0-9]+}}, $t8
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+  ret void
+}
+
+define void @test2() nounwind {
+entry:
+  %0 = load i64* @i, align 8
+  %1 = load i64* @j, align 8
+  %sub = sub nsw i64 %0, %1
+; 16:	subu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	${{[0-9]+}}, $t8
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	subu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+  store i64 %sub, i64* @l, align 8
+  ret void
+}
+
+define void @test3() nounwind {
+entry:
+  %0 = load i64* @ii, align 8
+  %add = add nsw i64 %0, 15
+; 16:	addiu	${{[0-9]+}}, 15
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	${{[0-9]+}}, $t8
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+  store i64 %add, i64* @m, align 8
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 0227b88fbc868..1a4f79c191e14 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -6,9 +6,15 @@
 define void @foo1(i32 %s) nounwind {
 entry:
 ; O32: bal
+; O32: lui $1, 0
+; O32: addiu $1, $1, {{[0-9]+}} 
+; N64: lui $1, 0
+; N64: daddiu $1, $1, 0
+; N64: dsll $1, $1, 16
+; N64: daddiu $1, $1, 0
 ; N64: bal
-; N64: highest
-; N64: higher
+; N64: dsll $1, $1, 16
+; N64: daddiu $1, $1, {{[0-9]+}}  
 
   %tobool = icmp eq i32 %s, 0
   br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll
new file mode 100644
index 0000000000000..e26b0223b447f
--- /dev/null
+++ b/test/CodeGen/Mips/mips64-sret.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -O3 < %s | FileCheck %s
+
+%struct.S = type { [8 x i32] }
+
+@g = common global %struct.S zeroinitializer, align 4
+
+define void @f(%struct.S* noalias sret %agg.result) nounwind {
+entry:
+; CHECK: daddu $2, $zero, $4
+
+  %0 = bitcast %struct.S* %agg.result to i8*   ; sret buffer viewed as i8* for the memcpy call
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.S* @g to i8*), i64 32, i32 4, i1 false) ; copy 32 bytes (the [8 x i32] in %struct.S) from @g into the sret buffer
+  ret void                                     ; NOTE(review): the expected `daddu $2, $zero, $4` presumably copies the incoming sret pointer into the return register - confirm against the N64 ABI
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/Mips/misha.ll b/test/CodeGen/Mips/misha.ll
new file mode 100644
index 0000000000000..80637edb16746
--- /dev/null
+++ b/test/CodeGen/Mips/misha.ll
@@ -0,0 +1,69 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+define i32 @sumc(i8* nocapture %to, i8* nocapture %from, i32) nounwind {
+entry:
+  %sext = shl i32 %0, 16
+  %conv = ashr exact i32 %sext, 16
+  %cmp8 = icmp eq i32 %conv, 0
+  br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %.pre = load i8* %to, align 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %1 = phi i8 [ %.pre, %for.body.lr.ph ], [ %conv4, %for.body ]
+  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %from.addr.09 = phi i8* [ %from, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i8* %from.addr.09, i32 1
+  %2 = load i8* %from.addr.09, align 1
+  %conv27 = zext i8 %2 to i32
+  %conv36 = zext i8 %1 to i32
+  %add = add nsw i32 %conv36, %conv27
+  %conv4 = trunc i32 %add to i8
+  store i8 %conv4, i8* %to, align 1
+  %inc = add nsw i32 %i.010, 1
+  %cmp = icmp eq i32 %inc, %conv
+  br i1 %cmp, label %for.end, label %for.body
+; 16: sumc:
+; 16: 	lbu	${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: 	lbu	${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: sum:
+; 16: 	lhu	${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: 	lhu	${{[0-9]+}}, 0(${{[0-9]+}})
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 undef
+}
+
+define i32 @sum(i16* nocapture %to, i16* nocapture %from, i32) nounwind {
+entry:
+  %sext = shl i32 %0, 16
+  %conv = ashr exact i32 %sext, 16
+  %cmp8 = icmp eq i32 %conv, 0
+  br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %.pre = load i16* %to, align 2
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %1 = phi i16 [ %.pre, %for.body.lr.ph ], [ %conv4, %for.body ]
+  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %from.addr.09 = phi i16* [ %from, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i16* %from.addr.09, i32 1
+  %2 = load i16* %from.addr.09, align 2
+  %conv27 = zext i16 %2 to i32
+  %conv36 = zext i16 %1 to i32
+  %add = add nsw i32 %conv36, %conv27
+  %conv4 = trunc i32 %add to i16
+  store i16 %conv4, i16* %to, align 2
+  %inc = add nsw i32 %i.010, 1
+  %cmp = icmp eq i32 %inc, %conv
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 undef
+}
+
+
diff --git a/test/CodeGen/Mips/mul.ll b/test/CodeGen/Mips/mul.ll
new file mode 100644
index 0000000000000..4ce801b1c9f43
--- /dev/null
+++ b/test/CodeGen/Mips/mul.ll
@@ -0,0 +1,17 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 5, align 4
+@jjjj = global i32 -6, align 4
+@kkkk = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+  %0 = load i32* @iiii, align 4
+  %1 = load i32* @jjjj, align 4
+  %mul = mul nsw i32 %1, %0                ; 32-bit multiply; expected as mult followed by mflo
+; 16:	mult	${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+
+  store i32 %mul, i32* @kkkk, align 4      ; product is stored to @kkkk
+  ret void
+}
diff --git a/test/CodeGen/Mips/mulll.ll b/test/CodeGen/Mips/mulll.ll
new file mode 100644
index 0000000000000..e37b9197df828
--- /dev/null
+++ b/test/CodeGen/Mips/mulll.ll
@@ -0,0 +1,21 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i64 5, align 8
+@jjjj = global i64 -6, align 8
+@kkkk = common global i64 0, align 8
+
+define void @test() nounwind {
+entry:
+  %0 = load i64* @iiii, align 8
+  %1 = load i64* @jjjj, align 8
+  %mul = mul nsw i64 %1, %0                ; 64-bit multiply compiled for the 32-bit mipsel target of the RUN line
+  store i64 %mul, i64* @kkkk, align 8      ; expected expansion: one multu/mfhi pair, then two mult/mflo pairs
+; 16:	multu	${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mfhi	${{[0-9]+}}
+; 16:	mult	${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+; 16:	mult	${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+
+  ret void
+}
diff --git a/test/CodeGen/Mips/mulull.ll b/test/CodeGen/Mips/mulull.ll
new file mode 100644
index 0000000000000..4d23c693184b7
--- /dev/null
+++ b/test/CodeGen/Mips/mulull.ll
@@ -0,0 +1,21 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i64 5, align 8
+@jjjj = global i64 6, align 8
+@kkkk = common global i64 0, align 8
+@.str = private unnamed_addr constant [20 x i8] c"%lld * %lld = %lld\0A\00", align 1
+
+define void @test() nounwind {
+entry:
+  %0 = load i64* @iiii, align 8
+  %1 = load i64* @jjjj, align 8
+  %mul = mul nsw i64 %1, %0                ; 64-bit multiply compiled for the 32-bit mipsel target of the RUN line
+  store i64 %mul, i64* @kkkk, align 8      ; expected expansion: one multu/mfhi pair, then two mult/mflo pairs
+; 16:	multu	${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mfhi	${{[0-9]+}}
+; 16:	mult	${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+; 16:	mult	${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mflo	${{[0-9]+}}
+  ret void
+}
diff --git a/test/CodeGen/Mips/null.ll b/test/CodeGen/Mips/null.ll
index 7beae99c45574..00c66a9928f62 100644
--- a/test/CodeGen/Mips/null.ll
+++ b/test/CodeGen/Mips/null.ll
@@ -8,6 +8,6 @@ entry:
 ; 16: 	.set	mips16                  # @main
 
 
-; 16:	jr	$ra
+; 16:	jrc	$ra
 
 }
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index eac0d80c1c570..5558ba6e10f47 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -119,6 +119,16 @@ entry:
   ret void
 }
 
+%struct.S4 = type { [4 x i32] }
+
+define void @f5(i64 %a0, %struct.S4* nocapture byval %a1) nounwind {
+entry:
+  tail call void @f6(%struct.S4* byval %a1, i64 %a0) nounwind ; forwards both arguments to @f6 in swapped order
+  ret void                                                    ; NOTE(review): no expectations accompany this function in the hunk - presumably a compile-only regression test; confirm
+}
+
+declare void @f6(%struct.S4* nocapture byval, i64)
+
 !0 = metadata !{metadata !"int", metadata !1}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Mips/rem.ll b/test/CodeGen/Mips/rem.ll
new file mode 100644
index 0000000000000..b18f85dcbecfb
--- /dev/null
+++ b/test/CodeGen/Mips/rem.ll
@@ -0,0 +1,19 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 103, align 4
+@jjjj = global i32 -4, align 4
+@kkkk = common global i32 0, align 4
+
+
+define void @test() nounwind {
+entry:
+  %0 = load i32* @iiii, align 4
+  %1 = load i32* @jjjj, align 4
+  %rem = srem i32 %0, %1                   ; signed remainder; expected as div followed by mfhi
+; 16:	div	$zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mfhi	${{[0-9]+}}
+  store i32 %rem, i32* @kkkk, align 4      ; remainder is stored to @kkkk
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/remat-immed-load.ll b/test/CodeGen/Mips/remat-immed-load.ll
new file mode 100644
index 0000000000000..d93964bcaef6f
--- /dev/null
+++ b/test/CodeGen/Mips/remat-immed-load.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=64
+
+define void @f0() nounwind {
+entry:
+; 32:  addiu $4, $zero, 1
+; 32:  addiu $4, $zero, 1
+
+  tail call void @foo1(i32 1) nounwind     ; first call with the constant 1
+  tail call void @foo1(i32 1) nounwind     ; second call; the expectations require the immediate load to appear twice
+  ret void
+}
+
+declare void @foo1(i32)
+
+define void @f3() nounwind {
+entry:
+; 64:  daddiu $4, $zero, 1
+; 64:  daddiu $4, $zero, 1
+
+  tail call void @foo2(i64 1) nounwind     ; first call with the i64 constant 1
+  tail call void @foo2(i64 1) nounwind     ; second call; the expectations require the immediate load to appear twice
+  ret void
+}
+
+declare void @foo2(i64)
+
+define void @f5() nounwind {
+entry:
+; 32:  lui $4, 1
+; 32:  lui $4, 1
+
+  tail call void @f6(i32 65536) nounwind   ; 65536 = 1 << 16, expected as `lui $4, 1`
+  tail call void @f6(i32 65536) nounwind   ; second call; the expectations require the lui to appear twice
+  ret void
+}
+
+declare void @f6(i32)
+
+define void @f7() nounwind {
+entry:
+; 64:  lui $4, 1
+; 64:  lui $4, 1
+
+  tail call void @f8(i64 65536) nounwind   ; 65536 = 1 << 16, expected as `lui $4, 1`
+  tail call void @f8(i64 65536) nounwind   ; second call; the expectations require the lui to appear twice
+  ret void
+}
+
+declare void @f8(i64)
+
diff --git a/test/CodeGen/Mips/remu.ll b/test/CodeGen/Mips/remu.ll
new file mode 100644
index 0000000000000..472503c38403d
--- /dev/null
+++ b/test/CodeGen/Mips/remu.ll
@@ -0,0 +1,18 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 103, align 4
+@jjjj = global i32 4, align 4
+@kkkk = common global i32 0, align 4
+@.str = private unnamed_addr constant [15 x i8] c"%u = %u %% %u\0A\00", align 1
+
+define void @test() nounwind {
+entry:
+  %0 = load i32* @iiii, align 4
+  %1 = load i32* @jjjj, align 4
+  %rem = urem i32 %0, %1                   ; unsigned remainder; expected as divu followed by mfhi
+; 16:	divu	$zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: 	mfhi	${{[0-9]+}}
+  store i32 %rem, i32* @kkkk, align 4      ; remainder is stored to @kkkk
+  ret void
+}
+
diff --git a/test/CodeGen/Mips/return-vector.ll b/test/CodeGen/Mips/return-vector.ll
new file mode 100644
index 0000000000000..739c43c68a55e
--- /dev/null
+++ b/test/CodeGen/Mips/return-vector.ll
@@ -0,0 +1,244 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+
+; Check that the caller reads the returned vector from the stack when the
+; vector can't be returned in registers. Also check that the caller passes,
+; in register $4, the stack address where the vector should be placed.
+
+
+declare <8 x i32>    @i8(...)
+declare <4 x float>  @f4(...)
+declare <4 x double> @d4(...)
+
+define i32 @call_i8() {
+entry:
+  %call = call <8 x i32> (...)* @i8()
+  %v0 = extractelement <8 x i32> %call, i32 0
+  %v1 = extractelement <8 x i32> %call, i32 1
+  %v2 = extractelement <8 x i32> %call, i32 2
+  %v3 = extractelement <8 x i32> %call, i32 3
+  %v4 = extractelement <8 x i32> %call, i32 4
+  %v5 = extractelement <8 x i32> %call, i32 5
+  %v6 = extractelement <8 x i32> %call, i32 6
+  %v7 = extractelement <8 x i32> %call, i32 7
+  %add1 = add i32 %v0, %v1
+  %add2 = add i32 %v2, %v3
+  %add3 = add i32 %v4, %v5
+  %add4 = add i32 %v6, %v7
+  %add5 = add i32 %add1, %add2
+  %add6 = add i32 %add3, %add4
+  %add7 = add i32 %add5, %add6
+  ret i32 %add7
+
+; CHECK:        call_i8:
+; CHECK:        call16(i8)
+; CHECK:        addiu   $4, $sp, 32
+; CHECK:        lw      $[[R0:[a-z0-9]+]], 60($sp)
+; CHECK:        lw      $[[R1:[a-z0-9]+]], 56($sp)
+; CHECK:        lw      $[[R2:[a-z0-9]+]], 52($sp)
+; CHECK:        lw      $[[R3:[a-z0-9]+]], 48($sp)
+; CHECK:        lw      $[[R4:[a-z0-9]+]], 44($sp)
+; CHECK:        lw      $[[R5:[a-z0-9]+]], 40($sp)
+; CHECK:        lw      $[[R6:[a-z0-9]+]], 36($sp)
+; CHECK:        lw      $[[R7:[a-z0-9]+]], 32($sp)
+}
+
+
+define float @call_f4() {
+entry:
+  %call = call <4 x float> (...)* @f4()
+  %v0 = extractelement <4 x float> %call, i32 0
+  %v1 = extractelement <4 x float> %call, i32 1
+  %v2 = extractelement <4 x float> %call, i32 2
+  %v3 = extractelement <4 x float> %call, i32 3
+  %add1 = fadd float %v0, %v1
+  %add2 = fadd float %v2, %v3
+  %add3 = fadd float %add1, %add2
+  ret float %add3
+
+; CHECK:        call_f4:
+; CHECK:        call16(f4)
+; CHECK:        addiu   $4, $sp, 16
+; CHECK:        lwc1    $[[R0:[a-z0-9]+]], 28($sp)
+; CHECK:        lwc1    $[[R1:[a-z0-9]+]], 24($sp)
+; CHECK:        lwc1    $[[R3:[a-z0-9]+]], 20($sp)
+; CHECK:        lwc1    $[[R4:[a-z0-9]+]], 16($sp)
+}
+
+
+define double @call_d4() {
+entry:
+  %call = call <4 x double> (...)* @d4()
+  %v0 = extractelement <4 x double> %call, i32 0
+  %v1 = extractelement <4 x double> %call, i32 1
+  %v2 = extractelement <4 x double> %call, i32 2
+  %v3 = extractelement <4 x double> %call, i32 3
+  %add1 = fadd double %v0, %v1
+  %add2 = fadd double %v2, %v3
+  %add3 = fadd double %add1, %add2
+  ret double %add3
+
+; CHECK:        call_d4:
+; CHECK:        call16(d4)
+; CHECK:        addiu   $4, $sp, 32
+; CHECK:        ldc1    $[[R0:[a-z0-9]+]], 56($sp)
+; CHECK:        ldc1    $[[R1:[a-z0-9]+]], 48($sp)
+; CHECK:        ldc1    $[[R3:[a-z0-9]+]], 40($sp)
+; CHECK:        ldc1    $[[R4:[a-z0-9]+]], 32($sp)
+}
+
+
+
+; Check that the caller reads the returned vector from registers when the
+; vector can be returned in registers.
+
+
+declare <4 x i32>    @i4(...)
+declare <2 x float>  @f2(...)
+declare <2 x double> @d2(...)
+
+define i32 @call_i4() {
+entry:
+  %call = call <4 x i32> (...)* @i4()
+  %v0 = extractelement <4 x i32> %call, i32 0
+  %v1 = extractelement <4 x i32> %call, i32 1
+  %v2 = extractelement <4 x i32> %call, i32 2
+  %v3 = extractelement <4 x i32> %call, i32 3
+  %add1 = add i32 %v0, %v1
+  %add2 = add i32 %v2, %v3
+  %add3 = add i32 %add1, %add2
+  ret i32 %add3
+
+; CHECK:        call_i4:
+; CHECK:        call16(i4)
+; CHECK-NOT:    lw
+; CHECK:        addu    $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]]
+; CHECK:        addu    $[[R5:[a-z0-9]+]], $[[R3:[a-z0-9]+]], $[[R4:[a-z0-9]+]]
+; CHECK:        addu    $[[R6:[a-z0-9]+]], $[[R5]], $[[R2]]
+}
+
+
+define float @call_f2() {
+entry:
+  %call = call <2 x float> (...)* @f2()
+  %v0 = extractelement <2 x float> %call, i32 0
+  %v1 = extractelement <2 x float> %call, i32 1
+  %add1 = fadd float %v0, %v1
+  ret float %add1
+
+; CHECK:        call_f2:
+; CHECK:        call16(f2)
+; CHECK-NOT:    lwc1
+; CHECK:        add.s    $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]]
+}
+
+
+define double @call_d2() {
+entry:
+  %call = call <2 x double> (...)* @d2()
+  %v0 = extractelement <2 x double> %call, i32 0
+  %v1 = extractelement <2 x double> %call, i32 1
+  %add1 = fadd double %v0, %v1
+  ret double %add1
+
+; CHECK:        call_d2:
+; CHECK:        call16(d2)
+; CHECK-NOT:    ldc1
+; CHECK:        add.d    $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]]
+}
+
+
+
+; Check that a function returns its vector on the stack when the vector can't
+; be returned in registers. Also check that the vector is stored starting at
+; the address held in register $4.
+
+
+define <8 x i32> @return_i8() {
+entry:
+  ret <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+; CHECK:        return_i8:
+; CHECK:        sw      $[[R0:[a-z0-9]+]], 28($4)
+; CHECK:        sw      $[[R1:[a-z0-9]+]], 24($4)
+; CHECK:        sw      $[[R2:[a-z0-9]+]], 20($4)
+; CHECK:        sw      $[[R3:[a-z0-9]+]], 16($4)
+; CHECK:        sw      $[[R4:[a-z0-9]+]], 12($4)
+; CHECK:        sw      $[[R5:[a-z0-9]+]], 8($4)
+; CHECK:        sw      $[[R6:[a-z0-9]+]], 4($4)
+; CHECK:        sw      $[[R7:[a-z0-9]+]], 0($4)
+}
+
+
+define <4 x float> @return_f4(float %a, float %b, float %c, float %d) {
+entry:
+  %vecins1 = insertelement <4 x float> undef,    float %a, i32 0
+  %vecins2 = insertelement <4 x float> %vecins1, float %b, i32 1
+  %vecins3 = insertelement <4 x float> %vecins2, float %c, i32 2
+  %vecins4 = insertelement <4 x float> %vecins3, float %d, i32 3
+  ret <4 x float> %vecins4
+
+; CHECK:        return_f4:
+; CHECK:        lwc1    $[[R0:[a-z0-9]+]], 16($sp)
+; CHECK:        swc1    $[[R0]], 12($4)
+; CHECK:        sw      $7, 8($4)
+; CHECK:        sw      $6, 4($4)
+; CHECK:        sw      $5, 0($4)
+}
+
+
+define <4 x double> @return_d4(double %a, double %b, double %c, double %d) {
+entry:
+  %vecins1 = insertelement <4 x double> undef,    double %a, i32 0
+  %vecins2 = insertelement <4 x double> %vecins1, double %b, i32 1
+  %vecins3 = insertelement <4 x double> %vecins2, double %c, i32 2
+  %vecins4 = insertelement <4 x double> %vecins3, double %d, i32 3
+  ret <4 x double> %vecins4
+
+; CHECK:        return_d4:
+; CHECK:        sdc1    $[[R0:[a-z0-9]+]], 24($4)
+; CHECK:        sdc1    $[[R1:[a-z0-9]+]], 16($4)
+; CHECK:        sdc1    $[[R2:[a-z0-9]+]], 8($4)
+; CHECK:        sdc1    $[[R3:[a-z0-9]+]], 0($4)
+}
+
+
+
+; Check that a function returns its vector in registers when the vector can be
+; returned in registers.
+
+
+define <4 x i32> @return_i4() {
+entry:
+  ret <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+; CHECK:        return_i4:
+; CHECK:        addiu   $2, $zero, 0
+; CHECK:        addiu   $3, $zero, 1
+; CHECK:        addiu   $4, $zero, 2
+; CHECK:        addiu   $5, $zero, 3
+}
+
+
+define <2 x float> @return_f2(float %a, float %b) {
+entry:
+  %vecins1 = insertelement <2 x float> undef,    float %a, i32 0
+  %vecins2 = insertelement <2 x float> %vecins1, float %b, i32 1
+  ret <2 x float> %vecins2
+
+; CHECK:        return_f2:
+; CHECK:        mov.s   $f0, $f12
+; CHECK:        mov.s   $f2, $f14
+}
+
+
+define <2 x double> @return_d2(double %a, double %b) {
+entry:
+  %vecins1 = insertelement <2 x double> undef,    double %a, i32 0
+  %vecins2 = insertelement <2 x double> %vecins1, double %b, i32 1
+  ret <2 x double> %vecins2
+
+; CHECK:        return_d2:
+; CHECK:        mov.d   $f0, $f12
+; CHECK:        mov.d   $f2, $f14
+}
diff --git a/test/CodeGen/Mips/selpat.ll b/test/CodeGen/Mips/selpat.ll
new file mode 100644
index 0000000000000..cda0c96ef4bed
--- /dev/null
+++ b/test/CodeGen/Mips/selpat.ll
@@ -0,0 +1,350 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@t = global i32 10, align 4
+@f = global i32 199, align 4
+@a = global i32 1, align 4
+@b = global i32 10, align 4
+@c = global i32 1, align 4
+@z1 = common global i32 0, align 4
+@z2 = common global i32 0, align 4
+@z3 = common global i32 0, align 4
+@z4 = common global i32 0, align 4
+
+define void @calc_seleq() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp eq i32 %0, %1
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	cmp	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  store i32 %cond, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp eq i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %3, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond10, i32* @z4, align 4
+  ret void
+}
+
+
+define void @calc_seleqk() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp eq i32 %0, 1
+  %1 = load i32* @t, align 4
+  %2 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	cmpi	${{[0-9]+}}, 1
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp eq i32 %0, 10
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %3 = load i32* @b, align 4
+  %cmp6 = icmp eq i32 %3, 3
+  %cond10 = select i1 %cmp6, i32 %2, i32 %1
+  store i32 %cond10, i32* @z3, align 4
+; 16:	cmpi	${{[0-9]+}}, 10
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp11 = icmp eq i32 %3, 10
+  %cond15 = select i1 %cmp11, i32 %1, i32 %2
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_seleqz() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp eq i32 %0, 0
+  %1 = load i32* @t, align 4
+  %2 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	beqz	${{[0-9]+}}, .+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %3 = load i32* @b, align 4
+  %cmp1 = icmp eq i32 %3, 0
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp eq i32 %4, 0
+  %cond10 = select i1 %cmp6, i32 %1, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selge() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp sge i32 %0, %1
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp sge i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp sge i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %3, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp sge i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %3, i32 %2
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define i32 @calc_selgt() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp sgt i32 %0, %1
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+  %cmp1 = icmp sgt i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp sgt i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %2, i32 %3
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp sgt i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %2, i32 %3
+  store i32 %cond15, i32* @z4, align 4
+  ret i32 undef
+}
+
+define void @calc_selle() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp sle i32 %0, %1
+  %2 = load i32* @t, align 4
+  %3 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp sle i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp sle i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %2, i32 %3
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp sle i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %2, i32 %3
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selltk() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp slt i32 %0, 10
+  %1 = load i32* @t, align 4
+  %2 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	slti	${{[0-9]+}}, {{[0-9]+}}
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %3 = load i32* @b, align 4
+  %cmp1 = icmp slt i32 %3, 2
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp sgt i32 %4, 2
+  %cond10 = select i1 %cmp6, i32 %2, i32 %1
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp sgt i32 %0, 2
+  %cond15 = select i1 %cmp11, i32 %2, i32 %1
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+
+define void @calc_selne() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp ne i32 %0, %1
+  %2 = load i32* @t, align 4
+  %3 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	cmp	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  store i32 %cond, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp ne i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %3, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond10, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selnek() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp ne i32 %0, 1
+  %1 = load i32* @f, align 4
+  %2 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	cmpi	${{[0-9]+}}, 1
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp ne i32 %0, 10
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %3 = load i32* @b, align 4
+  %cmp6 = icmp ne i32 %3, 3
+  %cond10 = select i1 %cmp6, i32 %2, i32 %1
+  store i32 %cond10, i32* @z3, align 4
+; 16:	cmpi	${{[0-9]+}}, 10
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp11 = icmp ne i32 %3, 10
+  %cond15 = select i1 %cmp11, i32 %1, i32 %2
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selnez() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp ne i32 %0, 0
+  %1 = load i32* @f, align 4
+  %2 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	bnez	${{[0-9]+}}, .+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %3 = load i32* @b, align 4
+  %cmp1 = icmp ne i32 %3, 0
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp ne i32 %4, 0
+  %cond10 = select i1 %cmp6, i32 %1, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selnez2() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %tobool = icmp ne i32 %0, 0
+  %1 = load i32* @f, align 4
+  %2 = load i32* @t, align 4
+  %cond = select i1 %tobool, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	bnez	${{[0-9]+}}, .+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %3 = load i32* @b, align 4
+  %tobool1 = icmp ne i32 %3, 0
+  %cond5 = select i1 %tobool1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %tobool6 = icmp ne i32 %4, 0
+  %cond10 = select i1 %tobool6, i32 %1, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_seluge() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp uge i32 %0, %1
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp uge i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp uge i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %3, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp uge i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %3, i32 %2
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selugt() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp ugt i32 %0, %1
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp ugt i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp ugt i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %2, i32 %3
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp ugt i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %2, i32 %3
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selule() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp ule i32 %0, %1
+  %2 = load i32* @t, align 4
+  %3 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp ule i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp ule i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %2, i32 %3
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp ule i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %2, i32 %3
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/seteq.ll b/test/CodeGen/Mips/seteq.ll
new file mode 100644
index 0000000000000..da840c83a2b47
--- /dev/null
+++ b/test/CodeGen/Mips/seteq.ll
@@ -0,0 +1,21 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 1, align 4
+@j = global i32 10, align 4
+@k = global i32 1, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(i eq k)
+entry:
+  %0 = load i32* @i, align 4
+  %1 = load i32* @k, align 4
+  %cmp = icmp eq i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	xor	$[[REGISTER:[0-9A-Ba-b_]+]], ${{[0-9]+}}
+; 16:	sltiu	$[[REGISTER]], 1
+; 16:	move	${{[0-9]+}}, $t8
+  ret void
+}
+
diff --git a/test/CodeGen/Mips/seteqz.ll b/test/CodeGen/Mips/seteqz.ll
new file mode 100644
index 0000000000000..d445be6aedb01
--- /dev/null
+++ b/test/CodeGen/Mips/seteqz.ll
@@ -0,0 +1,24 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 0, align 4
+@j = global i32 99, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(i eq 0), r2 = zext(j eq 99)
+entry:
+  %0 = load i32* @i, align 4
+  %cmp = icmp eq i32 %0, 0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	sltiu	${{[0-9]+}}, 1
+; 16:	move	${{[0-9]+}}, $t8
+  %1 = load i32* @j, align 4
+  %cmp1 = icmp eq i32 %1, 99
+  %conv2 = zext i1 %cmp1 to i32
+  store i32 %conv2, i32* @r2, align 4
+; 16:	xor	$[[REGISTER:[0-9A-Ba-b_]+]], ${{[0-9]+}}
+; 16:	sltiu	$[[REGISTER]], 1
+; 16:	move	${{[0-9]+}}, $t8
+  ret void
+}
diff --git a/test/CodeGen/Mips/setge.ll b/test/CodeGen/Mips/setge.ll
new file mode 100644
index 0000000000000..94b499bc31e94
--- /dev/null
+++ b/test/CodeGen/Mips/setge.ll
@@ -0,0 +1,27 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 -5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+@.str = private unnamed_addr constant [22 x i8] c"1 = %i\0A1 = %i\0A0 = %i\0A\00", align 1
+
+define void @test() nounwind { ; r1 = zext(k sge j), r2 = zext(k sge m)
+entry:
+  %0 = load i32* @k, align 4
+  %1 = load i32* @j, align 4
+  %cmp = icmp sge i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	$[[REGISTER:[0-9]+]], $t8
+; 16:	xor	$[[REGISTER]], ${{[0-9]+}}
+  %2 = load i32* @m, align 4
+  %cmp1 = icmp sge i32 %0, %2 ; no check lines follow this second compare
+  %conv2 = zext i1 %cmp1 to i32
+  store i32 %conv2, i32* @r2, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/setgek.ll b/test/CodeGen/Mips/setgek.ll
new file mode 100644
index 0000000000000..b6bae09bcb5b5
--- /dev/null
+++ b/test/CodeGen/Mips/setgek.ll
@@ -0,0 +1,18 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@k = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(k sgt -32769)
+entry:
+  %0 = load i32* @k, align 4
+  %cmp = icmp sgt i32 %0, -32769 ; the check lines below expect an slti against -32768 followed by an xor
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	slti	${{[0-9]+}}, -32768
+; 16:	move	${{[0-9]+}}, $t8
+; 16:	xor	${{[0-9]+}}, ${{[0-9]+}}
+  ret void
+}
diff --git a/test/CodeGen/Mips/setle.ll b/test/CodeGen/Mips/setle.ll
new file mode 100644
index 0000000000000..f36fb4392d766
--- /dev/null
+++ b/test/CodeGen/Mips/setle.ll
@@ -0,0 +1,26 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 -5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(j sle k), r2 = zext(m sle k)
+entry:
+  %0 = load i32* @j, align 4
+  %1 = load i32* @k, align 4
+  %cmp = icmp sle i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	$[[REGISTER:[0-9]+]], $t8
+; 16:	xor	$[[REGISTER]], ${{[0-9]+}}
+  %2 = load i32* @m, align 4
+  %cmp1 = icmp sle i32 %2, %1 ; no check lines follow this second compare
+  %conv2 = zext i1 %cmp1 to i32
+  store i32 %conv2, i32* @r2, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/setlt.ll b/test/CodeGen/Mips/setlt.ll
new file mode 100644
index 0000000000000..435be8e2334a0
--- /dev/null
+++ b/test/CodeGen/Mips/setlt.ll
@@ -0,0 +1,21 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 -5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(j slt k)
+entry:
+  %0 = load i32* @j, align 4
+  %1 = load i32* @k, align 4
+  %cmp = icmp slt i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	${{[0-9]+}}, $t8
+  ret void
+}
diff --git a/test/CodeGen/Mips/setltk.ll b/test/CodeGen/Mips/setltk.ll
new file mode 100644
index 0000000000000..c0b610e377849
--- /dev/null
+++ b/test/CodeGen/Mips/setltk.ll
@@ -0,0 +1,20 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 -5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(j slt 10), compare against an immediate
+entry:
+  %0 = load i32* @j, align 4
+  %cmp = icmp slt i32 %0, 10
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	slti	$[[REGISTER:[0-9]+]], 10
+; 16:	move	$[[REGISTER]], $t8
+  ret void
+}
diff --git a/test/CodeGen/Mips/setne.ll b/test/CodeGen/Mips/setne.ll
new file mode 100644
index 0000000000000..6460c83c7b0b9
--- /dev/null
+++ b/test/CodeGen/Mips/setne.ll
@@ -0,0 +1,20 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 1, align 4
+@j = global i32 10, align 4
+@k = global i32 1, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(i ne k)
+entry:
+  %0 = load i32* @i, align 4
+  %1 = load i32* @k, align 4
+  %cmp = icmp ne i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	xor	$[[REGISTER:[0-9]+]], ${{[0-9]+}}
+; 16:	sltu	${{[0-9]+}}, $[[REGISTER]]
+; 16:	move	${{[0-9]+}}, $t8
+  ret void
+}
diff --git a/test/CodeGen/Mips/setuge.ll b/test/CodeGen/Mips/setuge.ll
new file mode 100644
index 0000000000000..ac72b66e9fb02
--- /dev/null
+++ b/test/CodeGen/Mips/setuge.ll
@@ -0,0 +1,26 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(k uge j), r2 = zext(k uge m)
+entry:
+  %0 = load i32* @k, align 4
+  %1 = load i32* @j, align 4
+  %cmp = icmp uge i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move    $[[REGISTER:[0-9]+]], $t8
+; 16:	xor	$[[REGISTER]], ${{[0-9]+}}
+  %2 = load i32* @m, align 4
+  %cmp1 = icmp uge i32 %0, %2 ; no check lines follow this second compare
+  %conv2 = zext i1 %cmp1 to i32
+  store i32 %conv2, i32* @r2, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/setugt.ll b/test/CodeGen/Mips/setugt.ll
new file mode 100644
index 0000000000000..328f0e3be34a4
--- /dev/null
+++ b/test/CodeGen/Mips/setugt.ll
@@ -0,0 +1,21 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(k ugt j)
+entry:
+  %0 = load i32* @k, align 4
+  %1 = load i32* @j, align 4
+  %cmp = icmp ugt i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move    ${{[0-9]+}}, $t8
+  ret void
+}
diff --git a/test/CodeGen/Mips/setule.ll b/test/CodeGen/Mips/setule.ll
new file mode 100644
index 0000000000000..792f2ae0fa29f
--- /dev/null
+++ b/test/CodeGen/Mips/setule.ll
@@ -0,0 +1,26 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(j ule k), r2 = zext(m ule k)
+entry:
+  %0 = load i32* @j, align 4
+  %1 = load i32* @k, align 4
+  %cmp = icmp ule i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	$[[REGISTER:[0-9]+]], $t8
+; 16:	xor	$[[REGISTER]], ${{[0-9]+}}
+  %2 = load i32* @m, align 4
+  %cmp1 = icmp ule i32 %2, %1 ; no check lines follow this second compare
+  %conv2 = zext i1 %cmp1 to i32
+  store i32 %conv2, i32* @r2, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/setult.ll b/test/CodeGen/Mips/setult.ll
new file mode 100644
index 0000000000000..56d2e8daa3e0f
--- /dev/null
+++ b/test/CodeGen/Mips/setult.ll
@@ -0,0 +1,21 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(j ult k)
+entry:
+  %0 = load i32* @j, align 4
+  %1 = load i32* @k, align 4
+  %cmp = icmp ult i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	${{[0-9]+}}, $t8
+  ret void
+}
diff --git a/test/CodeGen/Mips/setultk.ll b/test/CodeGen/Mips/setultk.ll
new file mode 100644
index 0000000000000..75b270ed84288
--- /dev/null
+++ b/test/CodeGen/Mips/setultk.ll
@@ -0,0 +1,20 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind { ; r1 = zext(j ult 10), compare against an immediate
+entry:
+  %0 = load i32* @j, align 4
+  %cmp = icmp ult i32 %0, 10
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @r1, align 4
+; 16:	sltiu	$[[REGISTER:[0-9]+]], 10
+; 16:	move	$[[REGISTER]], $t8
+  ret void
+}
diff --git a/test/CodeGen/Mips/small-section-reserve-gp.ll b/test/CodeGen/Mips/small-section-reserve-gp.ll
new file mode 100644
index 0000000000000..03503fb2ae18e
--- /dev/null
+++ b/test/CodeGen/Mips/small-section-reserve-gp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static < %s \
+; RUN: | FileCheck %s
+
+@i = internal unnamed_addr global i32 0, align 4
+
+define i32 @geti() nounwind readonly { ; returns the value of the internal global @i
+entry:
+; CHECK: lw ${{[0-9]+}}, %gp_rel(i)($gp)
+  %0 = load i32* @i, align 4 ; the check line above expects this load to be gp-relative
+  ret i32 %0
+}
+
diff --git a/test/CodeGen/Mips/stchar.ll b/test/CodeGen/Mips/stchar.ll
new file mode 100644
index 0000000000000..c00c9fd9d2a13
--- /dev/null
+++ b/test/CodeGen/Mips/stchar.ll
@@ -0,0 +1,90 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16_h
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16_b
+
+@.str = private unnamed_addr constant [9 x i8] c"%hd %c \0A\00", align 1
+@sp = common global i16* null, align 4
+@cp = common global i8* null, align 4
+
+define void @p1(i16 signext %s, i8 signext %c) nounwind { ; prints %s and %c through printf using @.str
+entry:
+  %conv = sext i16 %s to i32
+  %conv1 = sext i8 %c to i32
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv, i32 %conv1) nounwind
+  ret void
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+define void @p2() nounwind { ; prints the values behind @sp and @cp, then overwrites them with 32 and 97
+entry:
+  %0 = load i16** @sp, align 4
+  %1 = load i16* %0, align 2
+  %2 = load i8** @cp, align 4
+  %3 = load i8* %2, align 1
+  %conv.i = sext i16 %1 to i32
+  %conv1.i = sext i8 %3 to i32
+  %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
+  %4 = load i16** @sp, align 4 ; pointers are reloaded after the call before storing through them
+  store i16 32, i16* %4, align 2
+  %5 = load i8** @cp, align 4
+  store i8 97, i8* %5, align 1
+  ret void
+}
+
+define void @test() nounwind { ; halfword and byte stores/loads on stack slots published through @sp and @cp
+entry:
+  %s = alloca i16, align 4
+  %c = alloca i8, align 4
+  store i16 16, i16* %s, align 4
+  store i8 99, i8* %c, align 4
+  store i16* %s, i16** @sp, align 4 ; publish the addresses of the locals in the globals
+  store i8* %c, i8** @cp, align 4
+  %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
+  %0 = load i16** @sp, align 4
+  store i16 32, i16* %0, align 2
+  %1 = load i8** @cp, align 4
+  store i8 97, i8* %1, align 1
+  %2 = load i16* %s, align 4 ; reload the locals directly after the stores made through the pointers
+  %3 = load i8* %c, align 4
+  %conv.i = sext i16 %2 to i32
+  %conv1.i = sext i8 %3 to i32
+  %call.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
+  ret void
+; 16_b: test:
+; 16_h: test:
+; 16_b:	sb	${{[0-9]+}}, [[offset1:[0-9]+]](${{[0-9]+}})
+; 16_b: lb      ${{[0-9]+}}, [[offset1]](${{[0-9]+}})
+; 16_h:	sh	${{[0-9]+}}, [[offset2:[0-9]+]](${{[0-9]+}})
+; 16_h: lh      ${{[0-9]+}}, [[offset2]](${{[0-9]+}})
+}
+
+define i32 @main() nounwind { ; same store/print/reload sequence as @test, wrapped in lifetime markers; returns 0
+entry:
+  %s.i = alloca i16, align 4
+  %c.i = alloca i8, align 4
+  %0 = bitcast i16* %s.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.start(i64 -1, i8* %c.i) nounwind
+  store i16 16, i16* %s.i, align 4
+  store i8 99, i8* %c.i, align 4
+  store i16* %s.i, i16** @sp, align 4
+  store i8* %c.i, i8** @cp, align 4
+  %call.i.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
+  %1 = load i16** @sp, align 4
+  store i16 32, i16* %1, align 2
+  %2 = load i8** @cp, align 4
+  store i8 97, i8* %2, align 1
+  %3 = load i16* %s.i, align 4
+  %4 = load i8* %c.i, align 4
+  %conv.i.i = sext i16 %3 to i32
+  %conv1.i.i = sext i8 %4 to i32
+  %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i.i, i32 %conv1.i.i) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %c.i) nounwind
+  ret i32 0
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
diff --git a/test/CodeGen/Mips/stldst.ll b/test/CodeGen/Mips/stldst.ll
new file mode 100644
index 0000000000000..4182b9e76d63c
--- /dev/null
+++ b/test/CodeGen/Mips/stldst.ll
@@ -0,0 +1,41 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@kkkk = global i32 67, align 4
+@llll = global i32 33, align 4
+@mmmm = global i32 44, align 4
+@nnnn = global i32 55, align 4
+@oooo = global i32 32, align 4
+@pppp = global i32 41, align 4
+@qqqq = global i32 59, align 4
+@rrrr = global i32 60, align 4
+@.str = private unnamed_addr constant [32 x i8] c"%i %i %i %i %i %i %i %i %i %i \0A\00", align 1
+
+define i32 @main() nounwind { ; loads eight globals, derives eight adjusted values, and passes them to two printf calls
+entry:
+  %0 = load i32* @kkkk, align 4
+  %1 = load i32* @llll, align 4
+  %add = add nsw i32 %0, 10
+  %add1 = add nsw i32 %1, 10
+  %2 = load i32* @mmmm, align 4
+  %sub = add nsw i32 %2, -3
+  %3 = load i32* @nnnn, align 4
+  %add2 = add nsw i32 %3, 10
+  %4 = load i32* @oooo, align 4
+  %add3 = add nsw i32 %4, 4
+  %5 = load i32* @pppp, align 4
+  %sub4 = add nsw i32 %5, -5
+  %6 = load i32* @qqqq, align 4
+  %sub5 = add nsw i32 %6, -10
+  %7 = load i32* @rrrr, align 4
+  %add6 = add nsw i32 %7, 6
+
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str, i32 0, i32 0), i32 %sub5, i32 %add6, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) nounwind
+  %call7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str, i32 0, i32 0), i32 %0, i32 %1, i32 %add, i32 %add1, i32 %sub, i32 %add2, i32 %add3, i32 %sub4, i32 %sub5, i32 %add6) nounwind
+  ret i32 0 ; the check lines after this function expect folded spill and reload pairs
+}
+; 16:	sw	${{[0-9]+}}, {{[0-9]+}} ( $sp );         # 4-byte Folded Spill
+; 16:	lw	${{[0-9]+}}, {{[0-9]+}} ( $sp );         # 4-byte Folded Reload
+; 16:	sw	${{[0-9]+}}, {{[0-9]+}} ( $sp );         # 4-byte Folded Spill
+; 16:	lw	${{[0-9]+}}, {{[0-9]+}} ( $sp );         # 4-byte Folded Reload
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll
new file mode 100644
index 0000000000000..bcd33fca70ed6
--- /dev/null
+++ b/test/CodeGen/Mips/tailcall.ll
@@ -0,0 +1,245 @@
+; RUN: llc -march=mipsel -relocation-model=pic -enable-mips-tail-calls < %s | \
+; RUN: FileCheck %s -check-prefix=PIC32
+; RUN: llc -march=mipsel -relocation-model=static \
+; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=STATIC32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -enable-mips-tail-calls \
+; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic \
+; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=PIC16
+
+@g0 = common global i32 0, align 4
+@g1 = common global i32 0, align 4
+@g2 = common global i32 0, align 4
+@g3 = common global i32 0, align 4
+@g4 = common global i32 0, align 4
+@g5 = common global i32 0, align 4
+@g6 = common global i32 0, align 4
+@g7 = common global i32 0, align 4
+@g8 = common global i32 0, align 4
+@g9 = common global i32 0, align 4
+
+define i32 @caller1(i32 %a0) nounwind { ; tail call to @callee1 with four i32 arguments
+entry:
+; PIC32-NOT: jalr
+; STATIC32-NOT: jal
+; N64-NOT: jalr
+; PIC16: jalrc
+
+  %call = tail call i32 @callee1(i32 1, i32 1, i32 1, i32 %a0) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee1(i32, i32, i32, i32)
+
+define i32 @caller2(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { ; four incoming arguments, tail call passes five
+entry:
+; PIC32: jalr
+; STATIC32: jal
+; N64-NOT: jalr
+; PIC16: jalrc
+
+  %call = tail call i32 @callee2(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee2(i32, i32, i32, i32, i32)
+
+define i32 @caller3(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind { ; five incoming arguments, tail call passes eight
+entry:
+; PIC32: jalr
+; STATIC32: jal
+; N64-NOT: jalr
+; PIC16: jalrc
+
+  %call = tail call i32 @callee3(i32 1, i32 1, i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee3(i32, i32, i32, i32, i32, i32, i32, i32)
+
+define i32 @caller4(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { ; eight incoming arguments, tail call passes nine
+entry:
+; PIC32: jalr
+; STATIC32: jal
+; N64: jalr
+; PIC16: jalrc
+
+  %call = tail call i32 @callee4(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee4(i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+define i32 @caller5() nounwind readonly { ; loads @g0 through @g9 and tail-calls the internal fastcc @callee5 with all ten
+entry:
+; PIC32: .ent caller5
+; PIC32-NOT: jalr
+; PIC32: .end caller5
+; STATIC32: .ent caller5
+; STATIC32-NOT: jal
+; STATIC32: .end caller5
+; N64: .ent caller5
+; N64-NOT: jalr
+; N64: .end caller5
+; PIC16: .ent caller5
+; PIC16: jalrc
+; PIC16: .end caller5
+
+  %0 = load i32* @g0, align 4
+  %1 = load i32* @g1, align 4
+  %2 = load i32* @g2, align 4
+  %3 = load i32* @g3, align 4
+  %4 = load i32* @g4, align 4
+  %5 = load i32* @g5, align 4
+  %6 = load i32* @g6, align 4
+  %7 = load i32* @g7, align 4
+  %8 = load i32* @g8, align 4
+  %9 = load i32* @g9, align 4
+  %call = tail call fastcc i32 @callee5(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9)
+  ret i32 %call
+}
+
+define internal fastcc i32 @callee5(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) nounwind readnone noinline { ; returns the sum of its ten arguments
+entry:
+  %add = add nsw i32 %a1, %a0
+  %add1 = add nsw i32 %add, %a2
+  %add2 = add nsw i32 %add1, %a3
+  %add3 = add nsw i32 %add2, %a4
+  %add4 = add nsw i32 %add3, %a5
+  %add5 = add nsw i32 %add4, %a6
+  %add6 = add nsw i32 %add5, %a7
+  %add7 = add nsw i32 %add6, %a8
+  %add8 = add nsw i32 %add7, %a9
+  ret i32 %add8
+}
+
+declare i32 @callee8(i32, ...)
+
+define i32 @caller8_0() nounwind { ; wrapper that tail-calls the internal fastcc @caller8_1
+entry:
+  %call = tail call fastcc i32 @caller8_1()
+  ret i32 %call
+}
+
+define internal fastcc i32 @caller8_1() nounwind noinline { ; fastcc caller making a tail call to the variadic @callee8
+entry:
+; PIC32: .ent caller8_1
+; PIC32: jalr
+; PIC32: .end caller8_1
+; STATIC32: .ent caller8_1
+; STATIC32: jal
+; STATIC32: .end caller8_1
+; N64: .ent caller8_1
+; N64-NOT: jalr
+; N64: .end caller8_1
+; PIC16: .ent caller8_1
+; PIC16: jalrc
+; PIC16: .end caller8_1
+
+  %call = tail call i32 (i32, ...)* @callee8(i32 2, i32 1) nounwind
+  ret i32 %call
+}
+
+%struct.S = type { [2 x i32] }
+
+@gs1 = external global %struct.S
+
+declare i32 @callee9(%struct.S* byval)
+
+define i32 @caller9_0() nounwind { ; wrapper that tail-calls the internal fastcc @caller9_1
+entry:
+  %call = tail call fastcc i32 @caller9_1()
+  ret i32 %call
+}
+
+define internal fastcc i32 @caller9_1() nounwind noinline { ; fastcc caller making a tail call that passes @gs1 byval
+entry:
+; PIC32: .ent caller9_1
+; PIC32: jalr
+; PIC32: .end caller9_1
+; STATIC32: .ent caller9_1
+; STATIC32: jal
+; STATIC32: .end caller9_1
+; N64: .ent caller9_1
+; N64: jalr
+; N64: .end caller9_1
+; PIC16: .ent caller9_1
+; PIC16: jalrc
+; PIC16: .end caller9_1
+
+  %call = tail call i32 @callee9(%struct.S* byval @gs1) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee10(i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+define i32 @caller10(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) nounwind { ; nine arguments in and out; %a8 moves to the first position
+entry:
+; PIC32: .ent caller10
+; PIC32-NOT: jalr
+; STATIC32: .ent caller10
+; STATIC32-NOT: jal
+; N64: .ent caller10
+; N64-NOT: jalr
+; PIC16: .ent caller10
+; PIC16: jalrc
+
+  %call = tail call i32 @callee10(i32 %a8, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee11(%struct.S* byval)
+
+define i32 @caller11() nounwind noinline { ; tail call that passes @gs1 byval from a default-convention caller
+entry:
+; PIC32: .ent caller11
+; PIC32: jalr
+; STATIC32: .ent caller11
+; STATIC32: jal
+; N64: .ent caller11
+; N64: jalr
+; PIC16: .ent caller11
+; PIC16: jalrc
+
+  %call = tail call i32 @callee11(%struct.S* byval @gs1) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee12()
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define i32 @caller12(%struct.S* nocapture byval %a0) nounwind { ; caller itself receives a byval argument
+entry:
+; PIC32: .ent caller12
+; PIC32: jalr
+; STATIC32: .ent caller12
+; STATIC32: jal
+; N64: .ent caller12
+; N64: jalr
+; PIC16: .ent caller12
+; PIC16: jalrc
+
+  %0 = bitcast %struct.S* %a0 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct.S* @gs1 to i8*), i8* %0, i32 8, i32 4, i1 false) ; copy the 8-byte byval argument into @gs1
+  %call = tail call i32 @callee12() nounwind
+  ret i32 %call
+}
+
+declare i32 @callee13(i32, ...)
+
+define i32 @caller13() nounwind { ; tail call to the variadic @callee13 from a default-convention caller
+entry:
+; PIC32: .ent caller13
+; PIC32-NOT: jalr
+; STATIC32: .ent caller13
+; STATIC32-NOT: jal
+; N64: .ent caller13
+; N64-NOT: jalr
+; PIC16: .ent caller13
+; PIC16: jalrc
+
+  %call = tail call i32 (i32, ...)* @callee13(i32 1, i32 2) nounwind
+  ret i32 %call
+}
+
diff --git a/test/CodeGen/Mips/tls-alias.ll b/test/CodeGen/Mips/tls-alias.ll
index d681091f4c14b..ce98cc8262239 100644
--- a/test/CodeGen/Mips/tls-alias.ll
+++ b/test/CodeGen/Mips/tls-alias.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -disable-mips-delay-filler < %s | FileCheck %s
 
 @foo = thread_local global i32 42
 @bar = hidden alias i32* @foo
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
index a7ddb96e43382..72d30dc36912b 100644
--- a/test/CodeGen/Mips/tls.ll
+++ b/test/CodeGen/Mips/tls.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=PIC
-; RUN: llc -march=mipsel -relocation-model=static < %s \
-; RUN:                             | FileCheck %s -check-prefix=STATIC
-; RUN: llc -march=mipsel -relocation-model=static < %s \
-; RUN:   -mips-fix-global-base-reg=false | FileCheck %s -check-prefix=STATICGP
+; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \
+; RUN:     FileCheck %s -check-prefix=PIC
+; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler < \
+; RUN:     %s | FileCheck %s -check-prefix=STATIC
+; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler \
+; RUN:     -mips-fix-global-base-reg=false < %s  | \
+; RUN:     FileCheck %s -check-prefix=STATICGP
 
 @t1 = thread_local global i32 0, align 4
 
diff --git a/test/CodeGen/Mips/tls16.ll b/test/CodeGen/Mips/tls16.ll
new file mode 100644
index 0000000000000..861864bcfe0f8
--- /dev/null
+++ b/test/CodeGen/Mips/tls16.ll
@@ -0,0 +1,13 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PIC16
+
+@a = thread_local global i32 4, align 4
+
+define i32 @foo() nounwind readonly { ; returns the value of the thread_local global @a
+entry:
+  %0 = load i32* @a, align 4
+; PIC16:	lw	${{[0-9]+}}, %call16(__tls_get_addr)(${{[0-9]+}})
+; PIC16:	addiu	${{[0-9]+}}, %tlsgd(a)
+  ret i32 %0
+}
+
+
diff --git a/test/CodeGen/Mips/tls16_2.ll b/test/CodeGen/Mips/tls16_2.ll
new file mode 100644
index 0000000000000..b33e3c3766b69
--- /dev/null
+++ b/test/CodeGen/Mips/tls16_2.ll
@@ -0,0 +1,15 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PIC16
+
+@f.i = internal thread_local unnamed_addr global i32 1, align 4
+
+define i8* @f(i8* nocapture %a) nounwind { ; increments the internal thread_local @f.i and returns the new value as a pointer
+entry:
+  %0 = load i32* @f.i, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @f.i, align 4
+  %1 = inttoptr i32 %inc to i8*
+; PIC16: addiu	${{[0-9]+}}, %tlsldm(f.i)
+  ret i8* %1
+}
+
+
diff --git a/test/CodeGen/Mips/uitofp.ll b/test/CodeGen/Mips/uitofp.ll
new file mode 100644
index 0000000000000..aff70c24f07cc
--- /dev/null
+++ b/test/CodeGen/Mips/uitofp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mips -mattr=+single-float < %s
+
+define void @f0() nounwind { ; unsigned i32 to float conversion; the RUN line only runs llc, with no FileCheck
+entry:
+  %b = alloca i32, align 4
+  %a = alloca float, align 4
+  store volatile i32 1, i32* %b, align 4
+  %0 = load volatile i32* %b, align 4
+  %conv = uitofp i32 %0 to float
+  store float %conv, float* %a, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/ul1.ll b/test/CodeGen/Mips/ul1.ll
new file mode 100644
index 0000000000000..7e64ff4d90fdc
--- /dev/null
+++ b/test/CodeGen/Mips/ul1.ll
@@ -0,0 +1,15 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+%struct.ua = type <{ i16, i32 }>
+
+@foo = common global %struct.ua zeroinitializer, align 1
+
+define i32 @main() nounwind { ; stores 10 into the i32 field of the packed struct @foo with align 1
+entry:
+  store i32 10, i32* getelementptr inbounds (%struct.ua* @foo, i32 0, i32 1), align 1
+; 16:   sb  ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16:   sb  ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16:   sb  ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16:   sb  ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+  ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/vector-load-store.ll b/test/CodeGen/Mips/vector-load-store.ll
new file mode 100644
index 0000000000000..d889963099081
--- /dev/null
+++ b/test/CodeGen/Mips/vector-load-store.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mipsel -mattr=+dsp < %s | FileCheck %s
+
+@g1 = common global <2 x i16> zeroinitializer, align 4
+@g0 = common global <2 x i16> zeroinitializer, align 4
+@g3 = common global <4 x i8> zeroinitializer, align 4
+@g2 = common global <4 x i8> zeroinitializer, align 4
+
+define void @func_v2i16() nounwind { ; copies the 2 x i16 vector @g1 into @g0
+entry:
+; CHECK: lw
+; CHECK: sw
+
+  %0 = load <2 x i16>* @g1, align 4
+  store <2 x i16> %0, <2 x i16>* @g0, align 4
+  ret void
+}
+
+define void @func_v4i8() nounwind { ; copies the 4 x i8 vector @g3 into @g2
+entry:
+; CHECK: lw
+; CHECK: sw
+
+  %0 = load <4 x i8>* @g3, align 4
+  store <4 x i8> %0, <4 x i8>* @g2, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/NVPTX/global-ordering.ll b/test/CodeGen/NVPTX/global-ordering.ll
new file mode 100644
index 0000000000000..43394a79e9122
--- /dev/null
+++ b/test/CodeGen/NVPTX/global-ordering.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+; Make sure we emit these globals in def-use order
+
+
+; PTX32:      .visible .global .align 1 .u8 a = 2;
+; PTX32-NEXT: .visible .global .align 4 .u32 a2 = a;
+; PTX64:      .visible .global .align 1 .u8 a = 2;
+; PTX64-NEXT: .visible .global .align 8 .u64 a2 = a;
+@a2 = addrspace(1) global i8 addrspace(1)* @a
+@a = addrspace(1) global i8 2
+
+
+; PTX32:      .visible .global .align 1 .u8 b = 1;
+; PTX32-NEXT: .visible .global .align 4 .u32 b2[2] = {b, b};
+; PTX64:      .visible .global .align 1 .u8 b = 1;
+; PTX64-NEXT: .visible .global .align 8 .u64 b2[2] = {b, b};
+@b2 = addrspace(1) global [2 x i8 addrspace(1)*] [i8 addrspace(1)* @b, i8 addrspace(1)* @b]
+@b = addrspace(1) global i8 1
diff --git a/test/CodeGen/NVPTX/param-align.ll b/test/CodeGen/NVPTX/param-align.ll
new file mode 100644
index 0000000000000..84ccb650d40d9
--- /dev/null
+++ b/test/CodeGen/NVPTX/param-align.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+;;; Need 4-byte alignment on float* passed byval
+define ptx_device void @t1(float* byval %x) { ; empty body; only the emitted parameter declaration is checked
+; CHECK: .func t1
+; CHECK: .param .align 4 .b8 t1_param_0[4]
+  ret void
+}
+
+
+;;; Need 8-byte alignment on double* passed byval
+define ptx_device void @t2(double* byval %x) { ; empty body; only the emitted parameter declaration is checked
+; CHECK: .func t2
+; CHECK: .param .align 8 .b8 t2_param_0[8]
+  ret void
+}
+
+
+;;; Need 4-byte alignment on float2* passed byval
+%struct.float2 = type { float, float }
+define ptx_device void @t3(%struct.float2* byval %x) { ; empty body; only the emitted parameter declaration is checked
+; CHECK: .func t3
+; CHECK: .param .align 4 .b8 t3_param_0[8]
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll
new file mode 100644
index 0000000000000..779f7798d8839
--- /dev/null
+++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+define ptx_kernel void @t1(i1* %a) { ; stores the i1 constant false through %a
+; PTX32:      mov.u16 %rc{{[0-9]+}}, 0;
+; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}};
+; PTX64:      mov.u16 %rc{{[0-9]+}}, 0;
+; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}};
+  store i1 false, i1* %a
+  ret void
+}
+}
+
+
+define ptx_kernel void @t2(i1* %a, i8* %b) { ; loads an i1 from %a and stores 1 or 2 to %b depending on it
+; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: and.b16 temp, %rc{{[0-9]+}}, 1;
+; PTX32: setp.b16.eq %p{{[0-9]+}}, temp, 1;
+; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: and.b16 temp, %rc{{[0-9]+}}, 1;
+; PTX64: setp.b16.eq %p{{[0-9]+}}, temp, 1;
+
+  %t1 = load i1* %a
+  %t2 = select i1 %t1, i8 1, i8 2
+  store i8 %t2, i8* %b
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/ptx-version-30.ll b/test/CodeGen/NVPTX/ptx-version-30.ll
new file mode 100644
index 0000000000000..0422b01f4ee35
--- /dev/null
+++ b/test/CodeGen/NVPTX/ptx-version-30.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
+
+
+; CHECK: .version 3.0
+
diff --git a/test/CodeGen/NVPTX/ptx-version-31.ll b/test/CodeGen/NVPTX/ptx-version-31.ll
new file mode 100644
index 0000000000000..d6e57301a371e
--- /dev/null
+++ b/test/CodeGen/NVPTX/ptx-version-31.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
+
+
+; CHECK: .version 3.1
+
diff --git a/test/CodeGen/NVPTX/sm-version-10.ll b/test/CodeGen/NVPTX/sm-version-10.ll
new file mode 100644
index 0000000000000..9324a3780986c
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-10.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+
+
+; CHECK: .target sm_10
+
diff --git a/test/CodeGen/NVPTX/sm-version-11.ll b/test/CodeGen/NVPTX/sm-version-11.ll
new file mode 100644
index 0000000000000..9033a4eba5e46
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-11.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_11 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_11 | FileCheck %s
+
+
+; CHECK: .target sm_11
+
diff --git a/test/CodeGen/NVPTX/sm-version-12.ll b/test/CodeGen/NVPTX/sm-version-12.ll
new file mode 100644
index 0000000000000..d8ee85c9010e7
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-12.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_12 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_12 | FileCheck %s
+
+
+; CHECK: .target sm_12
+
diff --git a/test/CodeGen/NVPTX/sm-version-13.ll b/test/CodeGen/NVPTX/sm-version-13.ll
new file mode 100644
index 0000000000000..ad67d642ce306
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-13.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_13 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_13 | FileCheck %s
+
+
+; CHECK: .target sm_13
+
diff --git a/test/CodeGen/NVPTX/sm-version-20.ll b/test/CodeGen/NVPTX/sm-version-20.ll
new file mode 100644
index 0000000000000..c21f49e6aeb96
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-20.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+
+; CHECK: .target sm_20
+
diff --git a/test/CodeGen/NVPTX/sm-version-21.ll b/test/CodeGen/NVPTX/sm-version-21.ll
new file mode 100644
index 0000000000000..4fb6de3e6323a
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-21.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_21 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_21 | FileCheck %s
+
+
+; CHECK: .target sm_21
+
diff --git a/test/CodeGen/NVPTX/sm-version-30.ll b/test/CodeGen/NVPTX/sm-version-30.ll
new file mode 100644
index 0000000000000..692b49a0d6b37
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-30.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+
+
+; CHECK: .target sm_30
+
diff --git a/test/CodeGen/NVPTX/sm-version-35.ll b/test/CodeGen/NVPTX/sm-version-35.ll
new file mode 100644
index 0000000000000..25368a01335e1
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-35.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+
+; CHECK: .target sm_35
+
diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
index 0003a17c2284d..b95ac6880758b 100644
--- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
+++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
@@ -9,9 +9,8 @@ target triple = "powerpc-apple-darwin11.0"
 
 define void @foo() nounwind ssp {
 entry:
-; Better: mtctr r12
-; CHECK: mr r12, [[REG:r[0-9]+]]
-; CHECK: mtctr [[REG]]
+; CHECK: mtctr r12
+; CHECK: bctrl
   %0 = load void (...)** @p, align 4              ; <void (...)*> [#uses=1]
   call void (...)* %0() nounwind
   br label %return
diff --git a/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll b/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll
new file mode 100644
index 0000000000000..9d2e390c1c972
--- /dev/null
+++ b/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; This test checks that the TOC entry symbol names do not clash with the
+; global .LC0 and .LC2 symbols defined in the module.
+
+@.LC0 = internal global [5 x i8] c".LC0\00"
+@.LC2 = internal global [5 x i8] c".LC2\00"
+
+define i32 @foo(double %X, double %Y) nounwind readnone {
+  ; The 1.0 and 3.0 constants generate two TOC entries
+  %cmp = fcmp oeq double %X, 1.000000e+00
+  %conv = zext i1 %cmp to i32
+  %cmp1 = fcmp oeq double %Y, 3.000000e+00
+  %conv2 = zext i1 %cmp1 to i32
+  %add = add nsw i32 %conv2, %conv
+  ret i32 %add
+}
+
+; Check the creation of 2 .tc entries for both double constants. They
+; should be .LC1 and .LC3 to avoid name clash with global constants
+; .LC0 and .LC2
+; CHECK: .LC{{[13]}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK: .LC{{[13]}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
diff --git a/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll b/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll
new file mode 100644
index 0000000000000..41533a8f322b8
--- /dev/null
+++ b/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @test(i64 %n) nounwind {
+entry:
+  %0 = alloca i8, i64 %n, align 1
+  %1 = alloca i8, i64 %n, align 1
+  call void @use(i8* %0, i8* %1) nounwind
+  ret void
+}
+
+declare void @use(i8*, i8*)
+
+; Check we actually have two instances of dynamic stack allocation,
+; identified by the stdux used to update the back-chain link.
+; CHECK: stdux
+; CHECK: stdux
diff --git a/test/CodeGen/PowerPC/2012-10-12-bitcast.ll b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
new file mode 100644
index 0000000000000..f841c5fb92e4a
--- /dev/null
+++ b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mattr=+altivec < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @test(<16 x i8> %v) nounwind {
+entry:
+  %0 = bitcast <16 x i8> %v to i128
+  %1 = lshr i128 %0, 96
+  %2 = trunc i128 %1 to i32
+  ret i32 %2
+}
+
+; Verify that bitcast handles big-endian platforms correctly
+; by checking we load the result from the correct offset
+
+; CHECK: addi [[REGISTER:[0-9]+]], 1, -16
+; CHECK: stvx 2, 0, [[REGISTER]]
+; CHECK: lwz 3, -16(1)
+; CHECK: blr
+
diff --git a/test/CodeGen/PowerPC/asm-Zy.ll b/test/CodeGen/PowerPC/asm-Zy.ll
new file mode 100644
index 0000000000000..691165f237881
--- /dev/null
+++ b/test/CodeGen/PowerPC/asm-Zy.ll
@@ -0,0 +1,14 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+; RUN: llc < %s -march=ppc64 -mcpu=a2 | FileCheck %s
+
+define i32 @zytest(i32 %a) nounwind {
+entry:
+; CHECK: @zytest
+  %r = call i32 asm "lwbrx $0, ${1:y}", "=r,Z"(i32 %a) nounwind, !srcloc !0
+  ret i32 %r
+; CHECK: lwbrx 3, 0,
+}
+
+!0 = metadata !{i32 101688}
+
diff --git a/test/CodeGen/PowerPC/big-endian-formal-args.ll b/test/CodeGen/PowerPC/big-endian-formal-args.ll
index 9a456b6ecc517..638059a38ef5e 100644
--- a/test/CodeGen/PowerPC/big-endian-formal-args.ll
+++ b/test/CodeGen/PowerPC/big-endian-formal-args.ll
@@ -2,10 +2,10 @@
 
 declare void @bar(i64 %x, i64 %y)
 
-; CHECK: li {{[53]}}, 0
+; CHECK: li 3, 0
 ; CHECK: li 4, 2
+; CHECK: li 5, 0
 ; CHECK: li 6, 3
-; CHECK: mr {{[53]}}, {{[53]}}
 
 define void @foo() {
   call void @bar(i64 2, i64 3)
diff --git a/test/CodeGen/PowerPC/bl8_elf_nop.ll b/test/CodeGen/PowerPC/bl8_elf_nop.ll
deleted file mode 100644
index 386c59e322386..0000000000000
--- a/test/CodeGen/PowerPC/bl8_elf_nop.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck  %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-declare i32 @clock() nounwind
-
-define i32 @func() {
-entry:
-  %call = call i32 @clock() nounwind
-  %call2 = add i32 %call, 7
-  ret i32 %call2
-}
-
-; CHECK: bl clock
-; CHECK-NEXT: nop
-
diff --git a/test/CodeGen/PowerPC/coalesce-ext.ll b/test/CodeGen/PowerPC/coalesce-ext.ll
index cc80f8330798b..f19175c9beaac 100644
--- a/test/CodeGen/PowerPC/coalesce-ext.ll
+++ b/test/CodeGen/PowerPC/coalesce-ext.ll
@@ -13,5 +13,6 @@ define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
   store volatile i32 %D, i32* %P
   ; Reuse low bits of extended register, don't extend live range of SUM.
   ; CHECK: stw [[EXT]]
-  ret i32 %D
+  %R = add i32 %D, %D
+  ret i32 %R
 }
diff --git a/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
new file mode 100644
index 0000000000000..afa1ea8e75a1a
--- /dev/null
+++ b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+
+@.str = private unnamed_addr constant [3 x i8] c"%i\00", align 1
+
+define void @test(i32 %count) nounwind {
+entry:
+; CHECK: crxor 6, 6, 6
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+  %cmp2 = icmp sgt i32 %count, 0
+  br i1 %cmp2, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+; CHECK: crxor 6, 6, 6
+  %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+  %inc = add nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/PowerPC/crsave.ll b/test/CodeGen/PowerPC/crsave.ll
new file mode 100644
index 0000000000000..3e98dbd254d9d
--- /dev/null
+++ b/test/CodeGen/PowerPC/crsave.ll
@@ -0,0 +1,49 @@
+; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC32
+; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC64
+
+declare void @foo()
+
+define i32 @test_cr2() nounwind {
+entry:
+  %ret = alloca i32, align 4
+  %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind
+  store i32 %0, i32* %ret, align 4
+  call void @foo()
+  %1 = load i32* %ret, align 4
+  ret i32 %1
+}
+
+; PPC32: mfcr 12
+; PPC32-NEXT: stw 12, {{[0-9]+}}(31)
+; PPC32: lwz 12, {{[0-9]+}}(31)
+; PPC32-NEXT: mtcrf 32, 12
+
+; PPC64: mfcr 12
+; PPC64-NEXT: stw 12, 8(1)
+; PPC64: lwz 12, 8(1)
+; PPC64-NEXT: mtcrf 32, 12
+
+define i32 @test_cr234() nounwind {
+entry:
+  %ret = alloca i32, align 4
+  %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09cmp 3,$2,$2\0A\09cmp 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind
+  store i32 %0, i32* %ret, align 4
+  call void @foo()
+  %1 = load i32* %ret, align 4
+  ret i32 %1
+}
+
+; PPC32: mfcr 12
+; PPC32-NEXT: stw 12, {{[0-9]+}}(31)
+; PPC32: lwz 12, {{[0-9]+}}(31)
+; PPC32-NEXT: mtcrf 32, 12
+; PPC32-NEXT: mtcrf 16, 12
+; PPC32-NEXT: mtcrf 8, 12
+
+; PPC64: mfcr 12
+; PPC64-NEXT: stw 12, 8(1)
+; PPC64: lwz 12, 8(1)
+; PPC64-NEXT: mtcrf 32, 12
+; PPC64-NEXT: mtcrf 16, 12
+; PPC64-NEXT: mtcrf 8, 12
+
diff --git a/test/CodeGen/PowerPC/emptystruct.ll b/test/CodeGen/PowerPC/emptystruct.ll
new file mode 100644
index 0000000000000..36b4abd2bfad6
--- /dev/null
+++ b/test/CodeGen/PowerPC/emptystruct.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+; This tests correct handling of empty aggregate parameters and return values.
+; An empty parameter passed by value does not consume a protocol register or
+; a parameter save area doubleword.  An empty parameter passed by reference
+; is treated as any other pointer parameter.  An empty aggregate return value 
+; is treated as any other aggregate return value, passed via address as a 
+; hidden parameter in GPR3.  In this example, GPR3 contains the return value
+; address, GPR4 contains the address of e2, and e1 and e3 are not passed or
+; received.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.empty = type {}
+
+define void @callee(%struct.empty* noalias sret %agg.result, %struct.empty* byval %a1, %struct.empty* %a2, %struct.empty* byval %a3) nounwind {
+entry:
+  %a2.addr = alloca %struct.empty*, align 8
+  store %struct.empty* %a2, %struct.empty** %a2.addr, align 8
+  %0 = load %struct.empty** %a2.addr, align 8
+  %1 = bitcast %struct.empty* %agg.result to i8*
+  %2 = bitcast %struct.empty* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 0, i32 1, i1 false)
+  ret void
+}
+
+; CHECK: callee:
+; CHECK: std 4,
+; CHECK: std 3,
+; CHECK-NOT: std 5,
+; CHECK-NOT: std 6,
+; CHECK: blr
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define void @caller(%struct.empty* noalias sret %agg.result) nounwind {
+entry:
+  %e1 = alloca %struct.empty, align 1
+  %e2 = alloca %struct.empty, align 1
+  %e3 = alloca %struct.empty, align 1
+  call void @callee(%struct.empty* sret %agg.result, %struct.empty* byval %e1, %struct.empty* %e2, %struct.empty* byval %e3)
+  ret void
+}
+
+; CHECK: caller:
+; CHECK: addi 4,
+; CHECK: std 3,
+; CHECK-NOT: std 5,
+; CHECK-NOT: std 6,
+; CHECK: bl callee
diff --git a/test/CodeGen/PowerPC/floatPSA.ll b/test/CodeGen/PowerPC/floatPSA.ll
new file mode 100644
index 0000000000000..b5631a1605616
--- /dev/null
+++ b/test/CodeGen/PowerPC/floatPSA.ll
@@ -0,0 +1,97 @@
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+; This verifies that single-precision floating point values that can't
+; be passed in registers are stored in the rightmost word of the parameter
+; save area slot.  There are 13 architected floating-point registers, so
+; the 14th is passed in storage.  The address of the 14th argument is
+; 48 (fixed size of the linkage area) + 13 * 8 (first 13 args) + 4
+; (offset to second word) = 156.
+
+define float @bar(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j, float %k, float %l, float %m, float %n) nounwind {
+entry:
+  %a.addr = alloca float, align 4
+  %b.addr = alloca float, align 4
+  %c.addr = alloca float, align 4
+  %d.addr = alloca float, align 4
+  %e.addr = alloca float, align 4
+  %f.addr = alloca float, align 4
+  %g.addr = alloca float, align 4
+  %h.addr = alloca float, align 4
+  %i.addr = alloca float, align 4
+  %j.addr = alloca float, align 4
+  %k.addr = alloca float, align 4
+  %l.addr = alloca float, align 4
+  %m.addr = alloca float, align 4
+  %n.addr = alloca float, align 4
+  store float %a, float* %a.addr, align 4
+  store float %b, float* %b.addr, align 4
+  store float %c, float* %c.addr, align 4
+  store float %d, float* %d.addr, align 4
+  store float %e, float* %e.addr, align 4
+  store float %f, float* %f.addr, align 4
+  store float %g, float* %g.addr, align 4
+  store float %h, float* %h.addr, align 4
+  store float %i, float* %i.addr, align 4
+  store float %j, float* %j.addr, align 4
+  store float %k, float* %k.addr, align 4
+  store float %l, float* %l.addr, align 4
+  store float %m, float* %m.addr, align 4
+  store float %n, float* %n.addr, align 4
+  %0 = load float* %n.addr, align 4
+  ret float %0
+}
+
+; CHECK: lfs {{[0-9]+}}, 156(1)
+
+define float @foo() nounwind {
+entry:
+  %a = alloca float, align 4
+  %b = alloca float, align 4
+  %c = alloca float, align 4
+  %d = alloca float, align 4
+  %e = alloca float, align 4
+  %f = alloca float, align 4
+  %g = alloca float, align 4
+  %h = alloca float, align 4
+  %i = alloca float, align 4
+  %j = alloca float, align 4
+  %k = alloca float, align 4
+  %l = alloca float, align 4
+  %m = alloca float, align 4
+  %n = alloca float, align 4
+  store float 1.000000e+00, float* %a, align 4
+  store float 2.000000e+00, float* %b, align 4
+  store float 3.000000e+00, float* %c, align 4
+  store float 4.000000e+00, float* %d, align 4
+  store float 5.000000e+00, float* %e, align 4
+  store float 6.000000e+00, float* %f, align 4
+  store float 7.000000e+00, float* %g, align 4
+  store float 8.000000e+00, float* %h, align 4
+  store float 9.000000e+00, float* %i, align 4
+  store float 1.000000e+01, float* %j, align 4
+  store float 1.100000e+01, float* %k, align 4
+  store float 1.200000e+01, float* %l, align 4
+  store float 1.300000e+01, float* %m, align 4
+  store float 1.400000e+01, float* %n, align 4
+  %0 = load float* %a, align 4
+  %1 = load float* %b, align 4
+  %2 = load float* %c, align 4
+  %3 = load float* %d, align 4
+  %4 = load float* %e, align 4
+  %5 = load float* %f, align 4
+  %6 = load float* %g, align 4
+  %7 = load float* %h, align 4
+  %8 = load float* %i, align 4
+  %9 = load float* %j, align 4
+  %10 = load float* %k, align 4
+  %11 = load float* %l, align 4
+  %12 = load float* %m, align 4
+  %13 = load float* %n, align 4
+  %call = call float @bar(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13)
+  ret float %call
+}
+
+; Note that stw is used instead of stfs because the value is a simple
+; constant that can be created with a load-immediate in a GPR.
+; CHECK: stw {{[0-9]+}}, 156(1)
+
diff --git a/test/CodeGen/PowerPC/fsl-e500mc.ll b/test/CodeGen/PowerPC/fsl-e500mc.ll
new file mode 100644
index 0000000000000..09b7e41b18990
--- /dev/null
+++ b/test/CodeGen/PowerPC/fsl-e500mc.ll
@@ -0,0 +1,22 @@
+;
+; Test support for Freescale e500mc and its higher memcpy inlining thresholds.
+;
+; RUN: llc -mcpu=e500mc < %s 2>&1 | FileCheck %s
+; CHECK-NOT: not a recognized processor for this target
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-fsl-linux"
+
+%struct.teststruct = type { [12 x i32], i32 }
+
+define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind {
+entry:
+; CHECK: @copy
+; CHECK-NOT: bl memcpy
+  %0 = bitcast %struct.teststruct* %agg.result to i8*
+  %1 = bitcast %struct.teststruct* %in to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 52, i32 4, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/PowerPC/fsl-e5500.ll b/test/CodeGen/PowerPC/fsl-e5500.ll
new file mode 100644
index 0000000000000..d47d8c8ed4f31
--- /dev/null
+++ b/test/CodeGen/PowerPC/fsl-e5500.ll
@@ -0,0 +1,22 @@
+;
+; Test support for Freescale e5500 and its higher memcpy inlining thresholds.
+;
+; RUN: llc -mcpu=e5500 < %s 2>&1 | FileCheck %s
+; CHECK-NOT: not a recognized processor for this target
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-fsl-linux"
+
+%struct.teststruct = type { [24 x i32], i32 }
+
+define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind {
+entry:
+; CHECK: @copy
+; CHECK-NOT: bl memcpy
+  %0 = bitcast %struct.teststruct* %agg.result to i8*
+  %1 = bitcast %struct.teststruct* %in to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 100, i32 4, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/PowerPC/i64_fp_round.ll b/test/CodeGen/PowerPC/i64_fp_round.ll
new file mode 100644
index 0000000000000..5a0c072c9c52e
--- /dev/null
+++ b/test/CodeGen/PowerPC/i64_fp_round.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define float @test(i64 %x) nounwind readnone {
+entry:
+  %conv = sitofp i64 %x to float
+  ret float %conv
+}
+
+; Verify that we get the code sequence needed to avoid double-rounding.
+; Note that only parts of the sequence are checked for here, to allow
+; for minor code generation differences.
+
+; CHECK: sradi [[REGISTER:[0-9]+]], 3, 53
+; CHECK: addi [[REGISTER:[0-9]+]], [[REGISTER]], 1
+; CHECK: cmpldi 0, [[REGISTER]], 1
+; CHECK: isel [[REGISTER:[0-9]+]], {{[0-9]+}}, 3, 1
+; CHECK: std [[REGISTER]], -{{[0-9]+}}(1)
+
+
+; Also check that with -enable-unsafe-fp-math we do not get that extra
+; code sequence.  Simply verify that there is no "isel" present.
+
+; RUN: llc -mcpu=pwr7 -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=UNSAFE
+; UNSAFE-NOT: isel
+
diff --git a/test/CodeGen/PowerPC/inlineasm-copy.ll b/test/CodeGen/PowerPC/inlineasm-copy.ll
index e1ff82d5f9b7f..59c3388835610 100644
--- a/test/CodeGen/PowerPC/inlineasm-copy.ll
+++ b/test/CodeGen/PowerPC/inlineasm-copy.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=ppc32 | not grep mr
+; RUN: llc < %s -march=ppc32 -verify-machineinstrs | FileCheck %s
 
+; CHECK-NOT: mr
 define i32 @test(i32 %Y, i32 %X) {
 entry:
         %tmp = tail call i32 asm "foo $0", "=r"( )              ; <i32> [#uses=1]
@@ -12,3 +13,9 @@ entry:
         ret i32 %tmp1
 }
 
+; CHECK: test3
+define i32 @test3(i32 %Y, i32 %X) {
+entry:
+        %tmp1 = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "foo $0, $1", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"( i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y )                ; <i32> [#uses=1]
+       ret i32 1
+}
diff --git a/test/CodeGen/PowerPC/int-fp-conv-1.ll b/test/CodeGen/PowerPC/int-fp-conv-1.ll
index 6c8272351924c..d2887b9b947e9 100644
--- a/test/CodeGen/PowerPC/int-fp-conv-1.ll
+++ b/test/CodeGen/PowerPC/int-fp-conv-1.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=ppc64 | grep __floatditf
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+; CHECK-NOT: __floatditf
 
 define i64 @__fixunstfdi(ppc_fp128 %a) nounwind  {
 entry:
diff --git a/test/CodeGen/PowerPC/jaggedstructs.ll b/test/CodeGen/PowerPC/jaggedstructs.ll
new file mode 100644
index 0000000000000..62aa7cf929f87
--- /dev/null
+++ b/test/CodeGen/PowerPC/jaggedstructs.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+; This tests receiving and re-passing parameters consisting of structures
+; of size 3, 5, 6, and 7.  They are to be found/placed right-adjusted in
+; the parameter registers.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S3 = type { [3 x i8] }
+%struct.S5 = type { [5 x i8] }
+%struct.S6 = type { [6 x i8] }
+%struct.S7 = type { [7 x i8] }
+
+define void @test(%struct.S3* byval %s3, %struct.S5* byval %s5, %struct.S6* byval %s6, %struct.S7* byval %s7) nounwind {
+entry:
+  call void @check(%struct.S3* byval %s3, %struct.S5* byval %s5, %struct.S6* byval %s6, %struct.S7* byval %s7)
+  ret void
+}
+
+; CHECK: std 6, 216(1)
+; CHECK: std 5, 208(1)
+; CHECK: std 4, 200(1)
+; CHECK: std 3, 192(1)
+; CHECK: lbz {{[0-9]+}}, 199(1)
+; CHECK: stb {{[0-9]+}}, 55(1)
+; CHECK: lhz {{[0-9]+}}, 197(1)
+; CHECK: sth {{[0-9]+}}, 53(1)
+; CHECK: lbz {{[0-9]+}}, 207(1)
+; CHECK: stb {{[0-9]+}}, 63(1)
+; CHECK: lwz {{[0-9]+}}, 203(1)
+; CHECK: stw {{[0-9]+}}, 59(1)
+; CHECK: lhz {{[0-9]+}}, 214(1)
+; CHECK: sth {{[0-9]+}}, 70(1)
+; CHECK: lwz {{[0-9]+}}, 210(1)
+; CHECK: stw {{[0-9]+}}, 66(1)
+; CHECK: lbz {{[0-9]+}}, 223(1)
+; CHECK: stb {{[0-9]+}}, 79(1)
+; CHECK: lhz {{[0-9]+}}, 221(1)
+; CHECK: sth {{[0-9]+}}, 77(1)
+; CHECK: lwz {{[0-9]+}}, 217(1)
+; CHECK: stw {{[0-9]+}}, 73(1)
+; CHECK: ld 6, 72(1)
+; CHECK: ld 5, 64(1)
+; CHECK: ld 4, 56(1)
+; CHECK: ld 3, 48(1)
+
+declare void @check(%struct.S3* byval, %struct.S5* byval, %struct.S6* byval, %struct.S7* byval)
diff --git a/test/CodeGen/PowerPC/misched.ll b/test/CodeGen/PowerPC/misched.ll
new file mode 100644
index 0000000000000..d6fb3b30464f0
--- /dev/null
+++ b/test/CodeGen/PowerPC/misched.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -enable-misched -verify-machineinstrs
+; PR14302
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+@b = external global [16000 x double], align 32
+
+define void @pr14302() nounwind {
+entry:
+  tail call void @putchar() nounwind
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 undef, label %for.body, label %for.body24.i
+
+for.body24.i:                                     ; preds = %for.body24.i, %for.body
+  store double 1.000000e+00, double* undef, align 8
+  br i1 undef, label %for.body24.i58, label %for.body24.i
+
+for.body24.i58:                                   ; preds = %for.body24.i58, %for.body24.i
+  %arrayidx26.i55.1 = getelementptr inbounds [16000 x double]* @b, i64 0, i64 undef
+  store double 1.000000e+00, double* %arrayidx26.i55.1, align 8
+  br i1 undef, label %for.body24.i64, label %for.body24.i58
+
+for.body24.i64:                                   ; preds = %for.body24.i64, %for.body24.i58
+  %exitcond.2489 = icmp eq i32 0, 16000
+  br i1 %exitcond.2489, label %for.body24.i70, label %for.body24.i64
+
+for.body24.i70:                                   ; preds = %for.body24.i70, %for.body24.i64
+  br i1 undef, label %for.body24.i76, label %for.body24.i70
+
+for.body24.i76:                                   ; preds = %for.body24.i76, %for.body24.i70
+  br i1 undef, label %set1d.exit77, label %for.body24.i76
+
+set1d.exit77:                                     ; preds = %for.body24.i76
+  br label %for.body29
+
+for.body29:                                       ; preds = %for.body29, %set1d.exit77
+  br i1 undef, label %for.end35, label %for.body29
+
+for.end35:                                        ; preds = %for.body29
+  ret void
+}
+
+declare void @putchar()
diff --git a/test/CodeGen/PowerPC/novrsave.ll b/test/CodeGen/PowerPC/novrsave.ll
new file mode 100644
index 0000000000000..a70576a291e99
--- /dev/null
+++ b/test/CodeGen/PowerPC/novrsave.ll
@@ -0,0 +1,15 @@
+; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu   < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+; This verifies that the code to update VRSAVE has been removed for SVR4.
+
+define <4 x float> @bar(<4 x float> %v) nounwind {
+entry:
+  %v.addr = alloca <4 x float>, align 16
+  store <4 x float> %v, <4 x float>* %v.addr, align 16
+  %0 = load <4 x float>* %v.addr, align 16
+  ret <4 x float> %0
+}
+
+; CHECK-NOT: mfspr
+; CHECK-NOT: mtspr
diff --git a/test/CodeGen/PowerPC/ppc64-abi-extend.ll b/test/CodeGen/PowerPC/ppc64-abi-extend.ll
new file mode 100644
index 0000000000000..8baf1c613e786
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-abi-extend.ll
@@ -0,0 +1,97 @@
+; Verify that i32 argument/return values are extended to i64
+
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@si = common global i32 0, align 4
+@ui = common global i32 0, align 4
+
+declare void @arg_si(i32 signext)
+declare void @arg_ui(i32 zeroext)
+
+declare signext i32 @ret_si()
+declare zeroext i32 @ret_ui()
+
+define void @pass_arg_si() nounwind {
+entry:
+  %0 = load i32* @si, align 4
+  tail call void @arg_si(i32 signext %0) nounwind
+  ret void
+}
+; CHECK: @pass_arg_si
+; CHECK: lwa 3,
+; CHECK: bl arg_si
+
+define void @pass_arg_ui() nounwind {
+entry:
+  %0 = load i32* @ui, align 4
+  tail call void @arg_ui(i32 zeroext %0) nounwind
+  ret void
+}
+; CHECK: @pass_arg_ui
+; CHECK: lwz 3,
+; CHECK: bl arg_ui
+
+define i64 @use_arg_si(i32 signext %x) nounwind readnone {
+entry:
+  %conv = sext i32 %x to i64
+  ret i64 %conv
+}
+; CHECK: @use_arg_si
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define i64 @use_arg_ui(i32 zeroext %x) nounwind readnone {
+entry:
+  %conv = zext i32 %x to i64
+  ret i64 %conv
+}
+; CHECK: @use_arg_ui
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define signext i32 @pass_ret_si() nounwind readonly {
+entry:
+  %0 = load i32* @si, align 4
+  ret i32 %0
+}
+; CHECK: @pass_ret_si
+; CHECK: lwa 3,
+; CHECK: blr
+
+define zeroext i32 @pass_ret_ui() nounwind readonly {
+entry:
+  %0 = load i32* @ui, align 4
+  ret i32 %0
+}
+; CHECK: @pass_ret_ui
+; CHECK: lwz 3,
+; CHECK: blr
+
+define i64 @use_ret_si() nounwind {
+entry:
+  %call = tail call signext i32 @ret_si() nounwind
+  %conv = sext i32 %call to i64
+  ret i64 %conv
+}
+; CHECK: @use_ret_si
+; CHECK: bl ret_si
+; This is to verify the return register (3) set up by the ret_si
+; call is passed on unmodified as return value of use_ret_si.
+; CHECK-NOT: 3
+; CHECK: blr
+
+define i64 @use_ret_ui() nounwind {
+entry:
+  %call = tail call zeroext i32 @ret_ui() nounwind
+  %conv = zext i32 %call to i64
+  ret i64 %conv
+}
+; CHECK: @use_ret_ui
+; CHECK: bl ret_ui
+; This is to verify the return register (3) set up by the ret_ui
+; call is passed on unmodified as return value of use_ret_ui.
+; CHECK-NOT: 3
+; CHECK: blr
+
diff --git a/test/CodeGen/PowerPC/ppc64-align-long-double.ll b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
new file mode 100644
index 0000000000000..10b70d02e5cc3
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+; Verify internal alignment of long double in a struct.  The double
+; argument comes in in GPR3; GPR4 is skipped; GPRs 5 and 6 contain
+; the long double.  Check that these are stored to proper locations
+; in the parameter save area and loaded from there for return in FPR1/2.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S = type { double, ppc_fp128 }
+
+define ppc_fp128 @test(%struct.S* byval %x) nounwind {
+entry:
+  %b = getelementptr inbounds %struct.S* %x, i32 0, i32 1
+  %0 = load ppc_fp128* %b, align 16
+  ret ppc_fp128 %0
+}
+
+; CHECK: std 6, 72(1)
+; CHECK: std 5, 64(1)
+; CHECK: std 4, 56(1)
+; CHECK: std 3, 48(1)
+; CHECK: lfd 1, 64(1)
+; CHECK: lfd 2, 72(1)
+
diff --git a/test/CodeGen/PowerPC/ppc64-calls.ll b/test/CodeGen/PowerPC/ppc64-calls.ll
new file mode 100644
index 0000000000000..c382edbbce4e7
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-calls.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo() nounwind readnone noinline {
+  ret void
+}
+
+define weak void @foo_weak() nounwind {
+  ret void
+}
+
+; Calls to local functions do not require the TOC restore 'nop'
+define void @test_direct() nounwind readnone {
+; CHECK: test_direct:
+  tail call void @foo() nounwind
+; CHECK: bl foo
+; CHECK-NOT: nop
+  ret void
+}
+
+; Calls to weak functions require a TOC restore 'nop' because they
+; may be overridden in a different module.
+define void @test_weak() nounwind readnone {
+; CHECK: test_weak:
+  tail call void @foo_weak() nounwind
+; CHECK: bl foo
+; CHECK-NEXT: nop
+  ret void
+}
+
+; Indirect calls require full stub creation
+define void @test_indirect(void ()* nocapture %fp) nounwind {
+; CHECK: test_indirect:
+  tail call void %fp() nounwind
+; CHECK: ld [[FP:[0-9]+]], 0(3)
+; CHECK: ld 11, 16(3)
+; CHECK: ld 2, 8(3)
+; CHECK-NEXT: mtctr [[FP]]
+; CHECK-NEXT: bctrl
+; CHECK-NEXT: ld 2, 40(1)
+  ret void
+}
+
+; Absolute values should have the TOC restore 'nop'
+define void @test_abs() nounwind {
+; CHECK: test_abs:
+  tail call void inttoptr (i64 1024 to void ()*)() nounwind
+; CHECK: bla 1024
+; CHECK-NEXT: nop
+  ret void
+}
+
+declare double @sin(double) nounwind
+
+; External function calls should also have a 'nop'
+define double @test_external(double %x) nounwind {
+; CHECK: test_external:
+  %call = tail call double @sin(double %x) nounwind
+; CHECK: bl sin
+; CHECK-NEXT: nop
+  ret double %call
+}
diff --git a/test/CodeGen/PowerPC/ppc64-ind-call.ll b/test/CodeGen/PowerPC/ppc64-ind-call.ll
deleted file mode 100644
index d5c4d468c6562..0000000000000
--- a/test/CodeGen/PowerPC/ppc64-ind-call.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -march=ppc64 | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-define void @test1() {
-entry:
-  %call.i75 = call zeroext i8 undef(i8* undef, i8 zeroext 10)
-  unreachable
-}
-
-; CHECK: @test1
-; CHECK: ld 11, 0(3)
-; CHECK: ld 2, 8(3)
-; CHECK: bctrl
-; CHECK: ld 2, 40(1)
-
diff --git a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
index e5aa1f169f640..e1d50bac51a2c 100644
--- a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
+++ b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
@@ -5,6 +5,7 @@
 ; CHECK-NEXT:	.align 3
 ; CHECK-NEXT:	.quad .L.test1
 ; CHECK-NEXT:	.quad .TOC.@tocbase
+; CHECK-NEXT:   .quad 0
 ; CHECK-NEXT:	.text
 ; CHECK-NEXT: .L.test1:
 
diff --git a/test/CodeGen/PowerPC/ppc64-toc.ll b/test/CodeGen/PowerPC/ppc64-toc.ll
new file mode 100644
index 0000000000000..a29bdcb25031f
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-toc.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@double_array = global [32 x double] zeroinitializer, align 8
+@number64 = global i64 10, align 8
+@internal_static_var.x = internal unnamed_addr global i64 0, align 8
+
+define i64 @access_int64(i64 %a) nounwind readonly {
+entry:
+; CHECK: access_int64:
+; CHECK-NEXT: .align  3
+; CHECK-NEXT: .quad   .L.access_int64
+; CHECK-NEXT: .quad   .TOC.@tocbase
+; CHECK-NEXT: .quad   0
+; CHECK-NEXT: .text
+  %0 = load i64* @number64, align 8
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+  %cmp = icmp eq i64 %0, %a
+  %conv1 = zext i1 %cmp to i64 
+  ret i64 %conv1
+}
+
+define i64 @internal_static_var(i64 %a) nounwind {
+entry:
+; CHECK: internal_static_var:
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+  %0 = load i64* @internal_static_var.x, align 8
+  %cmp = icmp eq i64 %0, %a
+  %conv1 = zext i1 %cmp to i64 
+  ret i64 %conv1 
+}
+
+define i32 @access_double(double %a) nounwind readnone {
+entry:
+; CHECK: access_double:
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+  %cmp = fcmp oeq double %a, 2.000000e+00
+  %conv = zext i1 %cmp to i32 
+  ret i32 %conv
+}
+
+
+define i32 @access_double_array(double %a, i32 %i) nounwind readonly {
+entry:
+; CHECK: access_double_array:
+  %idxprom = sext i32 %i to i64
+  %arrayidx = getelementptr inbounds [32 x double]* @double_array, i64 0, i64 %idxprom
+  %0 = load double* %arrayidx, align 8
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+  %cmp = fcmp oeq double %0, %a
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+; Check the creation of 4 .tc entries:
+; * int64_t global 'number64'
+; * double constant 2.0
+; * double array 'double_array'
+; * static int64_t 'x' accessed within '@internal_static_var'
+; CHECK: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
diff --git a/test/CodeGen/PowerPC/ppc64-zext.ll b/test/CodeGen/PowerPC/ppc64-zext.ll
new file mode 100644
index 0000000000000..eb55445cc6c99
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-zext.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux"
+
+define i64 @fun(i32 %arg32) nounwind {
+entry:
+; CHECK: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+  %o = zext i32 %arg32 to i64
+  ret i64 %o
+}
+
diff --git a/test/CodeGen/PowerPC/pr12757.ll b/test/CodeGen/PowerPC/pr12757.ll
new file mode 100644
index 0000000000000..c344656d29834
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr12757.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @__flt_rounds() nounwind {
+entry:
+  %0 = tail call i64 asm sideeffect "mffs $0", "=f"() nounwind
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+; CHECK: @__flt_rounds
+; CHECK: mffs
+
diff --git a/test/CodeGen/PowerPC/pr13641.ll b/test/CodeGen/PowerPC/pr13641.ll
new file mode 100644
index 0000000000000..c4d3f3a9dc60d
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr13641.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo() nounwind {
+  ret void
+}
+
+; CHECK: blr
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .quad 0
diff --git a/test/CodeGen/PowerPC/pr13891.ll b/test/CodeGen/PowerPC/pr13891.ll
new file mode 100644
index 0000000000000..3ae73850a342f
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr13891.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.foo = type { i8, i8 }
+
+define void @_Z5check3foos(%struct.foo* nocapture byval %f, i16 signext %i) noinline {
+; CHECK: _Z5check3foos:
+; CHECK: sth 3, {{[0-9]+}}(1)
+; CHECK: lha {{[0-9]+}}, {{[0-9]+}}(1)
+entry:
+  %0 = bitcast %struct.foo* %f to i16*
+  %1 = load i16* %0, align 2
+  %bf.val.sext = ashr i16 %1, 8
+  %cmp = icmp eq i16 %bf.val.sext, %i
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %conv = sext i16 %bf.val.sext to i32
+  tail call void @exit(i32 %conv)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+declare void @exit(i32)
diff --git a/test/CodeGen/PowerPC/remat-imm.ll b/test/CodeGen/PowerPC/remat-imm.ll
new file mode 100644
index 0000000000000..520921f57a93b
--- /dev/null
+++ b/test/CodeGen/PowerPC/remat-imm.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+; ModuleID = 'test.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+
+@.str = private unnamed_addr constant [6 x i8] c"%d,%d\00", align 1
+
+define i32 @main() nounwind {
+entry:
+; CHECK: li 4, 128
+; CHECK-NOT: mr 4, {{.*}}
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i32 128, i32 128) nounwind
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/PowerPC/structsinmem.ll b/test/CodeGen/PowerPC/structsinmem.ll
new file mode 100644
index 0000000000000..884d3a89d15aa
--- /dev/null
+++ b/test/CodeGen/PowerPC/structsinmem.ll
@@ -0,0 +1,227 @@
+; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s
+
+; FIXME: The code generation for packed structs is very poor because the
+; PowerPC target wrongly rejects all unaligned loads.  This test case will
+; need to be revised when that is fixed.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.s1 = type { i8 }
+%struct.s2 = type { i16 }
+%struct.s4 = type { i32 }
+%struct.t1 = type { i8 }
+%struct.t3 = type <{ i16, i8 }>
+%struct.t5 = type <{ i32, i8 }>
+%struct.t6 = type <{ i32, i16 }>
+%struct.t7 = type <{ i32, i16, i8 }>
+%struct.s3 = type { i16, i8 }
+%struct.s5 = type { i32, i8 }
+%struct.s6 = type { i32, i16 }
+%struct.s7 = type { i32, i16, i8 }
+%struct.t2 = type <{ i16 }>
+%struct.t4 = type <{ i32 }>
+
+@caller1.p1 = private unnamed_addr constant %struct.s1 { i8 1 }, align 1
+@caller1.p2 = private unnamed_addr constant %struct.s2 { i16 2 }, align 2
+@caller1.p3 = private unnamed_addr constant { i16, i8, i8 } { i16 4, i8 8, i8 undef }, align 2
+@caller1.p4 = private unnamed_addr constant %struct.s4 { i32 16 }, align 4
+@caller1.p5 = private unnamed_addr constant { i32, i8, [3 x i8] } { i32 32, i8 64, [3 x i8] undef }, align 4
+@caller1.p6 = private unnamed_addr constant { i32, i16, [2 x i8] } { i32 128, i16 256, [2 x i8] undef }, align 4
+@caller1.p7 = private unnamed_addr constant { i32, i16, i8, i8 } { i32 512, i16 1024, i8 -3, i8 undef }, align 4
+@caller2.p1 = private unnamed_addr constant %struct.t1 { i8 1 }, align 1
+@caller2.p2 = private unnamed_addr constant { i16 } { i16 2 }, align 1
+@caller2.p3 = private unnamed_addr constant %struct.t3 <{ i16 4, i8 8 }>, align 1
+@caller2.p4 = private unnamed_addr constant { i32 } { i32 16 }, align 1
+@caller2.p5 = private unnamed_addr constant %struct.t5 <{ i32 32, i8 64 }>, align 1
+@caller2.p6 = private unnamed_addr constant %struct.t6 <{ i32 128, i16 256 }>, align 1
+@caller2.p7 = private unnamed_addr constant %struct.t7 <{ i32 512, i16 1024, i8 -3 }>, align 1
+
+define i32 @caller1() nounwind {
+entry:
+  %p1 = alloca %struct.s1, align 1
+  %p2 = alloca %struct.s2, align 2
+  %p3 = alloca %struct.s3, align 2
+  %p4 = alloca %struct.s4, align 4
+  %p5 = alloca %struct.s5, align 4
+  %p6 = alloca %struct.s6, align 4
+  %p7 = alloca %struct.s7, align 4
+  %0 = bitcast %struct.s1* %p1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.s1* @caller1.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+  %1 = bitcast %struct.s2* %p2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s2* @caller1.p2 to i8*), i64 2, i32 2, i1 false)
+  %2 = bitcast %struct.s3* %p3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast ({ i16, i8, i8 }* @caller1.p3 to i8*), i64 4, i32 2, i1 false)
+  %3 = bitcast %struct.s4* %p4 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast (%struct.s4* @caller1.p4 to i8*), i64 4, i32 4, i1 false)
+  %4 = bitcast %struct.s5* %p5 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast ({ i32, i8, [3 x i8] }* @caller1.p5 to i8*), i64 8, i32 4, i1 false)
+  %5 = bitcast %struct.s6* %p6 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ({ i32, i16, [2 x i8] }* @caller1.p6 to i8*), i64 8, i32 4, i1 false)
+  %6 = bitcast %struct.s7* %p7 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast ({ i32, i16, i8, i8 }* @caller1.p7 to i8*), i64 8, i32 4, i1 false)
+  %call = call i32 @callee1(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, %struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7)
+  ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 119(1)
+; CHECK: sth {{[0-9]+}}, 126(1)
+; CHECK: stw {{[0-9]+}}, 132(1)
+; CHECK: stw {{[0-9]+}}, 140(1)
+; CHECK: std {{[0-9]+}}, 144(1)
+; CHECK: std {{[0-9]+}}, 152(1)
+; CHECK: std {{[0-9]+}}, 160(1)
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define internal i32 @callee1(i32 %z1, i32 %z2, i32 %z3, i32 %z4, i32 %z5, i32 %z6, i32 %z7, i32 %z8, %struct.s1* byval %v1, %struct.s2* byval %v2, %struct.s3* byval %v3, %struct.s4* byval %v4, %struct.s5* byval %v5, %struct.s6* byval %v6, %struct.s7* byval %v7) nounwind {
+entry:
+  %z1.addr = alloca i32, align 4
+  %z2.addr = alloca i32, align 4
+  %z3.addr = alloca i32, align 4
+  %z4.addr = alloca i32, align 4
+  %z5.addr = alloca i32, align 4
+  %z6.addr = alloca i32, align 4
+  %z7.addr = alloca i32, align 4
+  %z8.addr = alloca i32, align 4
+  store i32 %z1, i32* %z1.addr, align 4
+  store i32 %z2, i32* %z2.addr, align 4
+  store i32 %z3, i32* %z3.addr, align 4
+  store i32 %z4, i32* %z4.addr, align 4
+  store i32 %z5, i32* %z5.addr, align 4
+  store i32 %z6, i32* %z6.addr, align 4
+  store i32 %z7, i32* %z7.addr, align 4
+  store i32 %z8, i32* %z8.addr, align 4
+  %a = getelementptr inbounds %struct.s1* %v1, i32 0, i32 0
+  %0 = load i8* %a, align 1
+  %conv = zext i8 %0 to i32
+  %a1 = getelementptr inbounds %struct.s2* %v2, i32 0, i32 0
+  %1 = load i16* %a1, align 2
+  %conv2 = sext i16 %1 to i32
+  %add = add nsw i32 %conv, %conv2
+  %a3 = getelementptr inbounds %struct.s3* %v3, i32 0, i32 0
+  %2 = load i16* %a3, align 2
+  %conv4 = sext i16 %2 to i32
+  %add5 = add nsw i32 %add, %conv4
+  %a6 = getelementptr inbounds %struct.s4* %v4, i32 0, i32 0
+  %3 = load i32* %a6, align 4
+  %add7 = add nsw i32 %add5, %3
+  %a8 = getelementptr inbounds %struct.s5* %v5, i32 0, i32 0
+  %4 = load i32* %a8, align 4
+  %add9 = add nsw i32 %add7, %4
+  %a10 = getelementptr inbounds %struct.s6* %v6, i32 0, i32 0
+  %5 = load i32* %a10, align 4
+  %add11 = add nsw i32 %add9, %5
+  %a12 = getelementptr inbounds %struct.s7* %v7, i32 0, i32 0
+  %6 = load i32* %a12, align 4
+  %add13 = add nsw i32 %add11, %6
+  ret i32 %add13
+
+; CHECK: lha {{[0-9]+}}, 126(1)
+; CHECK: lbz {{[0-9]+}}, 119(1)
+; CHECK: lha {{[0-9]+}}, 132(1)
+; CHECK: lwz {{[0-9]+}}, 140(1)
+; CHECK: lwz {{[0-9]+}}, 144(1)
+; CHECK: lwz {{[0-9]+}}, 152(1)
+; CHECK: lwz {{[0-9]+}}, 160(1)
+}
+
+define i32 @caller2() nounwind {
+entry:
+  %p1 = alloca %struct.t1, align 1
+  %p2 = alloca %struct.t2, align 1
+  %p3 = alloca %struct.t3, align 1
+  %p4 = alloca %struct.t4, align 1
+  %p5 = alloca %struct.t5, align 1
+  %p6 = alloca %struct.t6, align 1
+  %p7 = alloca %struct.t7, align 1
+  %0 = bitcast %struct.t1* %p1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.t1* @caller2.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+  %1 = bitcast %struct.t2* %p2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i16 }* @caller2.p2 to i8*), i64 2, i32 1, i1 false)
+  %2 = bitcast %struct.t3* %p3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.t3* @caller2.p3 to i8*), i64 3, i32 1, i1 false)
+  %3 = bitcast %struct.t4* %p4 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast ({ i32 }* @caller2.p4 to i8*), i64 4, i32 1, i1 false)
+  %4 = bitcast %struct.t5* %p5 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.t5* @caller2.p5 to i8*), i64 5, i32 1, i1 false)
+  %5 = bitcast %struct.t6* %p6 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast (%struct.t6* @caller2.p6 to i8*), i64 6, i32 1, i1 false)
+  %6 = bitcast %struct.t7* %p7 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.t7* @caller2.p7 to i8*), i64 7, i32 1, i1 false)
+  %call = call i32 @callee2(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, %struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* byval %p6, %struct.t7* byval %p7)
+  ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 119(1)
+; CHECK: sth {{[0-9]+}}, 126(1)
+; CHECK: stb {{[0-9]+}}, 135(1)
+; CHECK: sth {{[0-9]+}}, 133(1)
+; CHECK: stw {{[0-9]+}}, 140(1)
+; CHECK: stb {{[0-9]+}}, 151(1)
+; CHECK: stw {{[0-9]+}}, 147(1)
+; CHECK: sth {{[0-9]+}}, 158(1)
+; CHECK: stw {{[0-9]+}}, 154(1)
+; CHECK: stb {{[0-9]+}}, 167(1)
+; CHECK: sth {{[0-9]+}}, 165(1)
+; CHECK: stw {{[0-9]+}}, 161(1)
+}
+
+define internal i32 @callee2(i32 %z1, i32 %z2, i32 %z3, i32 %z4, i32 %z5, i32 %z6, i32 %z7, i32 %z8, %struct.t1* byval %v1, %struct.t2* byval %v2, %struct.t3* byval %v3, %struct.t4* byval %v4, %struct.t5* byval %v5, %struct.t6* byval %v6, %struct.t7* byval %v7) nounwind {
+entry:
+  %z1.addr = alloca i32, align 4
+  %z2.addr = alloca i32, align 4
+  %z3.addr = alloca i32, align 4
+  %z4.addr = alloca i32, align 4
+  %z5.addr = alloca i32, align 4
+  %z6.addr = alloca i32, align 4
+  %z7.addr = alloca i32, align 4
+  %z8.addr = alloca i32, align 4
+  store i32 %z1, i32* %z1.addr, align 4
+  store i32 %z2, i32* %z2.addr, align 4
+  store i32 %z3, i32* %z3.addr, align 4
+  store i32 %z4, i32* %z4.addr, align 4
+  store i32 %z5, i32* %z5.addr, align 4
+  store i32 %z6, i32* %z6.addr, align 4
+  store i32 %z7, i32* %z7.addr, align 4
+  store i32 %z8, i32* %z8.addr, align 4
+  %a = getelementptr inbounds %struct.t1* %v1, i32 0, i32 0
+  %0 = load i8* %a, align 1
+  %conv = zext i8 %0 to i32
+  %a1 = getelementptr inbounds %struct.t2* %v2, i32 0, i32 0
+  %1 = load i16* %a1, align 1
+  %conv2 = sext i16 %1 to i32
+  %add = add nsw i32 %conv, %conv2
+  %a3 = getelementptr inbounds %struct.t3* %v3, i32 0, i32 0
+  %2 = load i16* %a3, align 1
+  %conv4 = sext i16 %2 to i32
+  %add5 = add nsw i32 %add, %conv4
+  %a6 = getelementptr inbounds %struct.t4* %v4, i32 0, i32 0
+  %3 = load i32* %a6, align 1
+  %add7 = add nsw i32 %add5, %3
+  %a8 = getelementptr inbounds %struct.t5* %v5, i32 0, i32 0
+  %4 = load i32* %a8, align 1
+  %add9 = add nsw i32 %add7, %4
+  %a10 = getelementptr inbounds %struct.t6* %v6, i32 0, i32 0
+  %5 = load i32* %a10, align 1
+  %add11 = add nsw i32 %add9, %5
+  %a12 = getelementptr inbounds %struct.t7* %v7, i32 0, i32 0
+  %6 = load i32* %a12, align 1
+  %add13 = add nsw i32 %add11, %6
+  ret i32 %add13
+
+; CHECK: lbz {{[0-9]+}}, 149(1)
+; CHECK: lbz {{[0-9]+}}, 150(1)
+; CHECK: lbz {{[0-9]+}}, 147(1)
+; CHECK: lbz {{[0-9]+}}, 148(1)
+; CHECK: lbz {{[0-9]+}}, 133(1)
+; CHECK: lbz {{[0-9]+}}, 134(1)
+; CHECK: lha {{[0-9]+}}, 126(1)
+; CHECK: lbz {{[0-9]+}}, 119(1)
+; CHECK: lwz {{[0-9]+}}, 140(1)
+; CHECK: lhz {{[0-9]+}}, 154(1)
+; CHECK: lhz {{[0-9]+}}, 156(1)
+; CHECK: lbz {{[0-9]+}}, 163(1)
+; CHECK: lbz {{[0-9]+}}, 164(1)
+; CHECK: lbz {{[0-9]+}}, 161(1)
+; CHECK: lbz {{[0-9]+}}, 162(1)
+}
diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll
new file mode 100644
index 0000000000000..ef706af95d659
--- /dev/null
+++ b/test/CodeGen/PowerPC/structsinregs.ll
@@ -0,0 +1,213 @@
+; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s
+
+; FIXME: The code generation for packed structs is very poor because the
+; PowerPC target wrongly rejects all unaligned loads.  This test case will
+; need to be revised when that is fixed.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.s1 = type { i8 }
+%struct.s2 = type { i16 }
+%struct.s4 = type { i32 }
+%struct.t1 = type { i8 }
+%struct.t3 = type <{ i16, i8 }>
+%struct.t5 = type <{ i32, i8 }>
+%struct.t6 = type <{ i32, i16 }>
+%struct.t7 = type <{ i32, i16, i8 }>
+%struct.s3 = type { i16, i8 }
+%struct.s5 = type { i32, i8 }
+%struct.s6 = type { i32, i16 }
+%struct.s7 = type { i32, i16, i8 }
+%struct.t2 = type <{ i16 }>
+%struct.t4 = type <{ i32 }>
+
+@caller1.p1 = private unnamed_addr constant %struct.s1 { i8 1 }, align 1
+@caller1.p2 = private unnamed_addr constant %struct.s2 { i16 2 }, align 2
+@caller1.p3 = private unnamed_addr constant { i16, i8, i8 } { i16 4, i8 8, i8 undef }, align 2
+@caller1.p4 = private unnamed_addr constant %struct.s4 { i32 16 }, align 4
+@caller1.p5 = private unnamed_addr constant { i32, i8, [3 x i8] } { i32 32, i8 64, [3 x i8] undef }, align 4
+@caller1.p6 = private unnamed_addr constant { i32, i16, [2 x i8] } { i32 128, i16 256, [2 x i8] undef }, align 4
+@caller1.p7 = private unnamed_addr constant { i32, i16, i8, i8 } { i32 512, i16 1024, i8 -3, i8 undef }, align 4
+@caller2.p1 = private unnamed_addr constant %struct.t1 { i8 1 }, align 1
+@caller2.p2 = private unnamed_addr constant { i16 } { i16 2 }, align 1
+@caller2.p3 = private unnamed_addr constant %struct.t3 <{ i16 4, i8 8 }>, align 1
+@caller2.p4 = private unnamed_addr constant { i32 } { i32 16 }, align 1
+@caller2.p5 = private unnamed_addr constant %struct.t5 <{ i32 32, i8 64 }>, align 1
+@caller2.p6 = private unnamed_addr constant %struct.t6 <{ i32 128, i16 256 }>, align 1
+@caller2.p7 = private unnamed_addr constant %struct.t7 <{ i32 512, i16 1024, i8 -3 }>, align 1
+
+define i32 @caller1() nounwind {
+entry:
+  %p1 = alloca %struct.s1, align 1
+  %p2 = alloca %struct.s2, align 2
+  %p3 = alloca %struct.s3, align 2
+  %p4 = alloca %struct.s4, align 4
+  %p5 = alloca %struct.s5, align 4
+  %p6 = alloca %struct.s6, align 4
+  %p7 = alloca %struct.s7, align 4
+  %0 = bitcast %struct.s1* %p1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.s1* @caller1.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+  %1 = bitcast %struct.s2* %p2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s2* @caller1.p2 to i8*), i64 2, i32 2, i1 false)
+  %2 = bitcast %struct.s3* %p3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast ({ i16, i8, i8 }* @caller1.p3 to i8*), i64 4, i32 2, i1 false)
+  %3 = bitcast %struct.s4* %p4 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast (%struct.s4* @caller1.p4 to i8*), i64 4, i32 4, i1 false)
+  %4 = bitcast %struct.s5* %p5 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast ({ i32, i8, [3 x i8] }* @caller1.p5 to i8*), i64 8, i32 4, i1 false)
+  %5 = bitcast %struct.s6* %p6 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ({ i32, i16, [2 x i8] }* @caller1.p6 to i8*), i64 8, i32 4, i1 false)
+  %6 = bitcast %struct.s7* %p7 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast ({ i32, i16, i8, i8 }* @caller1.p7 to i8*), i64 8, i32 4, i1 false)
+  %call = call i32 @callee1(%struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7)
+  ret i32 %call
+
+; CHECK: ld 9, 128(31)
+; CHECK: ld 8, 136(31)
+; CHECK: ld 7, 144(31)
+; CHECK: lwz 6, 152(31)
+; CHECK: lwz 5, 160(31)
+; CHECK: lhz 4, 168(31)
+; CHECK: lbz 3, 176(31)
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define internal i32 @callee1(%struct.s1* byval %v1, %struct.s2* byval %v2, %struct.s3* byval %v3, %struct.s4* byval %v4, %struct.s5* byval %v5, %struct.s6* byval %v6, %struct.s7* byval %v7) nounwind {
+entry:
+  %a = getelementptr inbounds %struct.s1* %v1, i32 0, i32 0
+  %0 = load i8* %a, align 1
+  %conv = zext i8 %0 to i32
+  %a1 = getelementptr inbounds %struct.s2* %v2, i32 0, i32 0
+  %1 = load i16* %a1, align 2
+  %conv2 = sext i16 %1 to i32
+  %add = add nsw i32 %conv, %conv2
+  %a3 = getelementptr inbounds %struct.s3* %v3, i32 0, i32 0
+  %2 = load i16* %a3, align 2
+  %conv4 = sext i16 %2 to i32
+  %add5 = add nsw i32 %add, %conv4
+  %a6 = getelementptr inbounds %struct.s4* %v4, i32 0, i32 0
+  %3 = load i32* %a6, align 4
+  %add7 = add nsw i32 %add5, %3
+  %a8 = getelementptr inbounds %struct.s5* %v5, i32 0, i32 0
+  %4 = load i32* %a8, align 4
+  %add9 = add nsw i32 %add7, %4
+  %a10 = getelementptr inbounds %struct.s6* %v6, i32 0, i32 0
+  %5 = load i32* %a10, align 4
+  %add11 = add nsw i32 %add9, %5
+  %a12 = getelementptr inbounds %struct.s7* %v7, i32 0, i32 0
+  %6 = load i32* %a12, align 4
+  %add13 = add nsw i32 %add11, %6
+  ret i32 %add13
+
+; CHECK: std 9, 96(1)
+; CHECK: std 8, 88(1)
+; CHECK: std 7, 80(1)
+; CHECK: stw 6, 76(1)
+; CHECK: stw 5, 68(1)
+; CHECK: sth 4, 62(1)
+; CHECK: stb 3, 55(1)
+; CHECK: lha {{[0-9]+}}, 62(1)
+; CHECK: lbz {{[0-9]+}}, 55(1)
+; CHECK: lha {{[0-9]+}}, 68(1)
+; CHECK: lwz {{[0-9]+}}, 76(1)
+; CHECK: lwz {{[0-9]+}}, 80(1)
+; CHECK: lwz {{[0-9]+}}, 88(1)
+; CHECK: lwz {{[0-9]+}}, 96(1)
+}
+
+define i32 @caller2() nounwind {
+entry:
+  %p1 = alloca %struct.t1, align 1
+  %p2 = alloca %struct.t2, align 1
+  %p3 = alloca %struct.t3, align 1
+  %p4 = alloca %struct.t4, align 1
+  %p5 = alloca %struct.t5, align 1
+  %p6 = alloca %struct.t6, align 1
+  %p7 = alloca %struct.t7, align 1
+  %0 = bitcast %struct.t1* %p1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.t1* @caller2.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+  %1 = bitcast %struct.t2* %p2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i16 }* @caller2.p2 to i8*), i64 2, i32 1, i1 false)
+  %2 = bitcast %struct.t3* %p3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.t3* @caller2.p3 to i8*), i64 3, i32 1, i1 false)
+  %3 = bitcast %struct.t4* %p4 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast ({ i32 }* @caller2.p4 to i8*), i64 4, i32 1, i1 false)
+  %4 = bitcast %struct.t5* %p5 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.t5* @caller2.p5 to i8*), i64 5, i32 1, i1 false)
+  %5 = bitcast %struct.t6* %p6 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast (%struct.t6* @caller2.p6 to i8*), i64 6, i32 1, i1 false)
+  %6 = bitcast %struct.t7* %p7 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.t7* @caller2.p7 to i8*), i64 7, i32 1, i1 false)
+  %call = call i32 @callee2(%struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* byval %p6, %struct.t7* byval %p7)
+  ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 71(1)
+; CHECK: sth {{[0-9]+}}, 69(1)
+; CHECK: stb {{[0-9]+}}, 87(1)
+; CHECK: stw {{[0-9]+}}, 83(1)
+; CHECK: sth {{[0-9]+}}, 94(1)
+; CHECK: stw {{[0-9]+}}, 90(1)
+; CHECK: stb {{[0-9]+}}, 103(1)
+; CHECK: sth {{[0-9]+}}, 101(1)
+; CHECK: stw {{[0-9]+}}, 97(1)
+; CHECK: ld 9, 96(1)
+; CHECK: ld 8, 88(1)
+; CHECK: ld 7, 80(1)
+; CHECK: lwz 6, 152(31)
+; CHECK: ld 5, 64(1)
+; CHECK: lhz 4, 168(31)
+; CHECK: lbz 3, 176(31)
+}
+
+define internal i32 @callee2(%struct.t1* byval %v1, %struct.t2* byval %v2, %struct.t3* byval %v3, %struct.t4* byval %v4, %struct.t5* byval %v5, %struct.t6* byval %v6, %struct.t7* byval %v7) nounwind {
+entry:
+  %a = getelementptr inbounds %struct.t1* %v1, i32 0, i32 0
+  %0 = load i8* %a, align 1
+  %conv = zext i8 %0 to i32
+  %a1 = getelementptr inbounds %struct.t2* %v2, i32 0, i32 0
+  %1 = load i16* %a1, align 1
+  %conv2 = sext i16 %1 to i32
+  %add = add nsw i32 %conv, %conv2
+  %a3 = getelementptr inbounds %struct.t3* %v3, i32 0, i32 0
+  %2 = load i16* %a3, align 1
+  %conv4 = sext i16 %2 to i32
+  %add5 = add nsw i32 %add, %conv4
+  %a6 = getelementptr inbounds %struct.t4* %v4, i32 0, i32 0
+  %3 = load i32* %a6, align 1
+  %add7 = add nsw i32 %add5, %3
+  %a8 = getelementptr inbounds %struct.t5* %v5, i32 0, i32 0
+  %4 = load i32* %a8, align 1
+  %add9 = add nsw i32 %add7, %4
+  %a10 = getelementptr inbounds %struct.t6* %v6, i32 0, i32 0
+  %5 = load i32* %a10, align 1
+  %add11 = add nsw i32 %add9, %5
+  %a12 = getelementptr inbounds %struct.t7* %v7, i32 0, i32 0
+  %6 = load i32* %a12, align 1
+  %add13 = add nsw i32 %add11, %6
+  ret i32 %add13
+
+; CHECK: std 9, 96(1)
+; CHECK: std 8, 88(1)
+; CHECK: std 7, 80(1)
+; CHECK: stw 6, 76(1)
+; CHECK: std 5, 64(1)
+; CHECK: sth 4, 62(1)
+; CHECK: stb 3, 55(1)
+; CHECK: lbz {{[0-9]+}}, 85(1)
+; CHECK: lbz {{[0-9]+}}, 86(1)
+; CHECK: lbz {{[0-9]+}}, 83(1)
+; CHECK: lbz {{[0-9]+}}, 84(1)
+; CHECK: lbz {{[0-9]+}}, 69(1)
+; CHECK: lbz {{[0-9]+}}, 70(1)
+; CHECK: lha {{[0-9]+}}, 62(1)
+; CHECK: lbz {{[0-9]+}}, 55(1)
+; CHECK: lwz {{[0-9]+}}, 76(1)
+; CHECK: lhz {{[0-9]+}}, 90(1)
+; CHECK: lhz {{[0-9]+}}, 92(1)
+; CHECK: lbz {{[0-9]+}}, 99(1)
+; CHECK: lbz {{[0-9]+}}, 100(1)
+; CHECK: lbz {{[0-9]+}}, 97(1)
+; CHECK: lbz {{[0-9]+}}, 98(1)
+}
diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll
new file mode 100644
index 0000000000000..fb1835f580b27
--- /dev/null
+++ b/test/CodeGen/PowerPC/varargs-struct-float.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.Sf1 = type { float }
+
+define void @foo(float inreg %s.coerce) nounwind {
+entry:
+  %s = alloca %struct.Sf1, align 4
+  %coerce.dive = getelementptr %struct.Sf1* %s, i32 0, i32 0
+  store float %s.coerce, float* %coerce.dive, align 1
+  %coerce.dive1 = getelementptr %struct.Sf1* %s, i32 0, i32 0
+  %0 = load float* %coerce.dive1, align 1
+  call void (i32, ...)* @testvaSf1(i32 1, float inreg %0)
+  ret void
+}
+
+; CHECK: stfs {{[0-9]+}}, 60(1)
+; CHECK: ld 4, 56(1)
+; CHECK: bl
+
+declare void @testvaSf1(i32, ...)
diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
new file mode 100644
index 0000000000000..3180f464d1250
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_cmp.ll
@@ -0,0 +1,527 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+
+; Check vector comparisons using altivec. For non-native types, only a basic
+; comparison instruction check is done. For altivec-supported types (16i8,
+; 8i16, 4i32, and 4f32) all the comparison operators (==, !=, >, >=, <, <=)
+; are checked.
+
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define <2 x i8> @v2si8_cmp(<2 x i8> %x, <2 x i8> %y) nounwind readnone {
+  %cmp = icmp eq <2 x i8> %x, %y
+  %sext = sext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %sext
+}
+; CHECK: v2si8_cmp:
+; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <4 x i8> @v4si8_cmp(<4 x i8> %x, <4 x i8> %y) nounwind readnone {
+  %cmp = icmp eq <4 x i8> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %sext
+}
+; CHECK: v4si8_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone {
+  %cmp = icmp eq <8 x i8> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %sext
+}
+; CHECK: v8si8_cmp:
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+; Additional tests for v16i8 since it is an altivec native type
+
+define <16 x i8> @v16si8_cmp_eq(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+  %cmp = icmp eq <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_eq:
+; CHECK: vcmpequb 2, 2, 3
+
+define <16 x i8> @v16si8_cmp_ne(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp ne <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:     v16si8_cmp_ne:
+; CHECK:     vcmpequb [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <16 x i8> @v16si8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp sle <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:      v16si8_cmp_le:
+; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsb [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <16 x i8> @v16ui8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp ule <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:      v16ui8_cmp_le:
+; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtub [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <16 x i8> @v16si8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp slt <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_lt:
+; CHECK: vcmpgtsb 2, 3, 2
+
+define <16 x i8> @v16ui8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp ult <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16ui8_cmp_lt:
+; CHECK: vcmpgtub 2, 3, 2
+
+define <16 x i8> @v16si8_cmp_gt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp sgt <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_gt:
+; CHECK: vcmpgtsb 2, 2, 3
+
+define <16 x i8> @v16ui8_cmp_gt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp ugt <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16ui8_cmp_gt:
+; CHECK: vcmpgtub 2, 2, 3
+
+define <16 x i8> @v16si8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp sge <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:      v16si8_cmp_ge:
+; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsb [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+define <16 x i8> @v16ui8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp uge <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:      v16ui8_cmp_ge:
+; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtub [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+
+define <32 x i8> @v32si8_cmp(<32 x i8> %x, <32 x i8> %y) nounwind readnone {
+  %cmp = icmp eq <32 x i8> %x, %y
+  %sext = sext <32 x i1> %cmp to <32 x i8>
+  ret <32 x i8> %sext
+}
+; CHECK: v32si8_cmp:
+; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <2 x i16> @v2si16_cmp(<2 x i16> %x, <2 x i16> %y) nounwind readnone {
+  %cmp = icmp eq <2 x i16> %x, %y
+  %sext = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %sext
+}
+; CHECK: v2si16_cmp:
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <4 x i16> @v4si16_cmp(<4 x i16> %x, <4 x i16> %y) nounwind readnone {
+  %cmp = icmp eq <4 x i16> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %sext
+}
+; CHECK: v4si16_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+; Additional tests for v8i16 since it is an altivec native type
+
+define <8 x i16> @v8si16_cmp_eq(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp eq <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_eq:
+; CHECK: vcmpequh 2, 2, 3
+
+define <8 x i16> @v8si16_cmp_ne(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp ne <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8si16_cmp_ne:
+; CHECK:      vcmpequh [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <8 x i16> @v8si16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp sle <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8si16_cmp_le:
+; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsh [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <8 x i16> @v8ui16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp ule <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8ui16_cmp_le:
+; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuh [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <8 x i16> @v8si16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp slt <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_lt:
+; CHECK: vcmpgtsh 2, 3, 2
+
+define <8 x i16> @v8ui16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp ult <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8ui16_cmp_lt:
+; CHECK: vcmpgtuh 2, 3, 2
+
+define <8 x i16> @v8si16_cmp_gt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp sgt <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_gt:
+; CHECK: vcmpgtsh 2, 2, 3
+
+define <8 x i16> @v8ui16_cmp_gt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp ugt <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8ui16_cmp_gt:
+; CHECK: vcmpgtuh 2, 2, 3
+
+define <8 x i16> @v8si16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp sge <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8si16_cmp_ge:
+; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsh [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+define <8 x i16> @v8ui16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp uge <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8ui16_cmp_ge:
+; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuh [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+
+define <16 x i16> @v16si16_cmp(<16 x i16> %x, <16 x i16> %y) nounwind readnone {
+  %cmp = icmp eq <16 x i16> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %sext
+}
+; CHECK: v16si16_cmp:
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <32 x i16> @v32si16_cmp(<32 x i16> %x, <32 x i16> %y) nounwind readnone {
+  %cmp = icmp eq <32 x i16> %x, %y
+  %sext = sext <32 x i1> %cmp to <32 x i16>
+  ret <32 x i16> %sext
+}
+; CHECK: v32si16_cmp:
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <2 x i32> @v2si32_cmp(<2 x i32> %x, <2 x i32> %y) nounwind readnone {
+  %cmp = icmp eq <2 x i32> %x, %y
+  %sext = sext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %sext
+}
+; CHECK: v2si32_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+; Additional tests for v4i32 since it is an altivec native type
+
+define <4 x i32> @v4si32_cmp_eq(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp eq <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_eq:
+; CHECK: vcmpequw 2, 2, 3
+
+define <4 x i32> @v4si32_cmp_ne(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp ne <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4si32_cmp_ne:
+; CHECK:      vcmpequw [[RCMP:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RCMP]], [[RCMP]]
+
+define <4 x i32> @v4si32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp sle <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4si32_cmp_le:
+; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsw [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x i32> @v4ui32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp ule <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4ui32_cmp_le:
+; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuw [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x i32> @v4si32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp slt <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_lt:
+; CHECK: vcmpgtsw 2, 3, 2
+
+define <4 x i32> @v4ui32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp ult <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4ui32_cmp_lt:
+; CHECK: vcmpgtuw 2, 3, 2
+
+define <4 x i32> @v4si32_cmp_gt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp sgt <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_gt:
+; CHECK: vcmpgtsw 2, 2, 3
+
+define <4 x i32> @v4ui32_cmp_gt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp ugt <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4ui32_cmp_gt:
+; CHECK: vcmpgtuw 2, 2, 3
+
+define <4 x i32> @v4si32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp sge <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4si32_cmp_ge:
+; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsw [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+define <4 x i32> @v4ui32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp uge <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4ui32_cmp_ge:
+; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuw [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+
+define <8 x i32> @v8si32_cmp(<8 x i32> %x, <8 x i32> %y) nounwind readnone {
+  %cmp = icmp eq <8 x i32> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %sext
+}
+; CHECK: v8si32_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <16 x i32> @v16si32_cmp(<16 x i32> %x, <16 x i32> %y) nounwind readnone {
+  %cmp = icmp eq <16 x i32> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %sext
+}
+; CHECK: v16si32_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <32 x i32> @v32si32_cmp(<32 x i32> %x, <32 x i32> %y) nounwind readnone {
+  %cmp = icmp eq <32 x i32> %x, %y
+  %sext = sext <32 x i1> %cmp to <32 x i32>
+  ret <32 x i32> %sext
+}
+; CHECK: v32si32_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <2 x float> @v2f32_cmp(<2 x float> %x, <2 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp oeq <2 x float> %x, %y
+  %sext = sext <2 x i1> %cmp to <2 x i32>
+  %0 = bitcast <2 x i32> %sext to <2 x float>
+  ret <2 x float> %0
+}
+; CHECK: v2f32_cmp:
+; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+; Additional tests for v4f32 since it is an altivec native type
+
+define <4 x float> @v4f32_cmp_eq(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp oeq <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_eq:
+; CHECK: vcmpeqfp 2, 2, 3
+
+define <4 x float> @v4f32_cmp_ne(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp une <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK:      v4f32_cmp_ne:
+; CHECK:      vcmpeqfp [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_le(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ole <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK:      v4f32_cmp_le:
+; CHECK:      vcmpeqfp [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtfp [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x float> @v4f32_cmp_lt(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp olt <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_lt:
+; CHECK: vcmpgtfp 2, 3, 2
+
+define <4 x float> @v4f32_cmp_ge(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp oge <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_ge:
+; CHECK: vcmpgefp 2, 2, 3
+
+define <4 x float> @v4f32_cmp_gt(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ogt <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_gt:
+; CHECK: vcmpgtfp 2, 2, 3
+
+
+define <8 x float> @v8f32_cmp(<8 x float> %x, <8 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp oeq <8 x float> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i32>
+  %0 = bitcast <8 x i32> %sext to <8 x float>
+  ret <8 x float> %0
+}
+; CHECK: v8f32_cmp:
+; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/PowerPC/vec_conv.ll b/test/CodeGen/PowerPC/vec_conv.ll
new file mode 100644
index 0000000000000..a475e9499df22
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mattr=+altivec < %s | FileCheck %s
+
+; Check vector float/int conversion using altivec.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@cte_float = global <4 x float> <float 6.5e+00, float 6.5e+00, float 6.5e+00, float 6.5e+00>, align 16
+@cte_int = global <4 x i32> <i32 6, i32 6, i32 6, i32 6>, align 16
+
+
+define void @v4f32_to_v4i32(<4 x float> %x, <4 x i32>* nocapture %y) nounwind {
+entry:
+  %0 = load <4 x float>* @cte_float, align 16
+  %mul = fmul <4 x float> %0, %x
+  %1 = fptosi <4 x float> %mul to <4 x i32>
+  store <4 x i32> %1, <4 x i32>* %y, align 16
+  ret void
+}
+;CHECK: v4f32_to_v4i32:
+;CHECK: vctsxs {{[0-9]+}}, {{[0-9]+}}, 0
+
+
+define void @v4f32_to_v4u32(<4 x float> %x, <4 x i32>* nocapture %y) nounwind {
+entry:
+  %0 = load <4 x float>* @cte_float, align 16
+  %mul = fmul <4 x float> %0, %x
+  %1 = fptoui <4 x float> %mul to <4 x i32>
+  store <4 x i32> %1, <4 x i32>* %y, align 16
+  ret void
+}
+;CHECK: v4f32_to_v4u32:
+;CHECK: vctuxs {{[0-9]+}}, {{[0-9]+}}, 0
+
+
+define void @v4i32_to_v4f32(<4 x i32> %x, <4 x float>* nocapture %y) nounwind {
+entry:
+  %0 = load <4 x i32>* @cte_int, align 16
+  %mul = mul <4 x i32> %0, %x
+  %1 = sitofp <4 x i32> %mul to <4 x float>
+  store <4 x float> %1, <4 x float>* %y, align 16
+  ret void
+}
+;CHECK: v4i32_to_v4f32:
+;CHECK: vcfsx {{[0-9]+}}, {{[0-9]+}}, 0
+
+
+define void @v4u32_to_v4f32(<4 x i32> %x, <4 x float>* nocapture %y) nounwind {
+entry:
+  %0 = load <4 x i32>* @cte_int, align 16
+  %mul = mul <4 x i32> %0, %x
+  %1 = uitofp <4 x i32> %mul to <4 x float>
+  store <4 x float> %1, <4 x float>* %y, align 16
+  ret void
+}
+;CHECK: v4u32_to_v4f32:
+;CHECK: vcfux {{[0-9]+}}, {{[0-9]+}}, 0
diff --git a/test/CodeGen/PowerPC/vec_extload.ll b/test/CodeGen/PowerPC/vec_extload.ll
new file mode 100644
index 0000000000000..201c15b9c7359
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_extload.ll
@@ -0,0 +1,155 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+
+; Check vector extend load expansion with altivec enabled.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Altivec does not provide an sext instruction, so it expands to
+; a set of vector stores (stvx), byte load/sign extend/store
+; (lbz/stb), and a final vector load (lvx) to load the resulting
+; extended vector.
+define <16 x i8> @v16si8_sext_in_reg(<16 x i8> %a) {
+  %b = trunc <16 x i8> %a to <16 x i4>
+  %c = sext <16 x i4> %b to <16 x i8>
+  ret <16 x i8> %c
+}
+; CHECK: v16si8_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; The zero extend uses cleverer logic: a vector splat
+; and a logical AND to set the higher bits to 0.
+define <16 x i8> @v16si8_zext_in_reg(<16 x i8> %a) {
+  %b = trunc <16 x i8> %a to <16 x i4>
+  %c = zext <16 x i4> %b to <16 x i8>
+  ret <16 x i8> %c
+}
+; CHECK:      v16si8_zext_in_reg:
+; CHECK:      vspltisb [[VMASK:[0-9]+]], 15
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
+
+; Same as v16si8_sext_in_reg, expands to load/store halfwords (lhz/sth).
+define <8 x i16> @v8si16_sext_in_reg(<8 x i16> %a) {
+  %b = trunc <8 x i16> %a to <8 x i8>
+  %c = sext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %c
+}
+; CHECK: v8si16_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; Same as v16si8_zext_in_reg, but instead of creating the mask
+; with a splat, loads it from memory.
+define <8 x i16> @v8si16_zext_in_reg(<8 x i16> %a) {
+  %b = trunc <8 x i16> %a to <8 x i8>
+  %c = zext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %c
+}
+; CHECK:      v8si16_zext_in_reg:
+; CHECK:      ld [[RMASKTOC:[0-9]+]], .LC{{[0-9]+}}@toc(2)
+; CHECK-NEXT: lvx [[VMASK:[0-9]+]], {{[0-9]+}}, [[RMASKTOC]]
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
+
+; Same as v16si8_sext_in_reg, expands to load halfword (lha) and
+; store words (stw).
+define <4 x i32> @v4si32_sext_in_reg(<4 x i32> %a) {
+  %b = trunc <4 x i32> %a to <4 x i16>
+  %c = sext <4 x i16> %b to <4 x i32>
+  ret <4 x i32> %c
+}
+; CHECK: v4si32_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; Same as v16si8_zext_in_reg, but the mask is built with a splat and a shift.
+define <4 x i32> @v4si32_zext_in_reg(<4 x i32> %a) {
+  %b = trunc <4 x i32> %a to <4 x i16>
+  %c = zext <4 x i16> %b to <4 x i32>
+  ret <4 x i32> %c
+}
+; CHECK:      v4si32_zext_in_reg:
+; CHECK:      vspltisw [[VMASK:[0-9]+]], -16
+; CHECK-NEXT: vsrw [[VMASK]], [[VMASK]], [[VMASK]]
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
diff --git a/test/CodeGen/PowerPC/vec_sqrt.ll b/test/CodeGen/PowerPC/vec_sqrt.ll
new file mode 100644
index 0000000000000..055da1a229d17
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_sqrt.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec,+fsqrt < %s | FileCheck %s
+
+; Check for vector sqrt expansion using floating-point types, since altivec
+; does not provide an fsqrt instruction for vectors.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %val)
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float> %val)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %val)
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double> %val)
+
+define <2 x float> @v2f32_sqrt(<2 x float> %x) nounwind readnone {
+entry:
+  %sqrt = call <2 x float> @llvm.sqrt.v2f32 (<2 x float> %x)
+  ret <2 x float> %sqrt
+}
+; sqrt (<2 x float>) is promoted to sqrt (<4 x float>)
+; CHECK: v2f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x float> @v4f32_sqrt(<4 x float> %x) nounwind readnone {
+entry:
+  %sqrt = call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %x)
+  ret <4 x float> %sqrt
+}
+; CHECK: v4f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <8 x float> @v8f32_sqrt(<8 x float> %x) nounwind readnone {
+entry:
+  %sqrt = call <8 x float> @llvm.sqrt.v8f32 (<8 x float> %x)
+  ret <8 x float> %sqrt
+}
+; CHECK: v8f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <2 x double> @v2f64_sqrt(<2 x double> %x) nounwind readnone {
+entry:
+  %sqrt = call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %x)
+  ret <2 x double> %sqrt
+}
+; CHECK: v2f64_sqrt:
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x double> @v4f64_sqrt(<4 x double> %x) nounwind readnone {
+entry:
+  %sqrt = call <4 x double> @llvm.sqrt.v4f64 (<4 x double> %x)
+  ret <4 x double> %sqrt
+}
+; CHECK: v4f64_sqrt:
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/PowerPC/vrspill.ll b/test/CodeGen/PowerPC/vrspill.ll
new file mode 100644
index 0000000000000..7641017c434ef
--- /dev/null
+++ b/test/CodeGen/PowerPC/vrspill.ll
@@ -0,0 +1,19 @@
+; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs  < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs < %s | FileCheck %s
+
+; This verifies that we generate correct spill/reload code for vector regs.
+
+define void @addrtaken(i32 %i, <4 x float> %w) nounwind {
+entry:
+  %i.addr = alloca i32, align 4
+  %w.addr = alloca <4 x float>, align 16
+  store i32 %i, i32* %i.addr, align 4
+  store <4 x float> %w, <4 x float>* %w.addr, align 16
+  call void @foo(i32* %i.addr)
+  ret void
+}
+
+; CHECK: stvx 2, 0, 0
+; CHECK: lvx 2, 0, 0
+
+declare void @foo(i32*)
diff --git a/test/CodeGen/SPARC/2011-01-11-CC.ll b/test/CodeGen/SPARC/2011-01-11-CC.ll
index 3ceda958de6e4..f676fd836947b 100755
--- a/test/CodeGen/SPARC/2011-01-11-CC.ll
+++ b/test/CodeGen/SPARC/2011-01-11-CC.ll
@@ -54,7 +54,7 @@ entry:
 ; V8: {{be|bne}}
 ; V9: test_select_dfp_icc
 ; V9: subcc
-; V9=NOT: {{be|bne}}
+; V9-NOT: {{be|bne}}
 ; V9: fmovd{{e|ne}} %icc
   %0 = icmp eq i32 %a, 0
   %1 = select i1 %0, double %f1, double %f2
diff --git a/test/CodeGen/Thumb2/buildvector-crash.ll b/test/CodeGen/Thumb2/buildvector-crash.ll
index 01ef472d31046..ce42f4b3773d8 100644
--- a/test/CodeGen/Thumb2/buildvector-crash.ll
+++ b/test/CodeGen/Thumb2/buildvector-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
 ; Formerly crashed, 3573915.
 
 define void @RotateStarsFP_Vec() nounwind {
@@ -13,5 +13,5 @@ bb8:                                              ; preds = %bb8, %bb.nph372
   store <4 x float> %3, <4 x float>* undef, align 4
   br label %bb8
 ; CHECK: RotateStarsFP_Vec:
-; CHECK: vldmia
+; CHECK: vld1.64
 }
diff --git a/test/CodeGen/Thumb2/carry.ll b/test/CodeGen/Thumb2/carry.ll
index de6f6e260de3d..85b4370fa5997 100644
--- a/test/CodeGen/Thumb2/carry.ll
+++ b/test/CodeGen/Thumb2/carry.ll
@@ -20,3 +20,16 @@ entry:
 	%tmp2 = sub i64 %tmp1, %b
 	ret i64 %tmp2
 }
+
+; rdar://12559385
+define i64 @f3(i32 %vi) {
+entry:
+; CHECK: f3:
+; CHECK: movw [[REG:r[0-9]+]], #36102
+; CHECK: sbcs r{{[0-9]+}}, [[REG]]
+    %v0 = zext i32 %vi to i64
+    %v1 = xor i64 %v0, -155057456198619
+    %v4 = add i64 %v1, 155057456198619
+    %v5 = add i64 %v4, %v1
+    ret i64 %v5
+}
diff --git a/test/CodeGen/Thumb2/cortex-fp.ll b/test/CodeGen/Thumb2/cortex-fp.ll
index d06f8a7beeb01..b7df2fbf546cd 100644
--- a/test/CodeGen/Thumb2/cortex-fp.ll
+++ b/test/CodeGen/Thumb2/cortex-fp.ll
@@ -7,8 +7,8 @@ define float @foo(float %a, float %b) {
 entry:
 ; CHECK: foo
 ; CORTEXM3: blx ___mulsf3
-; CORTEXM4: vmul.f32  s0, s1, s0
-; CORTEXA8: vmul.f32  d0, d1, d0
+; CORTEXM4: vmul.f32  s0, s2, s0
+; CORTEXA8: vmul.f32  d
   %0 = fmul float %a, %b
   ret float %0
 }
@@ -19,6 +19,6 @@ entry:
   %0 = fmul double %a, %b
 ; CORTEXM3: blx ___muldf3
 ; CORTEXM4: blx ___muldf3
-; CORTEXA8: vmul.f64  d16, d17, d16
+; CORTEXA8: vmul.f64  d
   ret double %0
 }
diff --git a/test/CodeGen/Thumb2/div.ll b/test/CodeGen/Thumb2/div.ll
index 2c00c70c0db69..f89746a303277 100644
--- a/test/CodeGen/Thumb2/div.ll
+++ b/test/CodeGen/Thumb2/div.ll
@@ -2,6 +2,8 @@
 ; RUN:    | FileCheck %s -check-prefix=CHECK-THUMB
 ; RUN: llc < %s -march=thumb -mcpu=cortex-m3 -mattr=+thumb2 \
 ; RUN:    | FileCheck %s -check-prefix=CHECK-THUMBV7M
+; RUN: llc < %s -march=thumb -mcpu=swift \
+; RUN:    | FileCheck %s -check-prefix=CHECK-SWIFT-T2
 
 define i32 @f1(i32 %a, i32 %b) {
 entry:
@@ -9,6 +11,8 @@ entry:
 ; CHECK-THUMB: __divsi3
 ; CHECK-THUMBV7M: f1
 ; CHECK-THUMBV7M: sdiv
+; CHECK-SWIFT-T2: f1
+; CHECK-SWIFT-T2: sdiv
         %tmp1 = sdiv i32 %a, %b         ; <i32> [#uses=1]
         ret i32 %tmp1
 }
@@ -19,6 +23,8 @@ entry:
 ; CHECK-THUMB: __udivsi3
 ; CHECK-THUMBV7M: f2
 ; CHECK-THUMBV7M: udiv
+; CHECK-SWIFT-T2: f2
+; CHECK-SWIFT-T2: udiv
         %tmp1 = udiv i32 %a, %b         ; <i32> [#uses=1]
         ret i32 %tmp1
 }
@@ -29,6 +35,8 @@ entry:
 ; CHECK-THUMB: __modsi3
 ; CHECK-THUMBV7M: f3
 ; CHECK-THUMBV7M: sdiv
+; CHECK-SWIFT-T2: f3
+; CHECK-SWIFT-T2: sdiv
         %tmp1 = srem i32 %a, %b         ; <i32> [#uses=1]
         ret i32 %tmp1
 }
@@ -39,6 +47,8 @@ entry:
 ; CHECK-THUMB: __umodsi3
 ; CHECK-THUMBV7M: f4
 ; CHECK-THUMBV7M: udiv
+; CHECK-SWIFT-T2: f4
+; CHECK-SWIFT-T2: udiv
         %tmp1 = urem i32 %a, %b         ; <i32> [#uses=1]
         ret i32 %tmp1
 }
diff --git a/test/CodeGen/Thumb2/longMACt.ll b/test/CodeGen/Thumb2/longMACt.ll
new file mode 100644
index 0000000000000..beefd6044cf47
--- /dev/null
+++ b/test/CodeGen/Thumb2/longMACt.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; Check generated signed and unsigned multiply accumulate long.
+
+define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest1:
+;CHECK: umlal
+  %conv = zext i32 %a to i64
+  %conv1 = zext i32 %b to i64
+  %mul = mul i64 %conv1, %conv
+  %add = add i64 %mul, %c
+  ret i64 %add
+}
+
+define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c)  {
+;CHECK: MACLongTest2:
+;CHECK: smlal
+  %conv = sext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, %c
+  ret i64 %add
+}
+
+define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest3:
+;CHECK: umlal
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %a to i64
+  %mul = mul i64 %conv, %conv1
+  %conv2 = zext i32 %c to i64
+  %add = add i64 %mul, %conv2
+  ret i64 %add
+}
+
+define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest4:
+;CHECK: smlal
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %a to i64
+  %mul = mul nsw i64 %conv, %conv1
+  %conv2 = sext i32 %c to i64
+  %add = add nsw i64 %mul, %conv2
+  ret i64 %add
+}
diff --git a/test/CodeGen/Thumb2/thumb2-mla.ll b/test/CodeGen/Thumb2/thumb2-mla.ll
index c4cc749ea5c7a..594d9742b0f94 100644
--- a/test/CodeGen/Thumb2/thumb2-mla.ll
+++ b/test/CodeGen/Thumb2/thumb2-mla.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
 
 define i32 @f1(i32 %a, i32 %b, i32 %c) {
     %tmp1 = mul i32 %a, %b
@@ -7,6 +8,9 @@ define i32 @f1(i32 %a, i32 %b, i32 %c) {
 }
 ; CHECK: f1:
 ; CHECK: 	mla	r0, r0, r1, r2
+; NO_MULOPS: f1:
+; NO_MULOPS: muls r0, r1, r0
+; NO_MULOPS-NEXT: add r0, r2
 
 define i32 @f2(i32 %a, i32 %b, i32 %c) {
     %tmp1 = mul i32 %a, %b
@@ -15,3 +19,6 @@ define i32 @f2(i32 %a, i32 %b, i32 %c) {
 }
 ; CHECK: f2:
 ; CHECK: 	mla	r0, r0, r1, r2
+; NO_MULOPS: f2:
+; NO_MULOPS: muls r0, r1, r0
+; NO_MULOPS-NEXT: add r0, r2
diff --git a/test/CodeGen/Thumb2/thumb2-select_xform.ll b/test/CodeGen/Thumb2/thumb2-select_xform.ll
index ead198f21624e..ed4d26d746cbc 100644
--- a/test/CodeGen/Thumb2/thumb2-select_xform.ll
+++ b/test/CodeGen/Thumb2/thumb2-select_xform.ll
@@ -5,7 +5,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK: mvn r0, #-2147483648
 ; CHECK: cmp r2, #10
 ; CHECK: it  le
-; CHECK: addle.w r1, r1, r0
+; CHECK: addle r1, r0
 ; CHECK: mov r0, r1
         %tmp1 = icmp sgt i32 %c, 10
         %tmp2 = select i1 %tmp1, i32 0, i32 2147483647
@@ -30,7 +30,7 @@ define i32 @t3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
 ; CHECK: t3
 ; CHECK: cmp r2, #10
 ; CHECK: it  le
-; CHECK: suble.w r1, r1, #10
+; CHECK: suble r1, #10
 ; CHECK: mov r0, r1
         %tmp1 = icmp sgt i32 %c, 10
         %tmp2 = select i1 %tmp1, i32 0, i32 10
diff --git a/test/CodeGen/Thumb2/thumb2-smla.ll b/test/CodeGen/Thumb2/thumb2-smla.ll
index c128eccd662ff..aaaedfa42e741 100644
--- a/test/CodeGen/Thumb2/thumb2-smla.ll
+++ b/test/CodeGen/Thumb2/thumb2-smla.ll
@@ -1,8 +1,12 @@
 ; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
 
 define i32 @f3(i32 %a, i16 %x, i32 %y) {
 ; CHECK: f3
 ; CHECK: smlabt r0, r1, r2, r0
+; NO_MULOPS: f3
+; NO_MULOPS: smultb r1, r2, r1
+; NO_MULOPS-NEXT: add r0, r1
         %tmp = sext i16 %x to i32               ; <i32> [#uses=1]
         %tmp2 = ashr i32 %y, 16         ; <i32> [#uses=1]
         %tmp3 = mul i32 %tmp2, %tmp             ; <i32> [#uses=1]
diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll
index 35914b16790a1..2074f98cb608c 100644
--- a/test/CodeGen/Thumb2/thumb2-uxtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll
@@ -128,9 +128,9 @@ define i32 @test10(i32 %p0) {
 
 ; ARMv7M: test10
 ; ARMv7M: mov.w r1, #16253176
-; ARMv7M: mov.w r2, #458759
 ; ARMv7M: and.w r0, r1, r0, lsr #7
-; ARMv7M: and.w r1, r2, r0, lsr #5
+; ARMv7M: mov.w r1, #458759
+; ARMv7M: and.w r1, r1, r0, lsr #5
 ; ARMv7M: orrs r0, r1
 	%tmp1 = lshr i32 %p0, 7		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 16253176		; <i32> [#uses=2]
diff --git a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
index 8b55bd79aaa5f..3d058bc28965c 100644
--- a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
+++ b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
 ; rdar://r7512579
 
 ; PHI defs in the atomic loop should be used by the add / adc
@@ -7,17 +7,16 @@
 define void @t(i64* nocapture %p) nounwind ssp {
 entry:
 ; CHECK: t:
-; CHECK: movl $1
-; CHECK: movl (%ebp), %eax
-; CHECK: movl 4(%ebp), %edx
+; CHECK: movl ([[REG:%[a-z]+]]), %eax
+; CHECK: movl 4([[REG]]), %edx
 ; CHECK: LBB0_1:
-; CHECK-NOT: movl $1
-; CHECK-NOT: movl $0
-; CHECK: addl
-; CHECK: adcl
+; CHECK: movl %eax, %ebx
+; CHECK: addl {{%[a-z]+}}, %ebx
+; CHECK: movl %edx, %ecx
+; CHECK: adcl {{%[a-z]+}}, %ecx
 ; CHECK: lock
-; CHECK: cmpxchg8b
-; CHECK: jne
+; CHECK-NEXT: cmpxchg8b ([[REG]])
+; CHECK-NEXT: jne
   %0 = atomicrmw add i64* %p, i64 1 seq_cst
   ret void
 }
diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll
index 8a3ccc8dfda50..3ce7db6e41383 100644
--- a/test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -2,8 +2,8 @@
 
 ;CHECK: vcast
 define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
-;CHECK: pshufd
-;CHECK: pshufd
+;CHECK: pmovzxdq
+;CHECK: pmovzxdq
   %af = bitcast <2 x float> %a to <2 x i32>
   %bf = bitcast <2 x float> %b to <2 x i32>
   %x = sub <2 x i32> %af, %bf
diff --git a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
index fec17e9f4acab..c4b307e5a5d32 100644
--- a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
+++ b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
@@ -4,7 +4,7 @@
 define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone {
 entry:
   %out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: shufb
+; CHECK: pmovzxbd
   ret <4 x i8> %out
 ; CHECK: ret
 }
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9a66b670c7af4..04659522d3604 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,7 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK: func:
-;CHECK: vpxor
+;CHECK: vxorps
 ;CHECK: vinsertf128
 ;CHECK: vpshufd
 ;CHECK: vpshufd
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index 906b748fa4208..4abdded38d8cc 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -3,7 +3,7 @@
 ; CHECK: load_store
 define void @load_store(<4 x i16>* %in) {
 entry:
-; CHECK: movsd
+; CHECK: pmovzxwd
   %A27 = load <4 x i16>* %in, align 4
   %A28 = add <4 x i16> %A27, %A27
 ; CHECK: movlpd
@@ -27,6 +27,6 @@ define <2 x i32> @load_64(<2 x i32>* %ptr) {
 BB:
   %t = load <2 x i32>* %ptr
   ret <2 x i32> %t
-;CHECK: movsd
+;CHECK: pmovzxdq
 ;CHECK: ret
 }
diff --git a/test/CodeGen/X86/2012-08-16-setcc.ll b/test/CodeGen/X86/2012-08-16-setcc.ll
new file mode 100644
index 0000000000000..ed511567c32b4
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-16-setcc.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+
+; rdar://12081007
+
+; CHECK: and_1:
+; CHECK: andb
+; CHECK-NEXT: cmovnel
+; CHECK: ret
+define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+  %1 = and i8 %b, %a
+  %2 = icmp ne i8 %1, 0
+  %3 = select i1 %2, i32 %x, i32 0
+  ret i32 %3
+}
+
+; CHECK: and_2:
+; CHECK: andb
+; CHECK-NEXT: setne
+; CHECK: ret
+define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
+  %1 = and i8 %b, %a
+  %2 = icmp ne i8 %1, 0
+  ret i1 %2
+}
+
+; CHECK: xor_1:
+; CHECK: xorb
+; CHECK-NEXT: cmovnel
+; CHECK: ret
+define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+  %1 = xor i8 %b, %a
+  %2 = icmp ne i8 %1, 0
+  %3 = select i1 %2, i32 %x, i32 0
+  ret i32 %3
+}
+
+; CHECK: xor_2:
+; CHECK: xorb
+; CHECK-NEXT: setne
+; CHECK: ret
+define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) {
+  %1 = xor i8 %b, %a
+  %2 = icmp ne i8 %1, 0
+  ret i1 %2
+}
diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
new file mode 100644
index 0000000000000..6ebbb2e97d139
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -enable-unsafe-fp-math
+; <rdar://problem/12180135>
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+define i32 @foo(float %mean) nounwind readnone ssp align 2 {
+entry:
+  %cmp = fcmp olt float %mean, -3.000000e+00
+  %f.0 = select i1 %cmp, float -3.000000e+00, float %mean
+  %cmp2 = fcmp ult float %f.0, 3.000000e+00
+  %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000
+  %add = fadd float %f.1, 3.000000e+00
+  %div = fdiv float %add, 2.343750e-02
+  %0 = fpext float %div to double
+  %conv = select i1 undef, double 2.550000e+02, double %0
+  %add8 = fadd double %conv, 5.000000e-01
+  %conv9 = fptosi double %add8 to i32
+  %.conv9 = select i1 undef, i32 255, i32 %conv9
+  ret i32 %.conv9
+}
diff --git a/test/CodeGen/X86/2012-09-13-dagco-fneg.ll b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
new file mode 100644
index 0000000000000..7b9bab97be6f6
--- /dev/null
+++ b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: foo
+; Make sure we are not trying to use scalar xor on the high bits of the vector.
+; CHECK-NOT: xorq
+; CHECK: xorl
+; CHECK-NEXT: ret
+
+define i32 @foo() {
+bb:
+  %tmp44.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00>
+  %0 = bitcast <4 x float> %tmp44.i to i128
+  %1 = zext i128 %0 to i512
+  %2 = shl nuw nsw i512 %1, 256
+  %ins = or i512 %2, 3325764857622480139933400731976840738652108318779753826115024029985671937147149347761402413803120180680770390816681124225944317364750115981129923635970048
+  store i512 %ins, i512* undef, align 64
+  ret i32 0
+}
diff --git a/test/CodeGen/X86/2012-09-28-CGPBug.ll b/test/CodeGen/X86/2012-09-28-CGPBug.ll
new file mode 100644
index 0000000000000..32d7d012dd147
--- /dev/null
+++ b/test/CodeGen/X86/2012-09-28-CGPBug.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=i386-apple-macosx < %s | FileCheck %s
+; rdar://12396696
+
+@JT = global [4 x i32] [i32 sub (i32 ptrtoint (i8* blockaddress(@h, %18) to i32), i32 ptrtoint (i8* blockaddress(@h, %11) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %17) to i32), i32 ptrtoint (i8* blockaddress(@h, %11) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %22) to i32), i32 ptrtoint (i8* blockaddress(@h, %18) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %22) to i32), i32 ptrtoint (i8* blockaddress(@h, %17) to i32))]
+@gGlobalLock = external global i8*
+@.str40 = external global [35 x i8]
+
+; CHECK: _JT:
+; CHECK-NOT: .long Ltmp{{[0-9]+}}-1
+; CHECK-NOT: .long 1-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+
+define void @h(i8*) nounwind ssp {
+  %2 = alloca i8*
+  store i8* %0, i8** %2
+  %3 = load i8** %2
+  %4 = bitcast i8* %3 to { i32, i32 }*
+  %5 = getelementptr { i32, i32 }* %4, i32 0, i32 0
+  %6 = load i32* %5
+  %7 = srem i32 %6, 2
+  %8 = icmp slt i32 %6, 2
+  %9 = select i1 %8, i32 %6, i32 %7
+  %10 = icmp eq i32 %9, 0
+  br label %11
+
+; <label>:11                                      ; preds = %1
+  %12 = zext i1 %10 to i32
+  %13 = getelementptr [4 x i32]* @JT, i32 0, i32 %12
+  %14 = load i32* %13
+  %15 = add i32 %14, ptrtoint (i8* blockaddress(@h, %11) to i32)
+  %16 = inttoptr i32 %15 to i8*
+  indirectbr i8* %16, [label %17, label %18]
+
+; <label>:17                                      ; preds = %11
+  tail call void (i8*, ...)* @g(i8* getelementptr inbounds ([35 x i8]* @.str40, i32 0, i32 0))
+  br label %22
+
+; <label>:18                                      ; preds = %11
+  %19 = call i32 @f(i32 -1037694186) nounwind
+  %20 = inttoptr i32 %19 to i32 (i8**)*
+  %21 = tail call i32 %20(i8** @gGlobalLock)
+  br label %22
+
+; <label>:22                                      ; preds = %18, %17
+  ret void
+}
+
+declare i32 @f(i32)
+
+declare void @g(i8*, ...)
diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
new file mode 100644
index 0000000000000..8d914db3315f6
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s
+; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s
+
+; rdar://12393897
+
+%TRp = type { i32, %TRH*, i32, i32 }
+%TRH = type { i8*, i8*, i8*, i8*, {}* }
+
+define i32 @t(%TRp* inreg %rp) nounwind optsize ssp {
+entry:
+  %handler = getelementptr inbounds %TRp* %rp, i32 0, i32 1
+  %0 = load %TRH** %handler, align 4
+  %sync = getelementptr inbounds %TRH* %0, i32 0, i32 4
+  %sync12 = load {}** %sync, align 4
+  %1 = bitcast {}* %sync12 to i32 (%TRp*)*
+  %call = tail call i32 %1(%TRp* inreg %rp) nounwind optsize
+  ret i32 %call
+}
+
+%btConeShape = type { %btConvexInternalShape, float, float, float, [3 x i32] }
+%btConvexInternalShape = type { %btConvexShape, %btVector, %btVector, float, float }
+%btConvexShape = type { %btCollisionShape }
+%btCollisionShape = type { i32 (...)**, i32, i8* }
+%btVector = type { [4 x float] }
+
+define { <2 x float>, <2 x float> } @t2(%btConeShape* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+  %0 = getelementptr inbounds %btConeShape* %this, i64 0, i32 0
+  br i1 undef, label %if.then, label %if.end17
+
+if.then:                                          ; preds = %entry
+  %vecnorm.sroa.2.8.copyload = load float* undef, align 4
+  %cmp4 = fcmp olt float undef, 0x3D10000000000000
+  %vecnorm.sroa.2.8.copyload36 = select i1 %cmp4, float -1.000000e+00, float %vecnorm.sroa.2.8.copyload
+  %call.i.i.i = tail call float @sqrtf(float 0.000000e+00) nounwind readnone
+  %div.i.i = fdiv float 1.000000e+00, %call.i.i.i
+  %mul7.i.i.i = fmul float %div.i.i, %vecnorm.sroa.2.8.copyload36
+  %1 = load float (%btConvexInternalShape*)** undef, align 8
+  %call12 = tail call float %1(%btConvexInternalShape* %0)
+  %mul7.i.i = fmul float %call12, %mul7.i.i.i
+  %retval.sroa.0.4.insert = insertelement <2 x float> zeroinitializer, float undef, i32 1
+  %add13.i = fadd float undef, %mul7.i.i
+  %retval.sroa.1.8.insert = insertelement <2 x float> undef, float %add13.i, i32 0
+  br label %if.end17
+
+if.end17:                                         ; preds = %if.then, %entry
+  %retval.sroa.1.8.load3338 = phi <2 x float> [ %retval.sroa.1.8.insert, %if.then ], [ undef, %entry ]
+  %retval.sroa.0.0.load3137 = phi <2 x float> [ %retval.sroa.0.4.insert, %if.then ], [ undef, %entry ]
+  ret { <2 x float>, <2 x float> } undef
+}
+
+declare float @sqrtf(float) nounwind readnone
diff --git a/test/CodeGen/X86/2012-10-03-DAGCycle.ll b/test/CodeGen/X86/2012-10-03-DAGCycle.ll
new file mode 100644
index 0000000000000..72083c7115e43
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-03-DAGCycle.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=corei7 < %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.pluto.0 = type { %struct.bar.1, %struct.hoge.368* }
+%struct.bar.1 = type { %i8* }
+%i8 = type { i8 }
+%struct.hoge.368 = type { i32, i32 }
+%struct.widget.375 = type { i32, i32, %i8*, %struct.hoge.368* }
+
+define fastcc void @bar(%struct.pluto.0* %arg) nounwind uwtable ssp align 2 {
+bb:
+  %tmp1 = alloca %struct.widget.375, align 8
+  %tmp2 = getelementptr inbounds %struct.pluto.0* %arg, i64 0, i32 1
+  %tmp3 = load %struct.hoge.368** %tmp2, align 8
+  store %struct.pluto.0* %arg, %struct.pluto.0** undef, align 8
+  %tmp = getelementptr inbounds %struct.widget.375* %tmp1, i64 0, i32 2
+  %tmp4 = getelementptr %struct.pluto.0* %arg, i64 0, i32 0, i32 0
+  %tmp5 = load %i8** %tmp4, align 8
+  store %i8* %tmp5, %i8** %tmp, align 8
+  %tmp6 = getelementptr inbounds %struct.widget.375* %tmp1, i64 0, i32 3
+  store %struct.hoge.368* %tmp3, %struct.hoge.368** %tmp6, align 8
+  br i1 undef, label %bb8, label %bb7
+
+bb7:                                              ; preds = %bb
+  unreachable
+
+bb8:                                              ; preds = %bb
+  unreachable
+}
diff --git a/test/CodeGen/X86/2012-10-18-crash-dagco.ll b/test/CodeGen/X86/2012-10-18-crash-dagco.ll
new file mode 100644
index 0000000000000..5b98624a37b8b
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-18-crash-dagco.ll
@@ -0,0 +1,61 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -disable-cgp-select2branch < %s
+
+; We should not crash on this test.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin9.0.0"
+
+@global = external constant [411 x i8], align 1
+
+define void @snork() nounwind {
+bb:
+  br i1 undef, label %bb26, label %bb27
+
+bb26:                                             ; preds = %bb48, %bb26, %bb
+  switch i32 undef, label %bb26 [
+    i32 142771596, label %bb28
+  ]
+
+bb27:                                             ; preds = %bb48, %bb
+  switch i32 undef, label %bb49 [
+    i32 142771596, label %bb28
+  ]
+
+bb28:                                             ; preds = %bb27, %bb26
+  %tmp = load i32* null
+  %tmp29 = trunc i32 %tmp to i8
+  store i8* undef, i8** undef
+  %tmp30 = load i32* null
+  %tmp31 = icmp eq i32 %tmp30, 0
+  %tmp32 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 undef
+  %tmp33 = load i8* %tmp32, align 1
+  %tmp34 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 0
+  %tmp35 = load i8* %tmp34, align 1
+  %tmp36 = select i1 %tmp31, i8 %tmp35, i8 %tmp33
+  %tmp37 = select i1 undef, i8 %tmp29, i8 %tmp36
+  %tmp38 = zext i8 %tmp37 to i32
+  %tmp39 = select i1 undef, i32 0, i32 %tmp38
+  %tmp40 = getelementptr inbounds i32* null, i32 %tmp39
+  %tmp41 = load i32* %tmp40, align 4
+  %tmp42 = load i32* undef, align 4
+  %tmp43 = load i32* undef
+  %tmp44 = xor i32 %tmp42, %tmp43
+  %tmp45 = lshr i32 %tmp44, 8
+  %tmp46 = lshr i32 %tmp44, 7
+  call void @spam()
+  unreachable
+
+bb47:                                             ; No predecessors!
+  ret void
+
+bb48:                                             ; No predecessors!
+  br i1 undef, label %bb27, label %bb26
+
+bb49:                                             ; preds = %bb49, %bb27
+  br label %bb49
+
+bb50:                                             ; preds = %bb50
+  br label %bb50
+}
+
+declare void @spam() noreturn nounwind
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
new file mode 100644
index 0000000000000..64825bac97190
--- /dev/null
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -0,0 +1,305 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
+
+; CHECK: merge_const_store
+; save 1,2,3 ... as one big integer.
+; CHECK: movabsq $578437695752307201
+; CHECK: ret
+define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 1, i8* %2, align 1
+  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  store i8 2, i8* %3, align 1
+  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+  store i8 3, i8* %4, align 1
+  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+  store i8 4, i8* %5, align 1
+  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+  store i8 5, i8* %6, align 1
+  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+  store i8 6, i8* %7, align 1
+  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+  store i8 7, i8* %8, align 1
+  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+  store i8 8, i8* %9, align 1
+  %10 = add nsw i32 %i.02, 1
+  %11 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %10, %count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+  ret void
+}
+
+; Move the constants using a single vector store.
+; CHECK: merge_const_store_vec
+; CHECK: vmovups  %ymm0, (%rsi)
+; CHECK: ret
+define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
+  %2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+  store i32 0, i32* %2, align 4
+  %3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+  store i32 0, i32* %3, align 4
+  %4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+  store i32 0, i32* %4, align 4
+  %5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+  store i32 0, i32* %5, align 4
+  %6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4
+  store i32 0, i32* %6, align 4
+  %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5
+  store i32 0, i32* %7, align 4
+  %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 6
+  store i32 0, i32* %8, align 4
+  %9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7
+  store i32 0, i32* %9, align 4
+  %10 = add nsw i32 %i.02, 1
+  %11 = getelementptr inbounds %struct.B* %.01, i64 1
+  %exitcond = icmp eq i32 %10, %count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+  ret void
+}
+
+; Store the first 4 constants as a single 32-bit integer. Store the rest as individual bytes.
+; CHECK: merge_nonconst_store
+; CHECK: movl $67305985
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: ret
+define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 1, i8* %2, align 1
+  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  store i8 2, i8* %3, align 1
+  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+  store i8 3, i8* %4, align 1
+  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+  store i8 4, i8* %5, align 1
+  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+  store i8 %zz, i8* %6, align 1                     ;  <----------- Not a const;
+  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+  store i8 6, i8* %7, align 1
+  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+  store i8 7, i8* %8, align 1
+  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+  store i8 8, i8* %9, align 1
+  %10 = add nsw i32 %i.02, 1
+  %11 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %10, %count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+  ret void
+}
+
+
+;CHECK: merge_loads_i16
+; load:
+;CHECK: movw
+; store:
+;CHECK: movw
+;CHECK: ret
+define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+  br label %4
+
+; <label>:4                                       ; preds = %4, %.lr.ph
+  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
+  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
+  %5 = load i8* %2, align 1
+  %6 = load i8* %3, align 1
+  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 %5, i8* %7, align 1
+  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  store i8 %6, i8* %8, align 1
+  %9 = add nsw i32 %i.02, 1
+  %10 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %9, %count
+  br i1 %exitcond, label %._crit_edge, label %4
+
+._crit_edge:                                      ; preds = %4, %0
+  ret void
+}
+
+; The loads and the stores are interleaved. Can't merge them.
+;CHECK: no_merge_loads
+;CHECK: movb
+;CHECK: movb
+;CHECK: movb
+;CHECK: movb
+;CHECK: ret
+define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+  br label %a4
+
+a4:                                       ; preds = %a4, %.lr.ph
+  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
+  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
+  %a5 = load i8* %2, align 1
+  %a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 %a5, i8* %a7, align 1
+  %a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  %a6 = load i8* %3, align 1
+  store i8 %a6, i8* %a8, align 1
+  %a9 = add nsw i32 %i.02, 1
+  %a10 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %a9, %count
+  br i1 %exitcond, label %._crit_edge, label %a4
+
+._crit_edge:                                      ; preds = %a4, %0
+  ret void
+}
+
+
+;CHECK: merge_loads_integer
+; load:
+;CHECK: movq
+; store:
+;CHECK: movq
+;CHECK: ret
+define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+  %3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+  br label %4
+
+; <label>:4                                       ; preds = %4, %.lr.ph
+  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
+  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
+  %5 = load i32* %2
+  %6 = load i32* %3
+  %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+  store i32 %5, i32* %7
+  %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+  store i32 %6, i32* %8
+  %9 = add nsw i32 %i.02, 1
+  %10 = getelementptr inbounds %struct.B* %.01, i64 1
+  %exitcond = icmp eq i32 %9, %count
+  br i1 %exitcond, label %._crit_edge, label %4
+
+._crit_edge:                                      ; preds = %4, %0
+  ret void
+}
+
+
+;CHECK: merge_loads_vector
+; load:
+;CHECK: movups
+; store:
+;CHECK: movups
+;CHECK: ret
+define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+  %a1 = icmp sgt i32 %count, 0
+  br i1 %a1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+  %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+  %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2
+  %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3
+  br label %block4
+
+block4:                                       ; preds = %block4, %.lr.ph
+  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
+  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
+  %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+  %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+  %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+  %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+  %b1 = load i32* %a2
+  %b2 = load i32* %a3
+  %b3 = load i32* %a4
+  %b4 = load i32* %a5
+  store i32 %b1, i32* %a7
+  store i32 %b2, i32* %a8
+  store i32 %b3, i32* %a9
+  store i32 %b4, i32* %a10
+  %c9 = add nsw i32 %i.02, 1
+  %c10 = getelementptr inbounds %struct.B* %.01, i64 1
+  %exitcond = icmp eq i32 %c9, %count
+  br i1 %exitcond, label %._crit_edge, label %block4
+
+._crit_edge:                                      ; preds = %block4, %0
+  ret void
+}
+
+;CHECK: merge_loads_no_align
+; load:
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+; store:
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: ret
+define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+  %a1 = icmp sgt i32 %count, 0
+  br i1 %a1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+  %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+  %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2
+  %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3
+  br label %block4
+
+block4:                                       ; preds = %block4, %.lr.ph
+  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
+  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
+  %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+  %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+  %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+  %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+  %b1 = load i32* %a2, align 1
+  %b2 = load i32* %a3, align 1
+  %b3 = load i32* %a4, align 1
+  %b4 = load i32* %a5, align 1
+  store i32 %b1, i32* %a7, align 1
+  store i32 %b2, i32* %a8, align 1
+  store i32 %b3, i32* %a9, align 1
+  store i32 %b4, i32* %a10, align 1
+  %c9 = add nsw i32 %i.02, 1
+  %c10 = getelementptr inbounds %struct.B* %.01, i64 1
+  %exitcond = icmp eq i32 %c9, %count
+  br i1 %exitcond, label %._crit_edge, label %block4
+
+._crit_edge:                                      ; preds = %block4, %0
+  ret void
+}
+
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
new file mode 100644
index 0000000000000..5982544f7a8c7
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s
+
+; Make sure that we don't crash when dbg values are used.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define void @foo() nounwind uwtable ssp {
+entry:
+  %x.i = alloca i8, align 1
+  %y.i = alloca [256 x i8], align 16
+  %0 = getelementptr inbounds [256 x i8]* %y.i, i64 0, i64 0
+  br label %for.body
+
+for.body:
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
+  call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind
+  br label %for.body
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!16 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6}
+!2 = metadata !{i32 0}
+!22 = metadata !{i32 786688, metadata !2, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
new file mode 100644
index 0000000000000..f8ae74f292d22
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -0,0 +1,410 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true  < %s | FileCheck %s --check-prefix=NOCOLOR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;YESCOLOR: subq  $136, %rsp
+;NOCOLOR: subq  $264, %rsp
+
+define i32 @myCall_w2(i32 %in) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+}
+
+
+;YESCOLOR: subq  $272, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+define i32 @myCall2_no_merge(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  ret i32 %t7
+bb3:
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  ret i32 0
+}
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+define i32 @myCall2_w2(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+;YESCOLOR: subq  $208, %rsp
+;NOCOLOR: subq  $400, %rsp
+
+
+
+
+define i32 @myCall_w4(i32 %in) {
+entry:
+  %a1 = alloca [14 x i8*], align 8
+  %a2 = alloca [13 x i8*], align 8
+  %a3 = alloca [12 x i8*], align 8
+  %a4 = alloca [11 x i8*], align 8
+  %b1 = bitcast [14 x i8*]* %a1 to i8*
+  %b2 = bitcast [13 x i8*]* %a2 to i8*
+  %b3 = bitcast [12 x i8*]* %a3 to i8*
+  %b4 = bitcast [11 x i8*]* %a4 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b4)
+  call void @llvm.lifetime.start(i64 -1, i8* %b1)
+  %t1 = call i32 @foo(i32 %in, i8* %b1)
+  %t2 = call i32 @foo(i32 %in, i8* %b1)
+  call void @llvm.lifetime.end(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t9 = call i32 @foo(i32 %in, i8* %b2)
+  %t8 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+  %t3 = call i32 @foo(i32 %in, i8* %b3)
+  %t4 = call i32 @foo(i32 %in, i8* %b3)
+  call void @llvm.lifetime.end(i64 -1, i8* %b3)
+  %t11 = call i32 @foo(i32 %in, i8* %b4)
+  call void @llvm.lifetime.end(i64 -1, i8* %b4)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+}
+
+;YESCOLOR: subq  $112, %rsp
+;NOCOLOR: subq  $400, %rsp
+
+define i32 @myCall2_w4(i32 %in) {
+entry:
+  %a1 = alloca [14 x i8*], align 8
+  %a2 = alloca [13 x i8*], align 8
+  %a3 = alloca [12 x i8*], align 8
+  %a4 = alloca [11 x i8*], align 8
+  %b1 = bitcast [14 x i8*]* %a1 to i8*
+  %b2 = bitcast [13 x i8*]* %a2 to i8*
+  %b3 = bitcast [12 x i8*]* %a3 to i8*
+  %b4 = bitcast [11 x i8*]* %a4 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b1)
+  %t1 = call i32 @foo(i32 %in, i8* %b1)
+  %t2 = call i32 @foo(i32 %in, i8* %b1)
+  call void @llvm.lifetime.end(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t9 = call i32 @foo(i32 %in, i8* %b2)
+  %t8 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+  %t3 = call i32 @foo(i32 %in, i8* %b3)
+  %t4 = call i32 @foo(i32 %in, i8* %b3)
+  call void @llvm.lifetime.end(i64 -1, i8* %b3)
+  br i1 undef, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b4)
+  %t11 = call i32 @foo(i32 %in, i8* %b4)
+  call void @llvm.lifetime.end(i64 -1, i8* %b4)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+
+define i32 @myCall2_noend(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @myCall2_noend2(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @myCall2_nostart(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+; Adapted from the test in Transforms/Inline/array_merge.ll.
+;YESCOLOR: subq  $816, %rsp
+;NOCOLOR: subq  $1616, %rsp
+define void @array_merge() nounwind ssp {
+entry:
+  %A.i1 = alloca [100 x i32], align 4
+  %B.i2 = alloca [100 x i32], align 4
+  %A.i = alloca [100 x i32], align 4
+  %B.i = alloca [100 x i32], align 4
+  %0 = bitcast [100 x i32]* %A.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  %1 = bitcast [100 x i32]* %B.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  %2 = bitcast [100 x i32]* %A.i1 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+  %3 = bitcast [100 x i32]* %B.i2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+  call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+  ret void
+}
+
+;YESCOLOR: subq  $272, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @func_phi_lifetime(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb0, label %bb1
+
+bb0:
+  %I1 = bitcast [17 x i8*]* %a to i8*
+  br label %bb2
+
+bb1:
+  %I2 = bitcast [16 x i8*]* %a2 to i8*
+  br label %bb2
+
+bb2:
+  %split = phi i8* [ %I1, %bb0 ], [ %I2, %bb1 ]
+  call void @llvm.lifetime.start(i64 -1, i8* %split)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  call void @llvm.lifetime.end(i64 -1, i8* %split)
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+
+;YESCOLOR: multi_region_bb
+;NOCOLOR: multi_region_bb
+define void @multi_region_bb() nounwind ssp {
+entry:
+  %A.i1 = alloca [100 x i32], align 4
+  %B.i2 = alloca [100 x i32], align 4
+  %A.i = alloca [100 x i32], align 4
+  %B.i = alloca [100 x i32], align 4
+  %0 = bitcast [100 x i32]* %A.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #1
+  %1 = bitcast [100 x i32]* %B.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  %2 = bitcast [100 x i32]* %A.i1 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+  %3 = bitcast [100 x i32]* %B.i2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind  ; <---- start #2
+  call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+  ret void
+}
+
+
+;YESCOLOR: subq  $272, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+; Check that we don't assert and crash even when there are allocas
+; outside the declared lifetime regions.
+;YESCOLOR: bad_range
+;NOCOLOR:  bad_range
+define void @bad_range() nounwind ssp {
+entry:
+  %A.i1 = alloca [100 x i32], align 4
+  %B.i2 = alloca [100 x i32], align 4
+  %A.i = alloca [100 x i32], align 4
+  %B.i = alloca [100 x i32], align 4
+  %0 = bitcast [100 x i32]* %A.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  %1 = bitcast [100 x i32]* %B.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  br label %block2
+
+block2:
+  ; I am used outside the marked lifetime.
+  call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+  ret void
+}
+
+
+; Check that we don't assert and crash even when there are usages
+; of allocas which do not read or write outside the declared lifetime regions.
+;YESCOLOR: shady_range
+;NOCOLOR:  shady_range
+
+%struct.Klass = type { i32, i32 }
+
+define i32 @shady_range(i32 %argc, i8** nocapture %argv) uwtable {
+  %a.i = alloca [4 x %struct.Klass], align 16
+  %b.i = alloca [4 x %struct.Klass], align 16
+  %a8 = bitcast [4 x %struct.Klass]* %a.i to i8*
+  %b8 = bitcast [4 x %struct.Klass]* %b.i to i8*
+  ; I am used outside the lifetime zone below:
+  %z2 = getelementptr inbounds [4 x %struct.Klass]* %a.i, i64 0, i64 0, i32 0
+  call void @llvm.lifetime.start(i64 -1, i8* %a8)
+  call void @llvm.lifetime.start(i64 -1, i8* %b8)
+  %z3 = load i32* %z2, align 16
+  %r = call i32 @foo(i32 %z3, i8* %a8)
+  %r2 = call i32 @foo(i32 %z3, i8* %b8)
+  call void @llvm.lifetime.end(i64 -1, i8* %a8)
+  call void @llvm.lifetime.end(i64 -1, i8* %b8)
+  ret i32 9
+}
+
+declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+declare i32 @foo(i32, i8*)
+
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index a4abccba7e689..4e30f2b05a89f 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -30,4 +30,17 @@ entry:
   ret i32 %z.0
 }
 
+; <rdar://problem/12579915>
+define i32 @test3(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
+entry:
+  %cmp = icmp ugt i32 %x, %y
+  %dec = sext i1 %cmp to i32
+  %dec.res = add nsw i32 %dec, %res
+  ret i32 %dec.res
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK: sbbl
+; CHECK: ret
+}
+
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
new file mode 100644
index 0000000000000..e7c9605d3e887
--- /dev/null
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux  | FileCheck %s
+
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+  %result = sdiv i32 %a, %b
+  ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+  %result = srem i32 %a, %b
+  ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind { ; sdiv and srem of the same operands, results summed
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CHECK-NOT: idivl
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, %b
+  %resultrem = srem i32 %a, %b ; same operands as the sdiv above; the negative patterns after "ret" reject any further division
+  %result = add i32 %resultdiv, %resultrem
+  ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+  %resultidiv = sdiv i32 %a, %b
+  %resultdiv = udiv i32 %a, %b
+  %result = add i32 %resultidiv, %resultdiv
+  ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+  %resultdiv = sdiv i32 256, 4
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind { ; constant divisor: no testl/idiv/divb expected (pattern is "testl" because the symbol name itself contains "test")
+; CHECK: test_use_div_reg_imm
+; CHECK-NOT: testl
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, 33
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind { ; constant divisor: no testl/idiv/divb expected (pattern is "testl" because the symbol name itself contains "test")
+; CHECK: test_use_rem_reg_imm
+; CHECK-NOT: testl
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultrem = srem i32 %a, 33
+  ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind { ; constant divisor: no testl/idiv/divb expected (pattern is "testl" because the symbol name itself contains "test")
+; CHECK: test_use_divrem_reg_imm
+; CHECK-NOT: testl
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, 33
+  %resultrem = srem i32 %a, 33
+  %result = add i32 %resultdiv, %resultrem
+  ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+  %resultdiv = sdiv i32 4, %a
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind { ; remainder counterpart of test_use_div_imm_reg: constant dividend, register divisor
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+  %resultrem = srem i32 4, %a
+  ret i32 %resultrem
+}
diff --git a/test/CodeGen/X86/atom-shuf.ll b/test/CodeGen/X86/atom-shuf.ll
new file mode 100644
index 0000000000000..4c3f2f67c54b5
--- /dev/null
+++ b/test/CodeGen/X86/atom-shuf.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=atom | FileCheck %s
+
+define <16 x i8> @foo(<16 x i8> %in) {
+  %r = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> < i32 7, i32 3, i32 2, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %r
+; CHECK: foo
+; CHECK: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll
new file mode 100644
index 0000000000000..e3ef605f7f1cf
--- /dev/null
+++ b/test/CodeGen/X86/atomic-minmax-i6432.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=x86 -mattr=+cmov -mtriple=i386-pc-linux < %s | FileCheck %s -check-prefix=LINUX
+; RUN: llc -march=x86 -mtriple=i386-macosx -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC
+
+@sc64 = external global i64
+
+define void @atomic_maxmin_i6432() {
+; LINUX: atomic_maxmin_i6432
+  %1 = atomicrmw max  i64* @sc64, i64 5 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setl
+; LINUX: cmpl
+; LINUX: setl
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+  %2 = atomicrmw min  i64* @sc64, i64 6 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setg
+; LINUX: cmpl
+; LINUX: setg
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+  %3 = atomicrmw umax i64* @sc64, i64 7 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setb
+; LINUX: cmpl
+; LINUX: setb
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+  %4 = atomicrmw umin i64* @sc64, i64 8 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: seta
+; LINUX: cmpl
+; LINUX: seta
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+  ret void
+}
+
+; rdar://12453106
+@id = internal global i64 0, align 8
+
+define void @tf_bug(i8* %ptr) nounwind {
+; PIC: tf_bug:
+; PIC: movl _id-L1$pb(
+; PIC: movl (_id-L1$pb)+4(
+  %tmp1 = atomicrmw add i64* @id, i64 1 seq_cst
+  %tmp2 = add i64 %tmp1, 1
+  %tmp3 = bitcast i8* %ptr to i64*
+  store i64 %tmp2, i64* %tmp3, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/atomic-pointer.ll b/test/CodeGen/X86/atomic-pointer.ll
new file mode 100644
index 0000000000000..a455277be4dbf
--- /dev/null
+++ b/test/CodeGen/X86/atomic-pointer.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=i686-none-linux | FileCheck %s
+
+define i32* @test_atomic_ptr_load(i32** %a0) {
+; CHECK: test_atomic_ptr_load
+; CHECK: movl
+; CHECK: movl
+; CHECK: ret
+0:
+  %0 = load atomic i32** %a0 seq_cst, align 4
+  ret i32* %0
+}
+
+define void @test_atomic_ptr_store(i32* %a0, i32** %a1) {
+; CHECK: test_atomic_ptr_store
+; CHECK: movl
+; CHECK: movl
+; CHECK: xchgl
+; CHECK: ret
+0:
+  store atomic i32* %a0, i32** %a1 seq_cst, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
new file mode 100644
index 0000000000000..824995d6cb98a
--- /dev/null
+++ b/test/CodeGen/X86/atomic16.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mcpu=corei7 -show-mc-encoding | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc16 = external global i16
+
+define void @atomic_fetch_add16() nounwind {
+; X64:   atomic_fetch_add16
+; X32:   atomic_fetch_add16
+entry:
+; 16-bit atomic adds on @sc16; each atomicrmw is followed by the instructions expected for it
+  %t1 = atomicrmw add  i16* @sc16, i16 1 acquire
+; X64:       lock
+; X64:       incw
+; X32:       lock
+; X32:       incw
+  %t2 = atomicrmw add  i16* @sc16, i16 3 acquire
+; X64:       lock
+; X64:       addw $3, {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       addw $3
+  %t3 = atomicrmw add  i16* @sc16, i16 5 acquire
+; X64:       lock
+; X64:       xaddw {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       xaddw
+  %t4 = atomicrmw add  i16* @sc16, i16 %t3 acquire
+; X64:       lock
+; X64:       addw {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       addw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_sub16() nounwind {
+; X64:   atomic_fetch_sub16
+; X32:   atomic_fetch_sub16
+  %t1 = atomicrmw sub  i16* @sc16, i16 1 acquire
+; X64:       lock
+; X64:       decw
+; X32:       lock
+; X32:       decw
+  %t2 = atomicrmw sub  i16* @sc16, i16 3 acquire
+; X64:       lock
+; X64:       subw $3, {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       subw $3
+  %t3 = atomicrmw sub  i16* @sc16, i16 5 acquire
+; X64:       lock
+; X64:       xaddw {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       xaddw
+  %t4 = atomicrmw sub  i16* @sc16, i16 %t3 acquire
+; X64:       lock
+; X64:       subw {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       subw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_and16() nounwind {
+; X64:   atomic_fetch_and16
+; X32:   atomic_fetch_and16
+  %t1 = atomicrmw and  i16* @sc16, i16 3 acquire
+; X64:       lock
+; X64:       andw $3, {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       andw $3
+  %t2 = atomicrmw and  i16* @sc16, i16 5 acquire
+; X64:       andw
+; X64:       lock
+; X64:       cmpxchgw
+; X32:       andw
+; X32:       lock
+; X32:       cmpxchgw
+  %t3 = atomicrmw and  i16* @sc16, i16 %t2 acquire
+; X64:       lock
+; X64:       andw {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       andw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_or16() nounwind {
+; X64:   atomic_fetch_or16
+; X32:   atomic_fetch_or16
+  %t1 = atomicrmw or   i16* @sc16, i16 3 acquire
+; X64:       lock
+; X64:       orw $3, {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       orw $3
+  %t2 = atomicrmw or   i16* @sc16, i16 5 acquire
+; X64:       orw
+; X64:       lock
+; X64:       cmpxchgw
+; X32:       orw
+; X32:       lock
+; X32:       cmpxchgw
+  %t3 = atomicrmw or   i16* @sc16, i16 %t2 acquire
+; X64:       lock
+; X64:       orw {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       orw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_xor16() nounwind {
+; X64:   atomic_fetch_xor16
+; X32:   atomic_fetch_xor16
+  %t1 = atomicrmw xor  i16* @sc16, i16 3 acquire
+; X64:       lock
+; X64:       xorw $3, {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       xorw $3
+  %t2 = atomicrmw xor  i16* @sc16, i16 5 acquire
+; X64:       xorw
+; X64:       lock
+; X64:       cmpxchgw
+; X32:       xorw
+; X32:       lock
+; X32:       cmpxchgw
+  %t3 = atomicrmw xor  i16* @sc16, i16 %t2 acquire
+; X64:       lock
+; X64:       xorw {{.*}} # encoding: [0xf0,0x66
+; X32:       lock
+; X32:       xorw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_nand16(i16 %x) nounwind {
+; X64:   atomic_fetch_nand16
+; X32:   atomic_fetch_nand16
+  %t1 = atomicrmw nand i16* @sc16, i16 %x acquire
+; X64:       andw
+; X64:       notw
+; X64:       lock
+; X64:       cmpxchgw
+; X32:       andw
+; X32:       notw
+; X32:       lock
+; X32:       cmpxchgw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_max16(i16 %x) nounwind {
+  %t1 = atomicrmw max  i16* @sc16, i16 %x acquire
+; X64:       cmpw
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgw
+
+; X32:       cmpw
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_min16(i16 %x) nounwind {
+  %t1 = atomicrmw min  i16* @sc16, i16 %x acquire
+; X64:       cmpw
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgw
+
+; X32:       cmpw
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_umax16(i16 %x) nounwind {
+  %t1 = atomicrmw umax i16* @sc16, i16 %x acquire
+; X64:       cmpw
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgw
+
+; X32:       cmpw
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_umin16(i16 %x) nounwind {
+  %t1 = atomicrmw umin i16* @sc16, i16 %x acquire
+; X64:       cmpw
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgw
+; X32:       cmpw
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_cmpxchg16() nounwind {
+  %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire
+; X64:       lock
+; X64:       cmpxchgw
+; X32:       lock
+; X32:       cmpxchgw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_store16(i16 %x) nounwind {
+  store atomic i16 %x, i16* @sc16 release, align 4
+; X64-NOT:   lock
+; X64:       movw
+; X32-NOT:   lock
+; X32:       movw
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_swap16(i16 %x) nounwind {
+  %t1 = atomicrmw xchg i16* @sc16, i16 %x acquire
+; X64-NOT:   lock
+; X64:       xchgw
+; X32-NOT:   lock
+; X32:       xchgw
+  ret void
+; X64:       ret
+; X32:       ret
+}
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
new file mode 100644
index 0000000000000..dc927d8cb6f6f
--- /dev/null
+++ b/test/CodeGen/X86/atomic32.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc32 = external global i32
+
+define void @atomic_fetch_add32() nounwind {
+; X64:   atomic_fetch_add32
+; X32:   atomic_fetch_add32
+entry:
+; 32-bit
+  %t1 = atomicrmw add  i32* @sc32, i32 1 acquire
+; X64:       lock
+; X64:       incl
+; X32:       lock
+; X32:       incl
+  %t2 = atomicrmw add  i32* @sc32, i32 3 acquire
+; X64:       lock
+; X64:       addl $3
+; X32:       lock
+; X32:       addl $3
+  %t3 = atomicrmw add  i32* @sc32, i32 5 acquire
+; X64:       lock
+; X64:       xaddl
+; X32:       lock
+; X32:       xaddl
+  %t4 = atomicrmw add  i32* @sc32, i32 %t3 acquire
+; X64:       lock
+; X64:       addl
+; X32:       lock
+; X32:       addl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_sub32() nounwind {
+; X64:   atomic_fetch_sub32
+; X32:   atomic_fetch_sub32
+  %t1 = atomicrmw sub  i32* @sc32, i32 1 acquire
+; X64:       lock
+; X64:       decl
+; X32:       lock
+; X32:       decl
+  %t2 = atomicrmw sub  i32* @sc32, i32 3 acquire
+; X64:       lock
+; X64:       subl $3
+; X32:       lock
+; X32:       subl $3
+  %t3 = atomicrmw sub  i32* @sc32, i32 5 acquire
+; X64:       lock
+; X64:       xaddl
+; X32:       lock
+; X32:       xaddl
+  %t4 = atomicrmw sub  i32* @sc32, i32 %t3 acquire
+; X64:       lock
+; X64:       subl
+; X32:       lock
+; X32:       subl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_and32() nounwind {
+; X64:   atomic_fetch_and32
+; X32:   atomic_fetch_and32
+  %t1 = atomicrmw and  i32* @sc32, i32 3 acquire
+; X64:       lock
+; X64:       andl $3
+; X32:       lock
+; X32:       andl $3
+  %t2 = atomicrmw and  i32* @sc32, i32 5 acquire
+; X64:       andl
+; X64:       lock
+; X64:       cmpxchgl
+; X32:       andl
+; X32:       lock
+; X32:       cmpxchgl
+  %t3 = atomicrmw and  i32* @sc32, i32 %t2 acquire
+; X64:       lock
+; X64:       andl
+; X32:       lock
+; X32:       andl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_or32() nounwind {
+; X64:   atomic_fetch_or32
+; X32:   atomic_fetch_or32
+  %t1 = atomicrmw or   i32* @sc32, i32 3 acquire
+; X64:       lock
+; X64:       orl $3
+; X32:       lock
+; X32:       orl $3
+  %t2 = atomicrmw or   i32* @sc32, i32 5 acquire
+; X64:       orl
+; X64:       lock
+; X64:       cmpxchgl
+; X32:       orl
+; X32:       lock
+; X32:       cmpxchgl
+  %t3 = atomicrmw or   i32* @sc32, i32 %t2 acquire
+; X64:       lock
+; X64:       orl
+; X32:       lock
+; X32:       orl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_xor32() nounwind {
+; X64:   atomic_fetch_xor32
+; X32:   atomic_fetch_xor32
+  %t1 = atomicrmw xor  i32* @sc32, i32 3 acquire
+; X64:       lock
+; X64:       xorl $3
+; X32:       lock
+; X32:       xorl $3
+  %t2 = atomicrmw xor  i32* @sc32, i32 5 acquire
+; X64:       xorl
+; X64:       lock
+; X64:       cmpxchgl
+; X32:       xorl
+; X32:       lock
+; X32:       cmpxchgl
+  %t3 = atomicrmw xor  i32* @sc32, i32 %t2 acquire
+; X64:       lock
+; X64:       xorl
+; X32:       lock
+; X32:       xorl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_nand32(i32 %x) nounwind {
+; X64:   atomic_fetch_nand32
+; X32:   atomic_fetch_nand32
+  %t1 = atomicrmw nand i32* @sc32, i32 %x acquire
+; X64:       andl
+; X64:       notl
+; X64:       lock
+; X64:       cmpxchgl
+; X32:       andl
+; X32:       notl
+; X32:       lock
+; X32:       cmpxchgl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_max32(i32 %x) nounwind {
+  %t1 = atomicrmw max  i32* @sc32, i32 %x acquire
+; X64:       cmpl
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgl
+
+; X32:       cmpl
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_min32(i32 %x) nounwind {
+  %t1 = atomicrmw min  i32* @sc32, i32 %x acquire
+; X64:       cmpl
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgl
+
+; X32:       cmpl
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_umax32(i32 %x) nounwind {
+  %t1 = atomicrmw umax i32* @sc32, i32 %x acquire
+; X64:       cmpl
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgl
+
+; X32:       cmpl
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_umin32(i32 %x) nounwind {
+  %t1 = atomicrmw umin i32* @sc32, i32 %x acquire
+; X64:       cmpl
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgl
+; X32:       cmpl
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_cmpxchg32() nounwind {
+  %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire
+; X64:       lock
+; X64:       cmpxchgl
+; X32:       lock
+; X32:       cmpxchgl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_store32(i32 %x) nounwind {
+  store atomic i32 %x, i32* @sc32 release, align 4
+; X64-NOT:   lock
+; X64:       movl
+; X32-NOT:   lock
+; X32:       movl
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_swap32(i32 %x) nounwind {
+  %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire
+; X64-NOT:   lock
+; X64:       xchgl
+; X32-NOT:   lock
+; X32:       xchgl
+  ret void
+; X64:       ret
+; X32:       ret
+}
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
new file mode 100644
index 0000000000000..45785cc8fe525
--- /dev/null
+++ b/test/CodeGen/X86/atomic64.ll
@@ -0,0 +1,216 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+
+@sc64 = external global i64
+
+define void @atomic_fetch_add64() nounwind {
+; X64:   atomic_fetch_add64
+entry:
+  %t1 = atomicrmw add  i64* @sc64, i64 1 acquire
+; X64:       lock
+; X64:       incq
+  %t2 = atomicrmw add  i64* @sc64, i64 3 acquire
+; X64:       lock
+; X64:       addq $3
+  %t3 = atomicrmw add  i64* @sc64, i64 5 acquire
+; X64:       lock
+; X64:       xaddq
+  %t4 = atomicrmw add  i64* @sc64, i64 %t3 acquire
+; X64:       lock
+; X64:       addq
+  ret void
+; X64:       ret
+}
+
+define void @atomic_fetch_sub64() nounwind {
+; X64:   atomic_fetch_sub64
+  %t1 = atomicrmw sub  i64* @sc64, i64 1 acquire
+; X64:       lock
+; X64:       decq
+  %t2 = atomicrmw sub  i64* @sc64, i64 3 acquire
+; X64:       lock
+; X64:       subq $3
+  %t3 = atomicrmw sub  i64* @sc64, i64 5 acquire
+; X64:       lock
+; X64:       xaddq
+  %t4 = atomicrmw sub  i64* @sc64, i64 %t3 acquire
+; X64:       lock
+; X64:       subq
+  ret void
+; X64:       ret
+}
+
+define void @atomic_fetch_and64() nounwind {
+; X64:   atomic_fetch_and64
+  %t1 = atomicrmw and  i64* @sc64, i64 3 acquire
+; X64:       lock
+; X64:       andq $3
+  %t2 = atomicrmw and  i64* @sc64, i64 5 acquire
+; X64:       andq
+; X64:       lock
+; X64:       cmpxchgq
+  %t3 = atomicrmw and  i64* @sc64, i64 %t2 acquire
+; X64:       lock
+; X64:       andq
+  ret void
+; X64:       ret
+}
+
+define void @atomic_fetch_or64() nounwind {
+; X64:   atomic_fetch_or64
+  %t1 = atomicrmw or   i64* @sc64, i64 3 acquire
+; X64:       lock
+; X64:       orq $3
+  %t2 = atomicrmw or   i64* @sc64, i64 5 acquire
+; X64:       orq
+; X64:       lock
+; X64:       cmpxchgq
+  %t3 = atomicrmw or   i64* @sc64, i64 %t2 acquire
+; X64:       lock
+; X64:       orq
+  ret void
+; X64:       ret
+}
+
+define void @atomic_fetch_xor64() nounwind {
+; X64:   atomic_fetch_xor64
+  %t1 = atomicrmw xor  i64* @sc64, i64 3 acquire
+; X64:       lock
+; X64:       xorq $3
+  %t2 = atomicrmw xor  i64* @sc64, i64 5 acquire
+; X64:       xorq
+; X64:       lock
+; X64:       cmpxchgq
+  %t3 = atomicrmw xor  i64* @sc64, i64 %t2 acquire
+; X64:       lock
+; X64:       xorq
+  ret void
+; X64:       ret
+}
+
+define void @atomic_fetch_nand64(i64 %x) nounwind {
+; X64:   atomic_fetch_nand64
+; X32:   atomic_fetch_nand64
+  %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
+; X64:       andq
+; X64:       notq
+; X64:       lock
+; X64:       cmpxchgq
+; X32:       andl
+; X32:       andl
+; X32:       notl
+; X32:       notl
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_max64(i64 %x) nounwind {
+  %t1 = atomicrmw max  i64* @sc64, i64 %x acquire
+; X64:       cmpq
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgq
+
+; X32:       cmpl
+; X32:       cmpl
+; X32:       cmov
+; X32:       cmov
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_min64(i64 %x) nounwind {
+  %t1 = atomicrmw min  i64* @sc64, i64 %x acquire
+; X64:       cmpq
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgq
+
+; X32:       cmpl
+; X32:       cmpl
+; X32:       cmov
+; X32:       cmov
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_umax64(i64 %x) nounwind {
+  %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
+; X64:       cmpq
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgq
+
+; X32:       cmpl
+; X32:       cmpl
+; X32:       cmov
+; X32:       cmov
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_umin64(i64 %x) nounwind {
+  %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
+; X64:       cmpq
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgq
+
+; X32:       cmpl
+; X32:       cmpl
+; X32:       cmov
+; X32:       cmov
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_cmpxchg64() nounwind {
+  %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+; X64:       lock
+; X64:       cmpxchgq
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_store64(i64 %x) nounwind {
+  store atomic i64 %x, i64* @sc64 release, align 8
+; X64-NOT:   lock
+; X64:       movq
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_swap64(i64 %x) nounwind {
+  %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
+; X64-NOT:   lock
+; X64:       xchgq
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll
new file mode 100644
index 0000000000000..f9b21c5bc75e0
--- /dev/null
+++ b/test/CodeGen/X86/atomic6432.ll
@@ -0,0 +1,208 @@
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc64 = external global i64
+
+define void @atomic_fetch_add64() nounwind {
+; X32:   atomic_fetch_add64
+entry:
+  %t1 = atomicrmw add  i64* @sc64, i64 1 acquire
+; X32:       addl
+; X32:       adcl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t2 = atomicrmw add  i64* @sc64, i64 3 acquire
+; X32:       addl
+; X32:       adcl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t3 = atomicrmw add  i64* @sc64, i64 5 acquire
+; X32:       addl
+; X32:       adcl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t4 = atomicrmw add  i64* @sc64, i64 %t3 acquire
+; X32:       addl
+; X32:       adcl
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_sub64() nounwind {
+; X32:   atomic_fetch_sub64
+  %t1 = atomicrmw sub  i64* @sc64, i64 1 acquire
+; X32:       subl
+; X32:       sbbl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t2 = atomicrmw sub  i64* @sc64, i64 3 acquire
+; X32:       subl
+; X32:       sbbl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t3 = atomicrmw sub  i64* @sc64, i64 5 acquire
+; X32:       subl
+; X32:       sbbl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t4 = atomicrmw sub  i64* @sc64, i64 %t3 acquire
+; X32:       subl
+; X32:       sbbl
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_and64() nounwind {
+; X32:   atomic_fetch_and64
+  %t1 = atomicrmw and  i64* @sc64, i64 3 acquire
+; X32:       andl
+; X32:       andl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t2 = atomicrmw and  i64* @sc64, i64 5 acquire
+; X32:       andl
+; X32:       andl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t3 = atomicrmw and  i64* @sc64, i64 %t2 acquire
+; X32:       andl
+; X32:       andl
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_or64() nounwind {
+; X32:   atomic_fetch_or64
+  %t1 = atomicrmw or   i64* @sc64, i64 3 acquire
+; X32:       orl
+; X32:       orl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t2 = atomicrmw or   i64* @sc64, i64 5 acquire
+; X32:       orl
+; X32:       orl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t3 = atomicrmw or   i64* @sc64, i64 %t2 acquire
+; X32:       orl
+; X32:       orl
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_xor64() nounwind {
+; X32:   atomic_fetch_xor64
+  %t1 = atomicrmw xor  i64* @sc64, i64 3 acquire
+; X32:       xorl
+; X32:       xorl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t2 = atomicrmw xor  i64* @sc64, i64 5 acquire
+; X32:       xorl
+; X32:       xorl
+; X32:       lock
+; X32:       cmpxchg8b
+  %t3 = atomicrmw xor  i64* @sc64, i64 %t2 acquire
+; X32:       xorl
+; X32:       xorl
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_nand64(i64 %x) nounwind {
+; X32:   atomic_fetch_nand64
+  %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
+; X32:       andl
+; X32:       andl
+; X32:       notl
+; X32:       notl
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_max64(i64 %x) nounwind {
+  %t1 = atomicrmw max  i64* @sc64, i64 %x acquire
+; X32:       cmpl
+; X32:       cmpl
+; X32:       cmov
+; X32:       cmov
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_min64(i64 %x) nounwind {
+  %t1 = atomicrmw min  i64* @sc64, i64 %x acquire
+; X32:       cmpl
+; X32:       cmpl
+; X32:       cmov
+; X32:       cmov
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_umax64(i64 %x) nounwind {
+  %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
+; X32:       cmpl
+; X32:       cmpl
+; X32:       cmov
+; X32:       cmov
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_umin64(i64 %x) nounwind {
+  %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
+; X32:       cmpl
+; X32:       cmpl
+; X32:       cmov
+; X32:       cmov
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_cmpxchg64() nounwind {
+  %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_store64(i64 %x) nounwind {
+  store atomic i64 %x, i64* @sc64 release, align 8
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
+
+define void @atomic_fetch_swap64(i64 %x) nounwind {
+  %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire ; i386 has no 64-bit xchg; a lock cmpxchg8b loop is expected
+; X32:       lock
+; X32:       cmpxchg8b
+  ret void
+; X32:       ret
+}
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
new file mode 100644
index 0000000000000..412428406dcff
--- /dev/null
+++ b/test/CodeGen/X86/atomic8.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc8 = external global i8
+
+define void @atomic_fetch_add8() nounwind {
+; X64:   atomic_fetch_add8
+; X32:   atomic_fetch_add8
+entry:
+; 8-bit
+  %t1 = atomicrmw add  i8* @sc8, i8 1 acquire
+; X64:       lock
+; X64:       incb
+; X32:       lock
+; X32:       incb
+  %t2 = atomicrmw add  i8* @sc8, i8 3 acquire
+; X64:       lock
+; X64:       addb $3
+; X32:       lock
+; X32:       addb $3
+  %t3 = atomicrmw add  i8* @sc8, i8 5 acquire
+; X64:       lock
+; X64:       xaddb
+; X32:       lock
+; X32:       xaddb
+  %t4 = atomicrmw add  i8* @sc8, i8 %t3 acquire
+; X64:       lock
+; X64:       addb
+; X32:       lock
+; X32:       addb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_sub8() nounwind {
+; X64:   atomic_fetch_sub8
+; X32:   atomic_fetch_sub8
+  %t1 = atomicrmw sub  i8* @sc8, i8 1 acquire
+; X64:       lock
+; X64:       decb
+; X32:       lock
+; X32:       decb
+  %t2 = atomicrmw sub  i8* @sc8, i8 3 acquire
+; X64:       lock
+; X64:       subb $3
+; X32:       lock
+; X32:       subb $3
+  %t3 = atomicrmw sub  i8* @sc8, i8 5 acquire
+; X64:       lock
+; X64:       xaddb
+; X32:       lock
+; X32:       xaddb
+  %t4 = atomicrmw sub  i8* @sc8, i8 %t3 acquire
+; X64:       lock
+; X64:       subb
+; X32:       lock
+; X32:       subb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_and8() nounwind {
+; X64:   atomic_fetch_and8
+; X32:   atomic_fetch_and8
+  %t1 = atomicrmw and  i8* @sc8, i8 3 acquire
+; X64:       lock
+; X64:       andb $3
+; X32:       lock
+; X32:       andb $3
+  %t2 = atomicrmw and  i8* @sc8, i8 5 acquire
+; X64:       andb
+; X64:       lock
+; X64:       cmpxchgb
+; X32:       andb
+; X32:       lock
+; X32:       cmpxchgb
+  %t3 = atomicrmw and  i8* @sc8, i8 %t2 acquire
+; X64:       lock
+; X64:       andb
+; X32:       lock
+; X32:       andb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_or8() nounwind {
+; X64:   atomic_fetch_or8
+; X32:   atomic_fetch_or8
+  %t1 = atomicrmw or   i8* @sc8, i8 3 acquire
+; X64:       lock
+; X64:       orb $3
+; X32:       lock
+; X32:       orb $3
+  %t2 = atomicrmw or   i8* @sc8, i8 5 acquire
+; X64:       orb
+; X64:       lock
+; X64:       cmpxchgb
+; X32:       orb
+; X32:       lock
+; X32:       cmpxchgb
+  %t3 = atomicrmw or   i8* @sc8, i8 %t2 acquire
+; X64:       lock
+; X64:       orb
+; X32:       lock
+; X32:       orb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_xor8() nounwind {
+; X64:   atomic_fetch_xor8
+; X32:   atomic_fetch_xor8
+  %t1 = atomicrmw xor  i8* @sc8, i8 3 acquire
+; X64:       lock
+; X64:       xorb $3
+; X32:       lock
+; X32:       xorb $3
+  %t2 = atomicrmw xor  i8* @sc8, i8 5 acquire
+; X64:       xorb
+; X64:       lock
+; X64:       cmpxchgb
+; X32:       xorb
+; X32:       lock
+; X32:       cmpxchgb
+  %t3 = atomicrmw xor  i8* @sc8, i8 %t2 acquire
+; X64:       lock
+; X64:       xorb
+; X32:       lock
+; X32:       xorb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_nand8(i8 %x) nounwind {
+; X64:   atomic_fetch_nand8
+; X32:   atomic_fetch_nand8
+  %t1 = atomicrmw nand i8* @sc8, i8 %x acquire
+; X64:       andb
+; X64:       notb
+; X64:       lock
+; X64:       cmpxchgb
+; X32:       andb
+; X32:       notb
+; X32:       lock
+; X32:       cmpxchgb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_max8(i8 %x) nounwind {
+  %t1 = atomicrmw max  i8* @sc8, i8 %x acquire
+; X64:       cmpb
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgb
+
+; X32:       cmpb
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_min8(i8 %x) nounwind {
+  %t1 = atomicrmw min  i8* @sc8, i8 %x acquire
+; X64:       cmpb
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgb
+
+; X32:       cmpb
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_umax8(i8 %x) nounwind {
+  %t1 = atomicrmw umax i8* @sc8, i8 %x acquire
+; X64:       cmpb
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgb
+
+; X32:       cmpb
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_umin8(i8 %x) nounwind {
+  %t1 = atomicrmw umin i8* @sc8, i8 %x acquire
+; X64:       cmpb
+; X64:       cmov
+; X64:       lock
+; X64:       cmpxchgb
+; X32:       cmpb
+; X32:       cmov
+; X32:       lock
+; X32:       cmpxchgb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_cmpxchg8() nounwind {
+  %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire
+; X64:       lock
+; X64:       cmpxchgb
+; X32:       lock
+; X32:       cmpxchgb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_store8(i8 %x) nounwind {
+  store atomic i8 %x, i8* @sc8 release, align 4
+; X64-NOT:   lock
+; X64:       movb
+; X32-NOT:   lock
+; X32:       movb
+  ret void
+; X64:       ret
+; X32:       ret
+}
+
+define void @atomic_fetch_swap8(i8 %x) nounwind {
+  %t1 = atomicrmw xchg i8* @sc8, i8 %x acquire
+; X64-NOT:   lock
+; X64:       xchgb
+; X32-NOT:   lock
+; X32:       xchgb
+  ret void
+; X64:       ret
+; X32:       ret
+}
diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll
index 1fce256a8a24a..d94499889de41 100644
--- a/test/CodeGen/X86/atomic_add.ll
+++ b/test/CodeGen/X86/atomic_add.ll
@@ -178,7 +178,8 @@ entry:
 define void @sub2(i16* nocapture %p, i32 %v) nounwind ssp {
 entry:
 ; CHECK: sub2:
-; CHECK: negl
+; CHECK-NOT: negl
+; CHECK: subw
 	%0 = trunc i32 %v to i16		; <i16> [#uses=1]
   %1 = atomicrmw sub i16* %p, i16 %0 monotonic
   ret void
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index 152bece4240fe..c5fa07d07d802 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
@@ -107,13 +107,12 @@ entry:
         ; CHECK: cmpxchgl
   %17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic
 	store i32 %17, i32* %old
+        ; CHECK: movl  [[R17atomic:.*]], %eax
         ; CHECK: movl	$1401, %[[R17mask:[a-z]*]]
-        ; CHECK: movl	[[R17atomic:.*]], %eax
-        ; CHECK: movl	%eax, %[[R17newval:[a-z]*]]
-        ; CHECK: andl	%[[R17mask]], %[[R17newval]]
-        ; CHECK: notl	%[[R17newval]]
+        ; CHECK: andl	%eax, %[[R17mask]]
+        ; CHECK: notl	%[[R17mask]]
         ; CHECK: lock
-        ; CHECK: cmpxchgl	%[[R17newval]], [[R17atomic]]
+        ; CHECK: cmpxchgl	%[[R17mask]], [[R17atomic]]
         ; CHECK: jne
         ; CHECK: movl	%eax,
   %18 = atomicrmw nand i32* %val2, i32 1401 monotonic
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 8ad0fa82b58f5..95854c7960e71 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -109,8 +109,8 @@ allocas:
 ; rdar://10566486
 ; CHECK: fneg
 ; CHECK: vxorps
-define <16 x float> @fneg(<16 x float> addrspace(1)* nocapture %out) nounwind {
-  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+define <16 x float> @fneg(<16 x float> %a) nounwind {
+  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
   ret <16 x float> %1
 }
 
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
new file mode 100644
index 0000000000000..1446b36a0fb42
--- /dev/null
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
+; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
+; WIN64: leaq    {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl    %eax, (%esp)
+; WIN32: vaddps  {{.*}}, {{%ymm[0-1]}}
+; WIN32: vaddps  {{.*}}, {{%ymm[0-1]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: vaddps  {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: vaddps  {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: leaq    {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %2, %1
+  ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved ymm6-ymm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: vaddps  {{%ymm[6-7]}}, %ymm0, %ymm0
+; WIN64: vaddps  {{%ymm[6-7]}}, %ymm1, %ymm1
+; WIN64: ret
+
+; preserved ymm8-ymm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: vaddps  {{%ymm[8-9]}}, %ymm0, %ymm0
+; NOT_WIN: vaddps  {{%ymm[8-9]}}, %ymm1, %ymm1
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %1, %b
+  %4 = fadd <16 x float> %2, %3
+  ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}}     # 32-byte Spill
+; WIN64: call
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
+
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp)  ## 32-byte Spill
+; NOT_WIN: call
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+   %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+   ret <16 x float> %c
+}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index c44beb4bc2b8e..88ecd5a5d34f9 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1140,9 +1140,9 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun
 
 
 define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
-  ; CHECK: movl
-  ; CHECK: movl
-  ; CHECK: vpcmpestri
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: vpcmpestri $7
   ; CHECK: movl
   %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -1150,6 +1150,18 @@ define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
 
+define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: vpcmpestri $7, (
+  ; CHECK: movl
+  %1 = load <16 x i8>* %a0
+  %2 = load <16 x i8>* %a2
+  %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+
+
 define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
   ; CHECK: movl
   ; CHECK: movl
@@ -1216,8 +1228,19 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
 declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
 
+define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: vpcmpestrm $7,
+  ; CHECK-NOT: vmov
+  %1 = load <16 x i8>* %a2
+  %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
 define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcmpistri
+  ; CHECK: vpcmpistri $7
   ; CHECK: movl
   %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -1225,6 +1248,16 @@ define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 
+define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
+  ; CHECK: vpcmpistri $7, (
+  ; CHECK: movl
+  %1 = load <16 x i8>* %a0
+  %2 = load <16 x i8>* %a1
+  %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+
+
 define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vpcmpistri
   ; CHECK: seta
@@ -1271,7 +1304,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
 
 
 define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
-  ; CHECK: vpcmpistrm
+  ; CHECK: vpcmpistrm $7
   ; CHECK-NOT: vmov
   %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
   ret <16 x i8> %res
@@ -1279,6 +1312,15 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
 declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 
+define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
+  ; CHECK: vpcmpistrm $7, (
+  ; CHECK-NOT: vmov
+  %1 = load <16 x i8>* %a1
+  %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
 define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vaddss
   %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 9b41709a3b1ba..ec11654b35560 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -229,9 +229,8 @@ define   <8 x float> @test17(<4 x float> %y) {
 }
 
 ; CHECK: test18
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovshdup
+; CHECK: vblendps
 ; CHECK: ret
 define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
   %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -239,9 +238,8 @@ define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
 }
 
 ; CHECK: test19
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovsldup
+; CHECK: vblendps
 ; CHECK: ret
 define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
   %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index fe0f6caed36ab..ff56a454996e8 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@ entry:
 }
 
 ; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
 entry:
-  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
   %1 = bitcast float* %addr to <4 x float>*
   store <4 x float> %0, <4 x float>* %1, align 16
   ret void
@@ -32,27 +32,13 @@ entry:
 
 declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
 
-; CHECK: @t1
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp {
-entry:
-  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
-  %1 = bitcast float* %addr to i8*
-  tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0)
-  ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
 ; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
 entry:
-  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
   %1 = bitcast double* %addr to <2 x double>*
   store <2 x double> %0, <2 x double>* %1, align 16
   ret void
@@ -60,28 +46,14 @@ entry:
 
 declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
 
-; CHECK: @t3
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp {
-entry:
-  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
-  %1 = bitcast double* %addr to i8*
-  tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0)
-  ret void
-}
-
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
 ; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
 entry:
   %0 = bitcast <4 x i64> %a to <8 x i32>
-  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
   %2 = bitcast <4 x i32> %1 to <2 x i64>
   store <2 x i64> %2, <2 x i64>* %addr, align 16
   ret void
@@ -90,17 +62,43 @@ entry:
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
 
 ; CHECK: @t5
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovdqu %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+entry:
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+  %1 = bitcast float* %addr to <4 x float>*
+  store <4 x float> %0, <4 x float>* %1, align 16
+  ret void
+}
+
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+  %1 = bitcast double* %addr to <2 x double>*
+  store <2 x double> %0, <2 x double>* %1, align 16
+  ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
 entry:
   %0 = bitcast <4 x i64> %a to <8 x i32>
   %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
-  %2 = bitcast <2 x i64>* %addr to i8*
-  %3 = bitcast <4 x i32> %1 to <16 x i8>
-  tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  store <2 x i64> %2, <2 x i64>* %addr, align 16
   ret void
 }
 
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+  %0 = bitcast <4 x i64> %a to <8 x i32>
+  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  store <2 x i64> %2, <2 x i64>* %addr, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index c5899fa27426e..a414e6880c325 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -26,3 +26,37 @@ entry:
   %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %shuffle.i
 }
+
+; CHECK: vpshufb_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
+  %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
+                                                                i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,  
+                                                                i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
+                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+  ret <32 x i8>%S
+}
+
+; CHECK: vpshufb1_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
+  %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
+                                                                i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,  
+                                                                i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
+                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+  ret <32 x i8>%S
+}
+
+
+; CHECK: vpshufb2_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
+  %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
+                                                                i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,  
+                                                                i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
+                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+  ret <32 x i8>%S
+}
diff --git a/test/CodeGen/X86/bitcast-i256.ll b/test/CodeGen/X86/bitcast-i256.ll
new file mode 100644
index 0000000000000..85ac2fed6faac
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-i256.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i < %s | FileCheck %s --check-prefix CHECK
+
+define i256 @foo(<8 x i32> %a) {
+  %r = bitcast <8 x i32> %a to i256
+  ret i256 %r
+; CHECK: foo
+; CHECK: vextractf128
+; CHECK: vpextrq
+; CHECK: vpextrq
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index 0cb9fd9bc533f..09eb5d1038f75 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand | FileCheck %s
 
 define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
   %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
@@ -39,4 +39,20 @@ define i32 @bax(<2 x i64> %c) {
 ; CHECK: ret
 }
 
+define i32 @rnd(i32 %arg) nounwind uwtable {
+  %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+  %2 = extractvalue { i32, i32 } %1, 0
+  %3 = extractvalue { i32, i32 } %1, 1
+  %4 = icmp eq i32 %3, 0
+  %5 = select i1 %4, i32 0, i32 %arg
+  %6 = add i32 %5, %2
+  ret i32 %6
+; CHECK: rnd
+; CHECK: rdrand
+; CHECK: cmov
+; CHECK-NOT: cmov
+; CHECK: ret
+}
+
 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+declare { i32, i32 } @llvm.x86.rdrand.32() nounwind
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
new file mode 100644
index 0000000000000..3fb69a48b3c76
--- /dev/null
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
+  %t0 = fptoui <3 x float> %in to <3 x i8>
+  %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3
+  store <4 x i8> %t2, <4 x i8>* %out, align 4
+  ret void
+; CHECK: foo
+; CHECK: cvttps2dq
+; CHECK-NOT: pextrd
+; CHECK: pinsrd
+; CHECK-NEXT: pshufb
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/cmov-fp.ll b/test/CodeGen/X86/cmov-fp.ll
new file mode 100644
index 0000000000000..ca91f9ea2c2b1
--- /dev/null
+++ b/test/CodeGen/X86/cmov-fp.ll
@@ -0,0 +1,451 @@
+; RUN: llc -march x86 -mcpu pentium4 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -march x86 -mcpu pentium3 < %s | FileCheck %s -check-prefix=NOSSE2
+; RUN: llc -march x86 -mcpu pentium2 < %s | FileCheck %s -check-prefix=NOSSE1
+; RUN: llc -march x86 -mcpu pentium < %s | FileCheck %s -check-prefix=NOCMOV
+; PR14035
+
+define double @test1(i32 %a, i32 %b, double %x) nounwind {
+  %cmp = icmp ugt i32 %a, %b
+  %sel = select i1 %cmp, double 99.0, double %x
+  ret double %sel
+
+; SSE: test1:
+; SSE: movsd
+
+; NOSSE2: test1:
+; NOSSE2: fcmovnbe
+
+; NOSSE1: test1:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test1:
+; NOCMOV: fstp
+
+}
+
+define double @test2(i32 %a, i32 %b, double %x) nounwind {
+  %cmp = icmp uge i32 %a, %b
+  %sel = select i1 %cmp, double 99.0, double %x
+  ret double %sel
+
+; SSE: test2:
+; SSE: movsd
+
+; NOSSE2: test2:
+; NOSSE2: fcmovnb
+
+; NOSSE1: test2:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test2:
+; NOCMOV: fstp
+}
+
+define double @test3(i32 %a, i32 %b, double %x) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %sel = select i1 %cmp, double 99.0, double %x
+  ret double %sel
+
+; SSE: test3:
+; SSE: movsd
+
+; NOSSE2: test3:
+; NOSSE2: fcmovb
+
+; NOSSE1: test3:
+; NOSSE1: fcmovb
+
+; NOCMOV: test3:
+; NOCMOV: fstp
+}
+
+define double @test4(i32 %a, i32 %b, double %x) nounwind {
+  %cmp = icmp ule i32 %a, %b
+  %sel = select i1 %cmp, double 99.0, double %x
+  ret double %sel
+
+; SSE: test4:
+; SSE: movsd
+
+; NOSSE2: test4:
+; NOSSE2: fcmovbe
+
+; NOSSE1: test4:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test4:
+; NOCMOV: fstp
+}
+
+define double @test5(i32 %a, i32 %b, double %x) nounwind {
+  %cmp = icmp sgt i32 %a, %b
+  %sel = select i1 %cmp, double 99.0, double %x
+  ret double %sel
+
+; SSE: test5:
+; SSE: movsd
+
+; NOSSE2: test5:
+; NOSSE2: fstp
+
+; NOSSE1: test5:
+; NOSSE1: fstp
+
+; NOCMOV: test5:
+; NOCMOV: fstp
+}
+
+define double @test6(i32 %a, i32 %b, double %x) nounwind {
+  %cmp = icmp sge i32 %a, %b
+  %sel = select i1 %cmp, double 99.0, double %x
+  ret double %sel
+
+; SSE: test6:
+; SSE: movsd
+
+; NOSSE2: test6:
+; NOSSE2: fstp
+
+; NOSSE1: test6:
+; NOSSE1: fstp
+
+; NOCMOV: test6:
+; NOCMOV: fstp
+}
+
+define double @test7(i32 %a, i32 %b, double %x) nounwind {
+  %cmp = icmp slt i32 %a, %b
+  %sel = select i1 %cmp, double 99.0, double %x
+  ret double %sel
+
+; SSE: test7:
+; SSE: movsd
+
+; NOSSE2: test7:
+; NOSSE2: fstp
+
+; NOSSE1: test7:
+; NOSSE1: fstp
+
+; NOCMOV: test7:
+; NOCMOV: fstp
+}
+
+define double @test8(i32 %a, i32 %b, double %x) nounwind {
+  %cmp = icmp sle i32 %a, %b
+  %sel = select i1 %cmp, double 99.0, double %x
+  ret double %sel
+
+; SSE: test8:
+; SSE: movsd
+
+; NOSSE2: test8:
+; NOSSE2: fstp
+
+; NOSSE1: test8:
+; NOSSE1: fstp
+
+; NOCMOV: test8:
+; NOCMOV: fstp
+}
+
+define float @test9(i32 %a, i32 %b, float %x) nounwind {
+  %cmp = icmp ugt i32 %a, %b
+  %sel = select i1 %cmp, float 99.0, float %x
+  ret float %sel
+
+; SSE: test9:
+; SSE: movss
+
+; NOSSE2: test9:
+; NOSSE2: movss
+
+; NOSSE1: test9:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test9:
+; NOCMOV: fstp
+}
+
+define float @test10(i32 %a, i32 %b, float %x) nounwind {
+  %cmp = icmp uge i32 %a, %b
+  %sel = select i1 %cmp, float 99.0, float %x
+  ret float %sel
+
+; SSE: test10:
+; SSE: movss
+
+; NOSSE2: test10:
+; NOSSE2: movss
+
+; NOSSE1: test10:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test10:
+; NOCMOV: fstp
+}
+
+define float @test11(i32 %a, i32 %b, float %x) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %sel = select i1 %cmp, float 99.0, float %x
+  ret float %sel
+
+; SSE: test11:
+; SSE: movss
+
+; NOSSE2: test11:
+; NOSSE2: movss
+
+; NOSSE1: test11:
+; NOSSE1: fcmovb
+
+; NOCMOV: test11:
+; NOCMOV: fstp
+}
+
+define float @test12(i32 %a, i32 %b, float %x) nounwind {
+  %cmp = icmp ule i32 %a, %b
+  %sel = select i1 %cmp, float 99.0, float %x
+  ret float %sel
+
+; SSE: test12:
+; SSE: movss
+
+; NOSSE2: test12:
+; NOSSE2: movss
+
+; NOSSE1: test12:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test12:
+; NOCMOV: fstp
+}
+
+define float @test13(i32 %a, i32 %b, float %x) nounwind {
+  %cmp = icmp sgt i32 %a, %b
+  %sel = select i1 %cmp, float 99.0, float %x
+  ret float %sel
+
+; SSE: test13:
+; SSE: movss
+
+; NOSSE2: test13:
+; NOSSE2: movss
+
+; NOSSE1: test13:
+; NOSSE1: fstp
+
+; NOCMOV: test13:
+; NOCMOV: fstp
+}
+
+define float @test14(i32 %a, i32 %b, float %x) nounwind {
+  %cmp = icmp sge i32 %a, %b
+  %sel = select i1 %cmp, float 99.0, float %x
+  ret float %sel
+
+; SSE: test14:
+; SSE: movss
+
+; NOSSE2: test14:
+; NOSSE2: movss
+
+; NOSSE1: test14:
+; NOSSE1: fstp
+
+; NOCMOV: test14:
+; NOCMOV: fstp
+}
+
+define float @test15(i32 %a, i32 %b, float %x) nounwind {
+  %cmp = icmp slt i32 %a, %b
+  %sel = select i1 %cmp, float 99.0, float %x
+  ret float %sel
+
+; SSE: test15:
+; SSE: movss
+
+; NOSSE2: test15:
+; NOSSE2: movss
+
+; NOSSE1: test15:
+; NOSSE1: fstp
+
+; NOCMOV: test15:
+; NOCMOV: fstp
+}
+
+define float @test16(i32 %a, i32 %b, float %x) nounwind {
+  %cmp = icmp sle i32 %a, %b
+  %sel = select i1 %cmp, float 99.0, float %x
+  ret float %sel
+
+; SSE: test16:
+; SSE: movss
+
+; NOSSE2: test16:
+; NOSSE2: movss
+
+; NOSSE1: test16:
+; NOSSE1: fstp
+
+; NOCMOV: test16:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+  %cmp = icmp ugt i32 %a, %b
+  %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+  ret x86_fp80 %sel
+
+; SSE: test17:
+; SSE: fcmovnbe
+
+; NOSSE2: test17:
+; NOSSE2: fcmovnbe
+
+; NOSSE1: test17:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test17:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+  %cmp = icmp uge i32 %a, %b
+  %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+  ret x86_fp80 %sel
+
+; SSE: test18:
+; SSE: fcmovnb
+
+; NOSSE2: test18:
+; NOSSE2: fcmovnb
+
+; NOSSE1: test18:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test18:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+  ret x86_fp80 %sel
+
+; SSE: test19:
+; SSE: fcmovb
+
+; NOSSE2: test19:
+; NOSSE2: fcmovb
+
+; NOSSE1: test19:
+; NOSSE1: fcmovb
+
+; NOCMOV: test19:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test20(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+  %cmp = icmp ule i32 %a, %b
+  %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+  ret x86_fp80 %sel
+
+; SSE: test20:
+; SSE: fcmovbe
+
+; NOSSE2: test20:
+; NOSSE2: fcmovbe
+
+; NOSSE1: test20:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test20:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+  %cmp = icmp sgt i32 %a, %b
+  %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+  ret x86_fp80 %sel
+
+; FIXME: For signed compares on fp80 we emit testb + fcmovne rather than a branch; why?
+; SSE: test21:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test21:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test21:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test21:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+  %cmp = icmp sge i32 %a, %b
+  %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+  ret x86_fp80 %sel
+
+; SSE: test22:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test22:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test22:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test22:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+  %cmp = icmp slt i32 %a, %b
+  %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+  ret x86_fp80 %sel
+
+; SSE: test23:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test23:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test23:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test23:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test24(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+  %cmp = icmp sle i32 %a, %b
+  %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+  ret x86_fp80 %sel
+
+; SSE: test24:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test24:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test24:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test24:
+; NOCMOV: fstp
+}
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 9badfc82e99cb..276d0db9a4f3a 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -442,3 +442,150 @@ entry:
   ret void
 }
 declare void @_Z6PrintFz(...)
+
+@a = external global i32, align 4
+@fn1.g = private unnamed_addr constant [9 x i32*] [i32* null, i32* @a, i32* null, i32* null, i32* null, i32* null, i32* null, i32* null, i32* null], align 16
+@e = external global i32, align 4
+
+define void @pr13943() nounwind uwtable ssp {
+entry:
+  %srcval = load i576* bitcast ([9 x i32*]* @fn1.g to i576*), align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %g.0 = phi i576 [ %srcval, %entry ], [ %ins, %for.inc ]
+  %0 = load i32* @e, align 4
+  %1 = lshr i576 %g.0, 64
+  %2 = trunc i576 %1 to i64
+  %3 = inttoptr i64 %2 to i32*
+  %cmp = icmp eq i32* undef, %3
+  %conv2 = zext i1 %cmp to i32
+  %and = and i32 %conv2, %0
+  tail call void (...)* @fn3(i32 %and) nounwind
+  %tobool = icmp eq i32 undef, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.cond
+  ret void
+
+for.inc:                                          ; preds = %for.cond
+  %4 = shl i576 %1, 384
+  %mask = and i576 %g.0, -726838724295606890509921801691610055141362320587174446476410459910173841445449629921945328942266354949348255351381262292727973638307841
+  %5 = and i576 %4, 726838724295606890509921801691610055141362320587174446476410459910173841445449629921945328942266354949348255351381262292727973638307840
+  %ins = or i576 %5, %mask
+  br label %for.cond
+}
+
+declare void @fn3(...)
+
+; Check coalescing of IMPLICIT_DEF instructions:
+;
+; %vreg1 = IMPLICIT_DEF
+; %vreg2 = MOV32r0
+;
+; When coalescing %vreg1 and %vreg2, the IMPLICIT_DEF instruction should be
+; erased along with its value number.
+;
+define void @rdar12474033() nounwind ssp {
+bb:
+  br i1 undef, label %bb21, label %bb1
+
+bb1:                                              ; preds = %bb
+  switch i32 undef, label %bb10 [
+    i32 4, label %bb2
+    i32 1, label %bb9
+    i32 5, label %bb3
+    i32 6, label %bb3
+    i32 2, label %bb9
+  ]
+
+bb2:                                              ; preds = %bb1
+  unreachable
+
+bb3:                                              ; preds = %bb1, %bb1
+  br i1 undef, label %bb4, label %bb5
+
+bb4:                                              ; preds = %bb3
+  unreachable
+
+bb5:                                              ; preds = %bb3
+  %tmp = load <4 x float>* undef, align 1
+  %tmp6 = bitcast <4 x float> %tmp to i128
+  %tmp7 = load <4 x float>* undef, align 1
+  %tmp8 = bitcast <4 x float> %tmp7 to i128
+  br label %bb10
+
+bb9:                                              ; preds = %bb1, %bb1
+  unreachable
+
+bb10:                                             ; preds = %bb5, %bb1
+  %tmp11 = phi i128 [ undef, %bb1 ], [ %tmp6, %bb5 ]
+  %tmp12 = phi i128 [ 0, %bb1 ], [ %tmp8, %bb5 ]
+  switch i32 undef, label %bb21 [
+    i32 2, label %bb18
+    i32 3, label %bb13
+    i32 5, label %bb16
+    i32 6, label %bb17
+    i32 1, label %bb18
+  ]
+
+bb13:                                             ; preds = %bb10
+  br i1 undef, label %bb15, label %bb14
+
+bb14:                                             ; preds = %bb13
+  br label %bb21
+
+bb15:                                             ; preds = %bb13
+  unreachable
+
+bb16:                                             ; preds = %bb10
+  unreachable
+
+bb17:                                             ; preds = %bb10
+  unreachable
+
+bb18:                                             ; preds = %bb10, %bb10
+  %tmp19 = bitcast i128 %tmp11 to <4 x float>
+  %tmp20 = bitcast i128 %tmp12 to <4 x float>
+  br label %bb21
+
+bb21:                                             ; preds = %bb18, %bb14, %bb10, %bb
+  %tmp22 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp20, %bb18 ]
+  %tmp23 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp19, %bb18 ]
+  store <4 x float> %tmp23, <4 x float>* undef, align 16
+  store <4 x float> %tmp22, <4 x float>* undef, align 16
+  switch i32 undef, label %bb29 [
+    i32 5, label %bb27
+    i32 1, label %bb24
+    i32 2, label %bb25
+    i32 14, label %bb28
+    i32 4, label %bb26
+  ]
+
+bb24:                                             ; preds = %bb21
+  unreachable
+
+bb25:                                             ; preds = %bb21
+  br label %bb29
+
+bb26:                                             ; preds = %bb21
+  br label %bb29
+
+bb27:                                             ; preds = %bb21
+  unreachable
+
+bb28:                                             ; preds = %bb21
+  br label %bb29
+
+bb29:                                             ; preds = %bb28, %bb26, %bb25, %bb21
+  unreachable
+}
+
+define void @pr14194() nounwind uwtable {
+  %tmp = load i64* undef, align 16
+  %tmp1 = trunc i64 %tmp to i32
+  %tmp2 = lshr i64 %tmp, 32
+  %tmp3 = trunc i64 %tmp2 to i32
+  %tmp4 = call { i32, i32 } asm sideeffect "", "=&r,=&r,r,r,0,1,~{dirflag},~{fpsr},~{flags}"(i32 %tmp3, i32 undef, i32 %tmp3, i32 %tmp1) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
new file mode 100644
index 0000000000000..466b09606786f
--- /dev/null
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s
+
+define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) {
+  %t1 = uitofp i32 %x to float
+  %t2 = insertelement <2 x float> undef, float %t1, i32 0
+  %t3 = uitofp i32 %y to float
+  %t4 = insertelement <2 x float> %t2, float %t3, i32 1
+  %t5 = fmul <2 x float> %v, %t4
+  ret <2 x float> %t5
+; CHECK: foo
+; CHECK: or
+; CHECK: subpd
+; CHECK: cvtpd2ps
+; CHECK: ret
+}
+
+define <2 x float> @bar(<2 x i32> %in) {
+  %r = uitofp <2 x i32> %in to <2 x float>
+  ret <2 x float> %r
+; CHECK: bar
+; CHECK: or
+; CHECK: subpd
+; CHECK: cvtpd2ps
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/early-ifcvt-crash.ll b/test/CodeGen/X86/early-ifcvt-crash.ll
new file mode 100644
index 0000000000000..c8280269689d7
--- /dev/null
+++ b/test/CodeGen/X86/early-ifcvt-crash.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -x86-early-ifcvt -verify-machineinstrs
+; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt -verify-machineinstrs
+;
+; Run these tests with and without -stress-early-ifcvt to exercise heuristics.
+;
+target triple = "x86_64-apple-macosx10.8.0"
+
+; MachineTraceMetrics::Ensemble::addLiveIns crashes because the first operand
+; on an inline asm instruction is not a vreg def.
+; <rdar://problem/12472811>
+define void @f1() nounwind {
+entry:
+  br i1 undef, label %if.then6.i, label %if.end.i
+
+if.then6.i:
+  br label %if.end.i
+
+if.end.i:
+  br i1 undef, label %if.end25.i, label %if.else17.i
+
+if.else17.i:
+  %shl24.i = shl i32 undef, undef
+  br label %if.end25.i
+
+if.end25.i:
+  %storemerge31.i = phi i32 [ %shl24.i, %if.else17.i ], [ 0, %if.end.i ]
+  store i32 %storemerge31.i, i32* undef, align 4
+  %0 = tail call i32 asm sideeffect "", "=r,r,i,i"(i32 undef, i32 15, i32 1) nounwind
+  %conv = trunc i32 %0 to i8
+  store i8 %conv, i8* undef, align 1
+  unreachable
+}
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
index 7883ffabd5659..2e1852d3e3aef 100644
--- a/test/CodeGen/X86/early-ifcvt.ll
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -enable-early-ifcvt -stress-early-ifcvt | FileCheck %s
+; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt | FileCheck %s
 target triple = "x86_64-apple-macosx10.8.0"
 
 ; CHECK: mm2
@@ -67,3 +67,78 @@ if.end41:
 }
 
 declare void @fprintf(...) nounwind
+
+; CHECK: BZ2_decompress
+; This test case contains irreducible control flow, so MachineLoopInfo doesn't
+; recognize the cycle in the CFG. This would confuse MachineTraceMetrics.
+define void @BZ2_decompress(i8* %s) nounwind ssp {
+entry:
+  switch i32 undef, label %sw.default [
+    i32 39, label %if.end.sw.bb2050_crit_edge
+    i32 36, label %sw.bb1788
+    i32 37, label %if.end.sw.bb1855_crit_edge
+    i32 40, label %sw.bb2409
+    i32 38, label %sw.bb1983
+    i32 44, label %if.end.sw.bb3058_crit_edge
+  ]
+
+if.end.sw.bb3058_crit_edge:                       ; preds = %entry
+  br label %save_state_and_return
+
+if.end.sw.bb1855_crit_edge:                       ; preds = %entry
+  br label %save_state_and_return
+
+if.end.sw.bb2050_crit_edge:                       ; preds = %entry
+  br label %sw.bb2050
+
+sw.bb1788:                                        ; preds = %entry
+  br label %save_state_and_return
+
+sw.bb1983:                                        ; preds = %entry
+  br i1 undef, label %save_state_and_return, label %if.then1990
+
+if.then1990:                                      ; preds = %sw.bb1983
+  br label %while.body2038
+
+while.body2038:                                   ; preds = %sw.bb2050, %if.then1990
+  %groupPos.8 = phi i32 [ 0, %if.then1990 ], [ %groupPos.9, %sw.bb2050 ]
+  br i1 undef, label %save_state_and_return, label %if.end2042
+
+if.end2042:                                       ; preds = %while.body2038
+  br i1 undef, label %if.end2048, label %while.end2104
+
+if.end2048:                                       ; preds = %if.end2042
+  %bsLive2054.pre = getelementptr inbounds i8* %s, i32 8
+  br label %sw.bb2050
+
+sw.bb2050:                                        ; preds = %if.end2048, %if.end.sw.bb2050_crit_edge
+  %groupPos.9 = phi i32 [ 0, %if.end.sw.bb2050_crit_edge ], [ %groupPos.8, %if.end2048 ]
+  %and2064 = and i32 undef, 1
+  br label %while.body2038
+
+while.end2104:                                    ; preds = %if.end2042
+  br i1 undef, label %save_state_and_return, label %if.end2117
+
+if.end2117:                                       ; preds = %while.end2104
+  br i1 undef, label %while.body2161.lr.ph, label %while.body2145.lr.ph
+
+while.body2145.lr.ph:                             ; preds = %if.end2117
+  br label %save_state_and_return
+
+while.body2161.lr.ph:                             ; preds = %if.end2117
+  br label %save_state_and_return
+
+sw.bb2409:                                        ; preds = %entry
+  br label %save_state_and_return
+
+sw.default:                                       ; preds = %entry
+  call void @BZ2_bz__AssertH__fail() nounwind
+  br label %save_state_and_return
+
+save_state_and_return:
+  %groupPos.14 = phi i32 [ 0, %sw.default ], [ %groupPos.8, %while.body2038 ], [ %groupPos.8, %while.end2104 ], [ 0, %if.end.sw.bb3058_crit_edge ], [ 0, %if.end.sw.bb1855_crit_edge ], [ %groupPos.8, %while.body2161.lr.ph ], [ %groupPos.8, %while.body2145.lr.ph ], [ 0, %sw.bb2409 ], [ 0, %sw.bb1788 ], [ 0, %sw.bb1983 ]
+  store i32 %groupPos.14, i32* undef, align 4
+  ret void
+}
+
+declare void @BZ2_bz__AssertH__fail()
diff --git a/test/CodeGen/X86/extract-concat.ll b/test/CodeGen/X86/extract-concat.ll
new file mode 100644
index 0000000000000..704309eb65072
--- /dev/null
+++ b/test/CodeGen/X86/extract-concat.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @foo(<4 x float> %in, <4 x i8>* %out) {
+  %t0 = fptosi <4 x float> %in to <4 x i32>
+  %t1 = trunc <4 x i32> %t0 to <4 x i16>
+  %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t3 = trunc <8 x i16> %t2 to <8 x i8>
+  %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
+  store <4 x i8> %t5, <4 x i8>* %out
+  ret void
+; CHECK: foo
+; CHECK: cvttps2dq
+; CHECK-NOT: pextrd
+; CHECK: pshufb
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/fast-cc-callee-pops.ll b/test/CodeGen/X86/fast-cc-callee-pops.ll
index ea10897c73574..2c5b80ac4af07 100644
--- a/test/CodeGen/X86/fast-cc-callee-pops.ll
+++ b/test/CodeGen/X86/fast-cc-callee-pops.ll
@@ -2,12 +2,12 @@
 
 ; Check that a fastcc function pops its stack variables before returning.
 
-define x86_fastcallcc void @func(i64 %X, i64 %Y, float %G, double %Z) nounwind {
+define x86_fastcallcc void @func(i64 inreg %X, i64 %Y, float %G, double %Z) nounwind {
         ret void
 ; CHECK: ret{{.*}}20
 }
 
-define x86_thiscallcc void @func2(i32 %X, i64 %Y, float %G, double %Z) nounwind {
+define x86_thiscallcc void @func2(i32 inreg %X, i64 %Y, float %G, double %Z) nounwind {
         ret void
 ; CHECK: ret{{.*}}20
 }
diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
index 14cb136f89de6..d591f9408b140 100644
--- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
+++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
@@ -3,7 +3,7 @@
 
 target triple = "i686-pc-linux-gnu"
 
-declare x86_fastcallcc void @func(i32*, i64)
+declare x86_fastcallcc void @func(i32*, i64 inreg)
 
 define x86_fastcallcc void @caller(i32, i64) {
         %X = alloca i32         ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/fast-cc-pass-in-regs.ll b/test/CodeGen/X86/fast-cc-pass-in-regs.ll
index a96e5043fed4e..b60b68bd388da 100644
--- a/test/CodeGen/X86/fast-cc-pass-in-regs.ll
+++ b/test/CodeGen/X86/fast-cc-pass-in-regs.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s
 ; check that fastcc is passing stuff in regs.
 
-declare x86_fastcallcc i64 @callee(i64)
+declare x86_fastcallcc i64 @callee(i64 inreg)
 
 define i64 @caller() {
         %X = call x86_fastcallcc  i64 @callee( i64 4294967299 )          ; <i64> [#uses=1]
@@ -9,7 +9,7 @@ define i64 @caller() {
         ret i64 %X
 }
 
-define x86_fastcallcc i64 @caller2(i64 %X) {
+define x86_fastcallcc i64 @caller2(i64 inreg %X) {
         ret i64 %X
 ; CHECK: mov{{.*}}EAX, ECX
 }
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d8f4663c94e6e..cdfaf7f4c1348 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s  -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
@@ -197,6 +198,11 @@ block2:
 ; CHECK: cvtsi2sdq {{.*}} %xmm0
 ; CHECK: movb $1, %al
 ; CHECK: callq _test16callee
+
+; AVX: movabsq $1
+; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
+; AVX: movb $1, %al
+; AVX: callq _test16callee
   call void (...)* @test16callee(double 1.000000e+00)
   ret void
 }
@@ -285,3 +291,16 @@ entry:
 }
 
 declare void @foo22(i32)
+
+; PR13563
+define void @test23(i8* noalias sret %result) {
+  %a = alloca i8
+  %b = call i8* @foo23()
+  ret void
+; CHECK: test23:
+; CHECK: call
+; CHECK: movq  %rdi, %rax
+; CHECK: ret
+}
+
+declare i8* @foo23()
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b0c1d0a0dd1c5..bd3514cc3f732 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+fma  | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10               | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10             | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+fma,-fma4  | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=-fma,-fma4  | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10  -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4  | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
 
 ; CHECK: test_f32
 ; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: _fmaf
+; CHECK-FMA-CALL: fmaf
 
 define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
 entry:
@@ -15,7 +17,7 @@ entry:
 
 ; CHECK: test_f64
 ; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: _fma
+; CHECK-FMA-CALL: fma
 
 define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
 entry:
@@ -24,7 +26,7 @@ entry:
 }
 
 ; CHECK: test_f80
-; CHECK: _fmal
+; CHECK: fmal
 
 define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 90529e09d75b3..e3910a6935c45 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
 
 define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   ; CHECK: fmadd213ss %xmm
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index fd414b346e2b2..2fe1ecd40e0cc 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
 
 ; VFMADD
 define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5d97a87b3bbfe..6d98d59b3822b 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,8 +1,13 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
 
 ; CHECK: test_x86_fmadd_ps
-; CHECK: vfmadd213ps     %xmm2, %xmm0, %xmm1
+; CHECK: vfmadd213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fadd <4 x float> %x, %a2
@@ -10,8 +15,11 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 }
 
 ; CHECK: test_x86_fmsub_ps
-; CHECK: fmsub213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fmsub213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fsub <4 x float> %x, %a2
@@ -19,8 +27,11 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 }
 
 ; CHECK: test_x86_fnmadd_ps
-; CHECK: fnmadd213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fnmadd213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps
+; CHECK_FMA4: vfnmaddps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fsub <4 x float> %a2, %x
@@ -28,8 +39,11 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
 }
 
 ; CHECK: test_x86_fnmsub_ps
-; CHECK: fnmsub213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fnmsub213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ps
+; CHECK_FMA4: fnmsubps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
@@ -38,8 +52,11 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
 }
 
 ; CHECK: test_x86_fmadd_ps_y
-; CHECK: vfmadd213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y
+; CHECK_FMA4: vfmaddps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fadd <8 x float> %x, %a2
@@ -47,8 +64,11 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
 }
 
 ; CHECK: test_x86_fmsub_ps_y
-; CHECK: vfmsub213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y
+; CHECK_FMA4: vfmsubps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fsub <8 x float> %x, %a2
@@ -56,8 +76,11 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
 }
 
 ; CHECK: test_x86_fnmadd_ps_y
-; CHECK: vfnmadd213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfnmadd213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y
+; CHECK_FMA4: vfnmaddps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fsub <8 x float> %a2, %x
@@ -65,7 +88,7 @@ define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
 }
 
 ; CHECK: test_x86_fnmsub_ps_y
-; CHECK: vfnmsub213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfnmsub213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
 define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
@@ -75,8 +98,11 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
 }
 
 ; CHECK: test_x86_fmadd_pd_y
-; CHECK: vfmadd213pd     %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213pd     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y
+; CHECK_FMA4: vfmaddpd     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
   %x = fmul <4 x double> %a0, %a1
   %res = fadd <4 x double> %x, %a2
@@ -84,8 +110,11 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4
 }
 
 ; CHECK: test_x86_fmsub_pd_y
-; CHECK: vfmsub213pd     %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213pd     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y
+; CHECK_FMA4: vfmsubpd     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
   %x = fmul <4 x double> %a0, %a1
   %res = fsub <4 x double> %x, %a2
@@ -93,8 +122,11 @@ define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4
 }
 
 ; CHECK: test_x86_fmsub_pd
-; CHECK: vfmsub213pd     %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213pd     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd
+; CHECK_FMA4: vfmsubpd     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
   %x = fmul <2 x double> %a0, %a1
   %res = fsub <2 x double> %x, %a2
@@ -102,8 +134,11 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x
 }
 
 ; CHECK: test_x86_fnmadd_ss
-; CHECK: vfnmadd213ss    %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213ss    %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ss
+; CHECK_FMA4: vfnmaddss    %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
   %x = fmul float %a0, %a1
   %res = fsub float %a2, %x
@@ -111,8 +146,11 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
 }
 
 ; CHECK: test_x86_fnmadd_sd
-; CHECK: vfnmadd213sd     %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213sd     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_sd
+; CHECK_FMA4: vfnmaddsd     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
   %x = fmul double %a0, %a1
   %res = fsub double %a2, %x
@@ -120,8 +158,11 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
 }
 
 ; CHECK: test_x86_fmsub_sd
-; CHECK: vfmsub213sd     %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213sd     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_sd
+; CHECK_FMA4: vfmsubsd     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
   %x = fmul double %a0, %a1
   %res = fsub double %x, %a2
@@ -129,11 +170,43 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
 }
 
 ; CHECK: test_x86_fnmsub_ss
-; CHECK: vfnmsub213ss     %xmm2, %xmm0, %xmm1
+; CHECK: vfnmsub213ss     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ss
+; CHECK_FMA4: vfnmsubss     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
   %x = fsub float -0.000000e+00, %a0
   %y = fmul float %x, %a1
   %res = fsub float %y, %a2
   ret float %res
 }
+
+; CHECK: test_x86_fmadd_ps_load
+; CHECK: vmovaps         (%rdi), %xmm2
+; CHECK: vfmadd213ps     %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_load
+; CHECK_FMA4: vfmaddps     %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+  %x = load <4 x float>* %a0        ; one multiplicand comes from memory
+  %y = fmul <4 x float> %x, %a1
+  %res = fadd <4 x float> %y, %a2   ; fmul + fadd pair expected to fuse (RUN lines use -fp-contract=fast)
+  ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_load
+; CHECK: vmovaps         (%rdi), %xmm2
+; CHECK: fmsub213ps     %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_load
+; CHECK_FMA4: vfmsubps     %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+  %x = load <4 x float>* %a0        ; one multiplicand comes from memory
+  %y = fmul <4 x float> %x, %a1
+  %res = fsub <4 x float> %y, %a2   ; fmul + fsub pair expected to fuse (RUN lines use -fp-contract=fast)
+  ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index c961f7576f937..d8366654c01ce 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -57,13 +57,13 @@ entry:
   %0 = load i32* %P, align 4
   %1 = load i32* %Q, align 4
   %2 = xor i32 %0, %1
-  %3 = and i32 %2, 65535
+  %3 = and i32 %2, 89947
   %4 = icmp eq i32 %3, 0
   br i1 %4, label %exit, label %land.end
 
 exit:
   %shr.i.i19 = xor i32 %1, %0
-  %5 = and i32 %shr.i.i19, 2147418112
+  %5 = and i32 %shr.i.i19, 3456789123
   %6 = icmp eq i32 %5, 0
   br label %land.end
 
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
new file mode 100644
index 0000000000000..d70aa7d79f009
--- /dev/null
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=x86-64 -mattr=+avx,-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+
+; CHECK: test1
+define float @test1(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+  %t1 = fadd float %a, %a
+  %r = fadd float %t1, %t1
+  ret float %r
+}
+
+; CHECK: test2
+define float @test2(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+  %t1 = fmul float 4.0, %a
+  %t2 = fadd float %a, %a
+  %r = fadd float %t1, %t2
+  ret float %r
+}
+
+; CHECK: test3
+define float @test3(float %a) {
+; CHECK-NOT: addss
+; CHECK: xorps
+; CHECK-NOT: addss
+; CHECK: ret
+  %t1 = fmul float 2.0, %a
+  %t2 = fadd float %a, %a
+  %r = fsub float %t1, %t2
+  ret float %r
+}
+
+; CHECK: test4
+define float @test4(float %a) {
+; CHECK-NOT: fma
+; CHECK-NOT: mul
+; CHECK-NOT: add
+; CHECK: ret
+  %t1 = fmul float %a, 0.0        ; a * 0.0: no multiply should survive (RUN line uses -enable-unsafe-fp-math)
+  %t2 = fadd float %a, %t1        ; a + (a * 0.0): no add should survive either
+  ret float %t2
+}
+
+; CHECK: test5
+define float @test5(float %a) {
+; CHECK-NOT: add
+; CHECK: vxorps
+; CHECK: ret
+  %t1 = fsub float -0.0, %a
+  %t2 = fadd float %a, %t1
+  ret float %t2
+}
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll
new file mode 100644
index 0000000000000..2ae65c97d97ac
--- /dev/null
+++ b/test/CodeGen/X86/fp-load-trunc.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
+
+define <1 x float> @test1(<1 x double>* %p) nounwind {
+; CHECK: test1
+; CHECK: cvtsd2ss
+; CHECK: ret
+; AVX:   test1
+; AVX:   vcvtsd2ss
+; AVX:   ret
+  %x = load <1 x double>* %p
+  %y = fptrunc <1 x double> %x to <1 x float>
+  ret <1 x float> %y
+}
+
+define <2 x float> @test2(<2 x double>* %p) nounwind {
+; CHECK: test2
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: ret
+; AVX:   test2
+; AVX:   vcvtpd2psx {{[0-9]*}}(%{{.*}})
+; AVX:   ret
+  %x = load <2 x double>* %p
+  %y = fptrunc <2 x double> %x to <2 x float>
+  ret <2 x float> %y
+}
+
+define <4 x float> @test3(<4 x double>* %p) nounwind {
+; CHECK: test3
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: ret
+; AVX:   test3
+; AVX:   vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX:   ret
+  %x = load <4 x double>* %p
+  %y = fptrunc <4 x double> %x to <4 x float>
+  ret <4 x float> %y
+}
+
+define <8 x float> @test4(<8 x double>* %p) nounwind {
+; CHECK: test4
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: ret
+; AVX:   test4
+; AVX:   vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX:   vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX:   vinsertf128
+; AVX:   ret
+  %x = load <8 x double>* %p
+  %y = fptrunc <8 x double> %x to <8 x float>
+  ret <8 x float> %y
+}
+
+
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 170637a40ee23..25442fcadd238 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -1,33 +1,56 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
 
 define <1 x float> @test1(<1 x double> %x) nounwind {
+; CHECK: test1
 ; CHECK: cvtsd2ss
 ; CHECK: ret
+; AVX:   test1
+; AVX:   vcvtsd2ss
+; AVX:   ret
   %y = fptrunc <1 x double> %x to <1 x float>
   ret <1 x float> %y
 }
 
-
 define <2 x float> @test2(<2 x double> %x) nounwind {
-; FIXME: It would be nice if this compiled down to a cvtpd2ps
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
+; CHECK: test2
+; CHECK: cvtpd2ps
 ; CHECK: ret
+; AVX:   test2
+; AVX-NOT:  vcvtpd2psy
+; AVX:   vcvtpd2ps
+; AVX:   ret
   %y = fptrunc <2 x double> %x to <2 x float>
   ret <2 x float> %y
 }
 
-define <8 x float> @test3(<8 x double> %x) nounwind {
-; FIXME: It would be nice if this compiled down to a series of cvtpd2ps
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
+define <4 x float> @test3(<4 x double> %x) nounwind {
+; CHECK: test3
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
+; CHECK: ret
+; AVX:   test3
+; AVX:   vcvtpd2psy
+; AVX:   ret
+  %y = fptrunc <4 x double> %x to <4 x float>
+  ret <4 x float> %y
+}
+
+define <8 x float> @test4(<8 x double> %x) nounwind {
+; CHECK: test4
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
 ; CHECK: ret
+; AVX:   test4
+; AVX:   vcvtpd2psy
+; AVX:   vcvtpd2psy
+; AVX:   vinsertf128
+; AVX:   ret
   %y = fptrunc <8 x double> %x to <8 x float>
   ret <8 x float> %y
 }
diff --git a/test/CodeGen/X86/handle-move.ll b/test/CodeGen/X86/handle-move.ll
new file mode 100644
index 0000000000000..e9f7a962e20d1
--- /dev/null
+++ b/test/CodeGen/X86/handle-move.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-bottomup -verify-machineinstrs < %s
+; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-topdown -verify-machineinstrs < %s
+; REQUIRES: asserts
+;
+; Test the LiveIntervals::handleMove() function.
+;
+; Moving the DIV32r instruction exercises the regunit update code because
+; %EDX has a live range into the function and is used by the DIV32r.
+;
+; Here sinking a kill + dead def:
+; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def>, %EDX<imp-def,dead>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use>
+;       %vreg4: [48r,144r:0)  0@48r
+;         -->   [48r,180r:0)  0@48r
+;       DH:     [0B,16r:0)[128r,144r:2)[144r,144d:1)  0@0B-phi 1@144r 2@128r
+;         -->   [0B,16r:0)[128r,180r:2)[180r,180d:1)  0@0B-phi 1@180r 2@128r
+;       DL:     [0B,16r:0)[128r,144r:2)[144r,144d:1)  0@0B-phi 1@144r 2@128r
+;         -->   [0B,16r:0)[128r,180r:2)[180r,180d:1)  0@0B-phi 1@180r 2@128r
+;
+define i32 @f1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+  %y = add i32 %c, 1
+  %x = udiv i32 %b, %a
+  %add = add nsw i32 %y, %x
+  ret i32 %add
+}
+
+; Same as above, but moving a kill + live def:
+; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def,dead>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use>
+;       %vreg4: [48r,144r:0)  0@48r
+;         -->   [48r,180r:0)  0@48r
+;       DH:     [0B,16r:0)[128r,144r:2)[144r,184r:1)  0@0B-phi 1@144r 2@128r
+;         -->   [0B,16r:0)[128r,180r:2)[180r,184r:1)  0@0B-phi 1@180r 2@128r
+;       DL:     [0B,16r:0)[128r,144r:2)[144r,184r:1)  0@0B-phi 1@144r 2@128r
+;         -->   [0B,16r:0)[128r,180r:2)[180r,184r:1)  0@0B-phi 1@180r 2@128r
+;
+define i32 @f2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+  %y = sub i32 %c, %d
+  %x = urem i32 %b, %a
+  %add = add nsw i32 %x, %y
+  ret i32 %add
+}
+
+; Moving a use below the existing kill (%vreg5):
+; Moving a tied virtual register def (%vreg11):
+;
+; 96B -> 120B: %vreg11<def,tied1> = SUB32rr %vreg11<tied0>, %vreg5
+;       %vreg11:        [80r,96r:1)[96r,144r:0)  0@96r 1@80r
+;            -->        [80r,120r:1)[120r,144r:0)  0@120r 1@80r
+;       %vreg5:         [16r,112r:0)  0@16r
+;            -->        [16r,120r:0)  0@16r
+;
+define i32 @f3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+  %y = sub i32 %a, %b
+  %x = add i32 %a, %b
+  %r = mul i32 %x, %y
+  ret i32 %r
+}
+
+; Move EFLAGS dead def across another def:
+; handleMove 208B -> 36B: %EDX<def> = MOV32r0 %EFLAGS<imp-def,dead>
+;    EFLAGS:    [20r,20d:4)[160r,160d:3)[208r,208d:0)[224r,224d:1)[272r,272d:2)[304r,304d:5)  0@208r 1@224r 2@272r 3@160r 4@20r 5@304r
+;         -->   [20r,20d:4)[36r,36d:0)[160r,160d:3)[224r,224d:1)[272r,272d:2)[304r,304d:5)  0@36r 1@224r 2@272r 3@160r 4@20r 5@304r
+;
+define i32 @f4(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+  %x = sub i32 %a, %b
+  %y = sub i32 %b, %c
+  %z = sub i32 %c, %d
+  %r1 = udiv i32 %x, %y
+  %r2 = mul i32 %z, %r1
+  ret i32 %r2
+}
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 91576fb09ec29..597236e36281e 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -19,3 +19,12 @@ entry:
 	%1 = load i64* %retval		; <i64> [#uses=1]
 	ret i64 %1
 }
+
+; The tied operands are not necessarily in the same order as the defs.
+; PR13742
+define i64 @swapped(i64 %x, i64 %y) nounwind {
+entry:
+	%x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+        %x1 = extractvalue { i64, i64 } %x0, 0
+        ret i64 %x1
+}
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index e6eb9efd8c781..d201ebdc85d13 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -52,3 +52,10 @@ entry:
   %0 = call { i32, i32, i32, i32, i32 } asm sideeffect "", "=&r,=&r,=&r,=&r,=&q,r,~{ecx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %h) nounwind
   ret void
 }
+
+; Mix normal and EC defs of the same register.
+define i32 @pr14376() nounwind noinline {
+entry:
+  %asm = tail call i32 asm sideeffect "", "={ax},i,~{eax},~{flags},~{rax}"(i64 61) nounwind
+  ret i32 %asm
+}
diff --git a/test/CodeGen/X86/inlineasm-sched-bug.ll b/test/CodeGen/X86/inlineasm-sched-bug.ll
new file mode 100644
index 0000000000000..08de0c02d2936
--- /dev/null
+++ b/test/CodeGen/X86/inlineasm-sched-bug.ll
@@ -0,0 +1,13 @@
+; PR13504
+; RUN: llc -march=x86 -mcpu=atom <%s | FileCheck %s
+; CHECK: bsfl
+; CHECK-NOT: movl
+
+define i32 @foo(i32 %treemap) nounwind uwtable {
+entry:
+  %sub = sub i32 0, %treemap
+  %and = and i32 %treemap, %sub
+  %0 = tail call i32 asm "bsfl $1,$0\0A\09", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %and) nounwind
+  ret i32 %0
+}
+
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index 48e21061d2099..0e34222b945f2 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=pentiumpro -verify-machineinstrs | FileCheck %s
 
 define i32 @f(i32 %X) {
 entry:
@@ -219,7 +219,6 @@ entry:
 ; by sbb, we should not optimize cmp away.
 define i32 @q(i32 %j.4, i32 %w, i32 %el) {
 ; CHECK: q:
-; CHECK: sub
 ; CHECK: cmp
 ; CHECK-NEXT: sbb
   %tmp532 = add i32 %j.4, %w
@@ -253,3 +252,56 @@ return:
   %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
   ret i8* %retval.0
 }
+
+; Test optimizations of dec/inc.
+define i32 @dec(i32 %a) nounwind {
+entry:
+; CHECK: dec:
+; CHECK: decl
+; CHECK-NOT: test
+; CHECK: cmovsl
+  %sub = sub nsw i32 %a, 1
+  %cmp = icmp sgt i32 %sub, 0
+  %cond = select i1 %cmp, i32 %sub, i32 0
+  ret i32 %cond
+}
+
+define i32 @inc(i32 %a) nounwind {
+entry:
+; CHECK: inc:
+; CHECK: incl
+; CHECK-NOT: test
+; CHECK: cmovsl
+  %add = add nsw i32 %a, 1
+  %cmp = icmp sgt i32 %add, 0
+  %cond = select i1 %cmp, i32 %add, i32 0
+  ret i32 %cond
+}
+
+; PR13966
+@b = common global i32 0, align 4
+@a = common global i32 0, align 4
+define i32 @test1(i32 %p1) nounwind uwtable {
+entry:
+; CHECK: test1:
+; CHECK: testb
+; CHECK: j
+; CHECK: ret
+  %0 = load i32* @b, align 4
+  %cmp = icmp ult i32 %0, %p1
+  %conv = zext i1 %cmp to i32
+  %1 = load i32* @a, align 4
+  %and = and i32 %conv, %1
+  %conv1 = trunc i32 %and to i8
+  %2 = urem i8 %conv1, 3
+  %tobool = icmp eq i8 %2, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  %dec = add nsw i32 %1, -1
+  store i32 %dec, i32* @a, align 4
+  br label %if.end
+
+if.end:
+  ret i32 undef
+}
diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll
new file mode 100644
index 0000000000000..2184d9e960363
--- /dev/null
+++ b/test/CodeGen/X86/misched-balance.ll
@@ -0,0 +1,230 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN:          -verify-machineinstrs | FileCheck %s
+;
+; Verify that misched resource/latency balance heuristics are sane.
+
+define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
+  i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
+ i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
+  nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+; imull folded loads should be in order and interleaved with addl, never
+; adjacent. Also check that we have no spilling.
+;
+; Since mmult1 IR is already in good order, this effectively ensures
+; the scheduler maintains source order.
+;
+; CHECK: %for.body
+; CHECK-NOT: %rsp
+; CHECK: imull 4
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 8
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 12
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 16
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 20
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 24
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 28
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 32
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 36
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: %end
+for.body:
+  %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
+  %tmp57 = load i32* %tmp56, align 4
+  %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
+  %tmp58 = load i32* %arrayidx12.us.i61, align 4
+  %mul.us.i = mul nsw i32 %tmp58, %tmp57
+  %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
+  %tmp59 = load i32* %arrayidx8.us.i.1, align 4
+  %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
+  %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
+  %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
+  %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
+  %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
+  %tmp61 = load i32* %arrayidx8.us.i.2, align 4
+  %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
+  %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
+  %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
+  %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
+  %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
+  %tmp63 = load i32* %arrayidx8.us.i.3, align 4
+  %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
+  %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
+  %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
+  %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
+  %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
+  %tmp65 = load i32* %arrayidx8.us.i.4, align 4
+  %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
+  %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
+  %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
+  %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
+  %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
+  %tmp67 = load i32* %arrayidx8.us.i.5, align 4
+  %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
+  %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
+  %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
+  %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
+  %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
+  %tmp69 = load i32* %arrayidx8.us.i.6, align 4
+  %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
+  %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
+  %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
+  %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
+  %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
+  %tmp71 = load i32* %arrayidx8.us.i.7, align 4
+  %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
+  %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
+  %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
+  %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
+  %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
+  %tmp73 = load i32* %arrayidx8.us.i.8, align 4
+  %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
+  %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
+  %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
+  %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
+  %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
+  %tmp75 = load i32* %arrayidx8.us.i.9, align 4
+  %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
+  %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
+  %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
+  %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
+  %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
+  store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
+  %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10
+  br i1 %exitcond, label %end, label %for.body
+
+end:
+  ret void
+}
+
+; Unlike the above loop, this IR starts out bad and must be
+; rescheduled.
+;
+; CHECK: %for.body
+; CHECK-NOT: %rsp
+; CHECK: imull 4
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 8
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 12
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 16
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 20
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 24
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 28
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 32
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 36
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: %end
+define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
+  i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
+  i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
+  nounwind uwtable ssp {
+entry:
+  br label %for.body
+for.body:
+  %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
+  %tmp57 = load i32* %tmp56, align 4
+  %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
+  %tmp58 = load i32* %arrayidx12.us.i61, align 4
+  %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
+  %tmp59 = load i32* %arrayidx8.us.i.1, align 4
+  %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
+  %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
+  %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
+  %tmp61 = load i32* %arrayidx8.us.i.2, align 4
+  %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
+  %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
+  %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
+  %tmp63 = load i32* %arrayidx8.us.i.3, align 4
+  %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
+  %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
+  %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
+  %tmp65 = load i32* %arrayidx8.us.i.4, align 4
+  %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
+  %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
+  %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
+  %tmp67 = load i32* %arrayidx8.us.i.5, align 4
+  %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
+  %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
+  %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
+  %tmp69 = load i32* %arrayidx8.us.i.6, align 4
+  %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
+  %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
+  %mul.us.i = mul nsw i32 %tmp58, %tmp57
+  %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
+  %tmp71 = load i32* %arrayidx8.us.i.7, align 4
+  %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
+  %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
+  %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
+  %tmp73 = load i32* %arrayidx8.us.i.8, align 4
+  %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
+  %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
+  %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
+  %tmp75 = load i32* %arrayidx8.us.i.9, align 4
+  %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
+  %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
+  %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
+  %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
+  %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
+  %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
+  %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
+  %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
+  %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
+  %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
+  %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
+  %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
+  %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
+  %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
+  %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
+  %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
+  %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
+  %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
+  %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
+  %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
+  %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
+  store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
+  %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10
+  br i1 %exitcond, label %end, label %for.body
+
+end:
+  ret void
+}
diff --git a/test/CodeGen/X86/misched-ilp.ll b/test/CodeGen/X86/misched-ilp.ll
new file mode 100644
index 0000000000000..c6cedb7be8714
--- /dev/null
+++ b/test/CodeGen/X86/misched-ilp.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=core2 -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=core2 -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s
+;
+; Basic verification of the ScheduleDAGILP metric.
+;
+; MAX: addss
+; MAX: addss
+; MAX: addss
+; MAX: subss
+; MAX: addss
+;
+; MIN: addss
+; MIN: addss
+; MIN: subss
+; MIN: addss
+; MIN: addss
+define float @ilpsched(float %a, float %b, float %c, float %d, float %e, float %f) nounwind uwtable readnone ssp {
+entry:
+  %add = fadd float %a, %b
+  %add1 = fadd float %c, %d
+  %add2 = fadd float %e, %f
+  %add3 = fsub float %add1, %add2
+  %add4 = fadd float %add, %add3
+  ret float %add4
+}
diff --git a/test/CodeGen/X86/misched-new.ll b/test/CodeGen/X86/misched-new.ll
index 8f2f6f7697dfa..cec04b534fba3 100644
--- a/test/CodeGen/X86/misched-new.ll
+++ b/test/CodeGen/X86/misched-new.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=x86-64 -mcpu=core2 -enable-misched -misched=shuffle -misched-bottomup < %s
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
+; RUN:          -misched=shuffle -misched-bottomup -verify-machineinstrs \
+; RUN:     | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Interesting MachineScheduler cases.
@@ -25,3 +27,27 @@ for.cond.preheader:                               ; preds = %entry
 if.end:                                           ; preds = %entry
   ret void
 }
+
+; The machine verifier checks that EFLAGS kill flags are updated when
+; the scheduler reorders cmovel instructions.
+;
+; CHECK: test
+; CHECK: cmovel
+; CHECK: cmovel
+; CHECK: call
+define void @foo(i32 %b) nounwind uwtable ssp {
+entry:
+  %tobool = icmp ne i32 %b, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %v1 = phi i32 [1, %entry], [2, %if.then]
+  %v2 = phi i32 [3, %entry], [4, %if.then]
+  call void @bar(i32 %v1, i32 %v2)
+  ret void
+}
+
+declare void @bar(i32,i32)
diff --git a/test/CodeGen/X86/mmx-builtins.ll b/test/CodeGen/X86/mmx-builtins.ll
index 8b7200d2f78fd..a8d33f43da013 100644
--- a/test/CodeGen/X86/mmx-builtins.ll
+++ b/test/CodeGen/X86/mmx-builtins.ll
@@ -1043,6 +1043,20 @@ entry:
   ret i64 %5
 }
 
+define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: test21_2
+; CHECK: pshufw
+; CHECK: movd
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <2 x i32>
+  %5 = extractelement <2 x i32> %4, i32 0
+  ret i32 %5
+}
+
 declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
 
 define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
new file mode 100644
index 0000000000000..24d28adda8949
--- /dev/null
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+
+define i32 @t1() nounwind {
+entry:
+  %0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0A\09mov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+  ret i32 %0
+; CHECK: t1
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, ecx
+; CHECK: mov ecx, eax
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+define void @t2() nounwind {
+entry:
+  call void asm sideeffect inteldialect "mov eax, $$1", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
+  ret void
+; CHECK: t2
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, 1
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+define void @t3(i32 %V) nounwind {
+entry:
+  %V.addr = alloca i32, align 4
+  store i32 %V, i32* %V.addr, align 4
+  call void asm sideeffect inteldialect "mov eax, DWORD PTR [$0]", "*m,~{eax},~{dirflag},~{fpsr},~{flags}"(i32* %V.addr) nounwind
+  ret void
+; CHECK: t3
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, DWORD PTR {{[[esp]}}
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+%struct.t18_type = type { i32, i32 }
+
+define i32 @t18() nounwind {
+entry:
+  %foo = alloca %struct.t18_type, align 4
+  %a = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 0
+  store i32 1, i32* %a, align 4
+  %b = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1
+  store i32 2, i32* %b, align 4
+  call void asm sideeffect inteldialect "lea ebx, foo\0A\09mov eax, [ebx].0\0A\09mov [ebx].4, ecx", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
+  %b1 = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1
+  %0 = load i32* %b1, align 4
+  ret i32 %0
+; CHECK: t18
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: lea ebx, foo
+; CHECK: mov eax, [ebx].0
+; CHECK: mov [ebx].4, ecx
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
diff --git a/test/CodeGen/X86/mulx32.ll b/test/CodeGen/X86/mulx32.ll
new file mode 100644
index 0000000000000..b75ac009e76d4
--- /dev/null
+++ b/test/CodeGen/X86/mulx32.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=core-avx2 -march=x86 < %s | FileCheck %s
+
+define i64 @f1(i32 %a, i32 %b) {
+  %x = zext i32 %a to i64
+  %y = zext i32 %b to i64
+  %r = mul i64 %x, %y
+; CHECK: f1
+; CHECK: mulxl
+; CHECK: ret
+  ret i64 %r
+}
+
+define i64 @f2(i32 %a, i32* %p) {
+  %b = load i32* %p
+  %x = zext i32 %a to i64
+  %y = zext i32 %b to i64
+  %r = mul i64 %x, %y
+; CHECK: f2
+; CHECK: mulxl ({{.+}}), %{{.+}}, %{{.+}}
+; CHECK: ret
+  ret i64 %r
+}
diff --git a/test/CodeGen/X86/mulx64.ll b/test/CodeGen/X86/mulx64.ll
new file mode 100644
index 0000000000000..d5730282a1378
--- /dev/null
+++ b/test/CodeGen/X86/mulx64.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=core-avx2 -march=x86-64 < %s | FileCheck %s
+
+define i128 @f1(i64 %a, i64 %b) {
+  %x = zext i64 %a to i128
+  %y = zext i64 %b to i128
+  %r = mul i128 %x, %y
+; CHECK: f1
+; CHECK: mulxq
+; CHECK: ret
+  ret i128 %r
+}
+
+define i128 @f2(i64 %a, i64* %p) {
+  %b = load i64* %p
+  %x = zext i64 %a to i128
+  %y = zext i64 %b to i128
+  %r = mul i128 %x, %y
+; CHECK: f2
+; CHECK: mulxq ({{.+}}), %{{.+}}, %{{.+}}
+; CHECK: ret
+  ret i128 %r
+}
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 984d7e57e0c62..2a20e7ad6f15c 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,14 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
 ; rdar://5571034
 
 ; This requires physreg joining, %vreg13 is live everywhere:
 ; 304L		%CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
 ; 320L		%vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
 ; 336L		%vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
-;
-; This test is XFAIL until the register allocator understands trivial physreg
-; interference. <rdar://9802098>
 
 define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
 ; CHECK: foo:
diff --git a/test/CodeGen/X86/pic_jumptable.ll b/test/CodeGen/X86/pic_jumptable.ll
index 8c16dc68b291d..bdd8859358426 100644
--- a/test/CodeGen/X86/pic_jumptable.ll
+++ b/test/CodeGen/X86/pic_jumptable.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -relocation-model=pic -mtriple=i386-linux-gnu -asm-verbose=false \
 ; RUN:   | FileCheck %s --check-prefix=CHECK-LINUX
+; RUN: llc < %s -relocation-model=pic -mark-data-regions -mtriple=i686-apple-darwin -asm-verbose=false \
+; RUN:   | FileCheck %s --check-prefix=CHECK-DATA
 ; RUN: llc < %s -relocation-model=pic -mtriple=i686-apple-darwin -asm-verbose=false \
 ; RUN:   | FileCheck %s
 ; RUN: llc < %s                       -mtriple=x86_64-apple-darwin | not grep 'lJTI'
@@ -16,6 +18,16 @@ entry:
 ; CHECK:       Ltmp0 = LJTI0_0-L0$pb
 ; CHECK-NEXT:  addl Ltmp0(%eax,%ecx,4)
 ; CHECK-NEXT:  jmpl *%eax
+
+;; When data-in-code markers are enabled, we should see them around the jump
+;; table.
+; CHECK-DATA: .data_region jt32
+; CHECK-DATA: LJTI0_0
+; CHECK-DATA: .end_data_region
+
+;; When they're not enabled, make sure we don't see them at all.
+; CHECK-NOT: .data_region
+; CHECK-LINUX-NOT: .data_region
 	%Y_addr = alloca i32		; <i32*> [#uses=2]
 	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
 	store i32 %Y, i32* %Y_addr
diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll
new file mode 100644
index 0000000000000..16e9c28fcdefe
--- /dev/null
+++ b/test/CodeGen/X86/pmovext.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; rdar://11897677
+
+;CHECK: intrin_pmov
+;CHECK: pmovzxbw  (%{{.*}}), %xmm0
+;CHECK-NEXT: movdqu
+;CHECK-NEXT: ret
+define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable ssp {
+  %1 = bitcast i8* %src to <2 x i64>*
+  %2 = load <2 x i64>* %1, align 16
+  %3 = bitcast <2 x i64> %2 to <16 x i8>
+  %4 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %3) nounwind
+  %5 = bitcast i16* %dest to i8*
+  %6 = bitcast <8 x i16> %4 to <16 x i8>
+  tail call void @llvm.x86.sse2.storeu.dq(i8* %5, <16 x i8> %6) nounwind
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index 800fbedb4f99a..58423d1959644 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -81,8 +81,7 @@ define <4 x i32*> @INT2PTR1(<4 x i8>* %p) nounwind {
 entry:
   %G = load <4 x i8>* %p
 ;CHECK: movl
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxbd
 ;CHECK: pand
   %K = inttoptr <4 x i8> %G to <4 x i32*>
 ;CHECK: ret
@@ -105,7 +104,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind {
 entry:
   %G = load <2 x i8*>* %p
 ;CHECK: movl
-;CHECK: movsd
+;CHECK: pmovzxdq
   %T = bitcast <2 x i8*> %G to <2 x i32*>
 ;CHECK: ret
   ret <2 x i32*> %T
diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
index 5b7b5eab87ece..e7e29e0d609cf 100644
--- a/test/CodeGen/X86/pr11334.ll
+++ b/test/CodeGen/X86/pr11334.ll
@@ -54,3 +54,11 @@ entry:
   %f1 = fpext <8 x float> %v1 to <8 x double>
   ret <8 x double> %f1
 }
+
+define void @test_vector_creation() nounwind {
+  %1 = insertelement <4 x double> undef, double 0.000000e+00, i32 2
+  %2 = load double addrspace(1)* null
+  %3 = insertelement <4 x double> %1, double %2, i32 3
+  store <4 x double> %3, <4 x double>* undef
+  ret void
+}
diff --git a/test/CodeGen/X86/pr11985.ll b/test/CodeGen/X86/pr11985.ll
new file mode 100644
index 0000000000000..fa378502f724a
--- /dev/null
+++ b/test/CodeGen/X86/pr11985.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=prescott | FileCheck %s
+
+define float @foo(i8* nocapture %buf, float %a, float %b) nounwind uwtable {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %buf, i8* blockaddress(@foo, %out), i64 22, i32 1, i1 false)
+  br label %out
+
+out:                                              ; preds = %entry
+  %add = fadd float %a, %b
+  ret float %add
+; CHECK: foo
+; CHECK: movw .L{{.*}}+20(%rip), %{{.*}}
+; CHECK: movl .L{{.*}}+16(%rip), %{{.*}}
+; CHECK: movq .L{{.*}}+8(%rip), %{{.*}}
+; CHECK: movq .L{{.*}}(%rip), %{{.*}}
+; CHECK: ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
new file mode 100644
index 0000000000000..087b8d7539ec8
--- /dev/null
+++ b/test/CodeGen/X86/pr12312.ll
@@ -0,0 +1,155 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
+
+define i32 @veccond128(<4 x i32> %input) {
+entry:
+  %0 = bitcast <4 x i32> %input to i128
+  %1 = icmp ne i128 %0, 0
+  br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block:                                    ; preds = %entry
+  ret i32 0
+endif-block:                                      ; preds = %entry,
+  ret i32 1
+; SSE41: veccond128
+; SSE41: ptest
+; SSE41: ret
+; AVX:   veccond128
+; AVX:   vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX:   ret
+}
+
+define i32 @veccond256(<8 x i32> %input) {
+entry:
+  %0 = bitcast <8 x i32> %input to i256
+  %1 = icmp ne i256 %0, 0
+  br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block:                                    ; preds = %entry
+  ret i32 0
+endif-block:                                      ; preds = %entry,
+  ret i32 1
+; SSE41: veccond256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX:   veccond256
+; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX:   ret
+}
+
+define i32 @veccond512(<16 x i32> %input) {
+entry:
+  %0 = bitcast <16 x i32> %input to i512
+  %1 = icmp ne i512 %0, 0
+  br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block:                                    ; preds = %entry
+  ret i32 0
+endif-block:                                      ; preds = %entry,
+  ret i32 1
+; SSE41: veccond512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX:   veccond512
+; AVX:   vorps
+; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX:   ret
+}
+
+define i32 @vectest128(<4 x i32> %input) {
+entry:
+  %0 = bitcast <4 x i32> %input to i128
+  %1 = icmp ne i128 %0, 0
+  %2 = zext i1 %1 to i32
+  ret i32 %2
+; SSE41: vectest128
+; SSE41: ptest
+; SSE41: ret
+; AVX:   vectest128
+; AVX:   vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX:   ret
+}
+
+define i32 @vectest256(<8 x i32> %input) {
+entry:
+  %0 = bitcast <8 x i32> %input to i256
+  %1 = icmp ne i256 %0, 0
+  %2 = zext i1 %1 to i32
+  ret i32 %2
+; SSE41: vectest256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX:   vectest256
+; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX:   ret
+}
+
+define i32 @vectest512(<16 x i32> %input) {
+entry:
+  %0 = bitcast <16 x i32> %input to i512
+  %1 = icmp ne i512 %0, 0
+  %2 = zext i1 %1 to i32
+  ret i32 %2
+; SSE41: vectest512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX:   vectest512
+; AVX:   vorps
+; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX:   ret
+}
+
+define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
+entry:
+  %0 = bitcast <4 x i32> %input to i128
+  %1 = icmp ne i128 %0, 0
+  %2 = select i1 %1, i32 %a, i32 %b
+  ret i32 %2
+; SSE41: vecsel128
+; SSE41: ptest
+; SSE41: ret
+; AVX:   vecsel128
+; AVX:   vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX:   ret
+}
+
+define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
+entry:
+  %0 = bitcast <8 x i32> %input to i256
+  %1 = icmp ne i256 %0, 0
+  %2 = select i1 %1, i32 %a, i32 %b
+  ret i32 %2
+; SSE41: vecsel256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX:   vecsel256
+; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX:   ret
+}
+
+define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
+entry:
+  %0 = bitcast <16 x i32> %input to i512
+  %1 = icmp ne i512 %0, 0
+  %2 = select i1 %1, i32 %a, i32 %b
+  ret i32 %2
+; SSE41: vecsel512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX:   vecsel512
+; AVX:   vorps
+; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX:   ret
+}
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
new file mode 100644
index 0000000000000..024b163fa718b
--- /dev/null
+++ b/test/CodeGen/X86/pr12359.ll
@@ -0,0 +1,10 @@
+; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
+define <16 x i8> @shuf(<16 x i8> %inval1) {
+entry:
+  %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
+  ret <16 x i8> %0
+; CHECK: shuf
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/pr13458.ll b/test/CodeGen/X86/pr13458.ll
new file mode 100644
index 0000000000000..55548b3c3b452
--- /dev/null
+++ b/test/CodeGen/X86/pr13458.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin11.4.2"
+
+%v8_uniform_Stats.0.2.4.10 = type { i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, [7 x i32], [7 x i64] }
+
+@globalStats = external global %v8_uniform_Stats.0.2.4.10
+
+define void @MergeStats() nounwind {
+allocas:
+  %r.i.i720 = atomicrmw max i64* getelementptr inbounds (%v8_uniform_Stats.0.2.4.10* @globalStats, i64 0, i32 30), i64 0 seq_cst
+  ret void
+}
diff --git a/test/CodeGen/X86/pr13859.ll b/test/CodeGen/X86/pr13859.ll
new file mode 100644
index 0000000000000..719721dfd87bb
--- /dev/null
+++ b/test/CodeGen/X86/pr13859.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.7.0"
+
+define void @_Z17FilterYUVRows_MMXi(i32 %af) nounwind ssp {
+entry:
+  %aMyAlloca = alloca i32, align 32
+  %dest = alloca <1 x i64>, align 32
+
+  %a32 = load i32* %aMyAlloca, align 4
+  %aconv = trunc i32 %a32 to i16
+  %a36 = insertelement <4 x i16> undef, i16 %aconv, i32 0
+  %a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1
+  %a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2
+  %a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3
+  %a40 = bitcast <4 x i16> %a39 to x86_mmx
+  %a41 = bitcast x86_mmx %a40 to <1 x i64>
+
+  %a47 = trunc i32 %a32 to i1
+  br i1 %a47, label %a48, label %a49
+
+a48:
+  unreachable
+
+a49:
+  store <1 x i64> %a41, <1 x i64>* %dest, align 8 ; !!!
+  ret void
+}
diff --git a/test/CodeGen/X86/pr13899.ll b/test/CodeGen/X86/pr13899.ll
new file mode 100644
index 0000000000000..bc81e34d67e30
--- /dev/null
+++ b/test/CodeGen/X86/pr13899.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s --check-prefix=X64
+
+; ModuleID = 'a.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
+target triple = "i386-pc-win32"
+
+%v4_varying_big_struct = type { [4 x <4 x i32>] }
+
+declare <4 x i32> @"foo"(%v4_varying_big_struct, <4 x i32>) nounwind
+
+define <4 x i32> @"bar"(%v4_varying_big_struct %s, <4 x i32> %__mask) nounwind {
+allocas:
+  %calltmp = call <4 x i32> @"foo"(%v4_varying_big_struct %s, <4 x i32> %__mask)
+  ret <4 x i32> %calltmp
+; CHECK: bar
+; CHECK: andl
+; CHECK: call
+; CHECK: ret
+}
+
+declare <8 x float> @bar64(<8 x float> %i0, <8 x float> %i1,
+                         <8 x float> %i2, <8 x float> %i3,
+                         <8 x float> %i4, <8 x float> %i5,
+                         <8 x float> %i6, <8 x float> %i7,
+                         <8 x float> %i8, <8 x float> %i9)
+
+define <8 x float> @foo64(<8 x float>* %p) {
+  %1 = load <8 x float>* %p
+  %idx1 = getelementptr inbounds <8 x float>* %p, i64 1
+  %2 = load <8 x float>* %idx1
+  %idx2 = getelementptr inbounds <8 x float>* %p, i64 2
+  %3 = load <8 x float>* %idx2
+  %idx3 = getelementptr inbounds <8 x float>* %p, i64 3
+  %4 = load <8 x float>* %idx3
+  %idx4 = getelementptr inbounds <8 x float>* %p, i64 4
+  %5 = load <8 x float>* %idx4
+  %idx5 = getelementptr inbounds <8 x float>* %p, i64 5
+  %6 = load <8 x float>* %idx5
+  %idx6 = getelementptr inbounds <8 x float>* %p, i64 6
+  %7 = load <8 x float>* %idx6
+  %idx7 = getelementptr inbounds <8 x float>* %p, i64 7
+  %8 = load <8 x float>* %idx7
+  %idx8 = getelementptr inbounds <8 x float>* %p, i64 8
+  %9 = load <8 x float>* %idx8
+  %idx9 = getelementptr inbounds <8 x float>* %p, i64 9
+  %10 = load <8 x float>* %idx9
+  %r = tail call <8 x float> @bar64(<8 x float> %1, <8 x float> %2,
+                                    <8 x float> %3, <8 x float> %4,
+                                    <8 x float> %5, <8 x float> %6,
+                                    <8 x float> %7, <8 x float> %8,
+                                    <8 x float> %9, <8 x float> %10)
+  ret <8 x float> %r
+; X64: foo
+; X64: and
+; X64: call
+; X64: ret
+}
diff --git a/test/CodeGen/X86/pr14088.ll b/test/CodeGen/X86/pr14088.ll
new file mode 100644
index 0000000000000..505e3b5cf262a
--- /dev/null
+++ b/test/CodeGen/X86/pr14088.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple x86_64-linux -mcpu core2 -verify-machineinstrs %s -o - | FileCheck %s
+define i32 @f(i1 %foo, i16* %tm_year2, i8* %bar, i16 %zed, i32 %zed2) {
+entry:
+  br i1 %foo, label %return, label %if.end
+
+if.end:
+  %rem = srem i32 %zed2, 100
+  %conv3 = trunc i32 %rem to i16
+  store i16 %conv3, i16* %tm_year2
+  %sext = shl i32 %rem, 16
+  %conv5 = ashr exact i32 %sext, 16
+  %div = sdiv i32 %conv5, 10
+  %conv6 = trunc i32 %div to i8
+  store i8 %conv6, i8* %bar
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 0, %if.end ], [ -1, %entry ]
+  ret i32 %retval.0
+}
+
+; We were miscompiling this and using %ax instead of %cx in the movw.
+; CHECK: movswl	%cx, %ecx
+; CHECK: movw	%cx, (%rsi)
+; CHECK: movslq	%ecx, %rcx
diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll
new file mode 100644
index 0000000000000..d76b912fd8e2b
--- /dev/null
+++ b/test/CodeGen/X86/pr14090.ll
@@ -0,0 +1,76 @@
+; RUN: llc < %s -march=x86-64 -print-before=stack-coloring -print-after=stack-coloring >%t 2>&1 && FileCheck <%t %s
+
+define void @foo(i64* %retval.i, i32 %call, i32* %.ph.i80, i32 %fourteen, i32* %out.lo, i32* %out.hi) nounwind align 2 {
+entry:
+  %_Tmp.i39 = alloca i64, align 8
+  %retval.i33 = alloca i64, align 8
+  %_Tmp.i = alloca i64, align 8
+  %retval.i.i = alloca i64, align 8
+  %_First.i = alloca i64, align 8
+
+  %0 = load i64* %retval.i, align 8
+
+  %1 = load i64* %retval.i, align 8
+
+  %_Tmp.i39.0.cast73 = bitcast i64* %_Tmp.i39 to i8*
+  call void @llvm.lifetime.start(i64 8, i8* %_Tmp.i39.0.cast73)
+  store i64 %1, i64* %_Tmp.i39, align 8
+  %cmp.i.i.i40 = icmp slt i32 %call, 0
+  %2 = lshr i64 %1, 32
+  %3 = trunc i64 %2 to i32
+  %sub.i.i.i44 = sub i32 0, %call
+  %cmp2.i.i.i45 = icmp ult i32 %3, %sub.i.i.i44
+  %or.cond.i.i.i46 = and i1 %cmp.i.i.i40, %cmp2.i.i.i45
+  %add.i.i.i47 = add i32 %3, %call
+  %sub5.i.i.i48 = lshr i32 %add.i.i.i47, 5
+  %trunc.i50 = trunc i64 %1 to i32
+  %inttoptr.i51 = inttoptr i32 %trunc.i50 to i32*
+  %add61617.i.i.i52 = or i32 %sub5.i.i.i48, -134217728
+  %add61617.i.sub5.i.i.i53 = select i1 %or.cond.i.i.i46, i32 %add61617.i.i.i52, i32 %sub5.i.i.i48
+  %storemerge2.i.i54 = getelementptr inbounds i32* %inttoptr.i51, i32 %add61617.i.sub5.i.i.i53
+  %_Tmp.i39.0.cast74 = bitcast i64* %_Tmp.i39 to i32**
+  store i32* %storemerge2.i.i54, i32** %_Tmp.i39.0.cast74, align 8
+  %storemerge.i.i55 = and i32 %add.i.i.i47, 31
+  %_Tmp.i39.4.raw_idx = getelementptr inbounds i8* %_Tmp.i39.0.cast73, i32 4
+  %_Tmp.i39.4.cast = bitcast i8* %_Tmp.i39.4.raw_idx to i32*
+  store i32 %storemerge.i.i55, i32* %_Tmp.i39.4.cast, align 4
+  %srcval.i56 = load i64* %_Tmp.i39, align 8
+  call void @llvm.lifetime.end(i64 8, i8* %_Tmp.i39.0.cast73)
+
+; CHECK: Before Merge disjoint stack slots
+; CHECK: [[PREFIX15:MOV64mr.*<fi#]]{{[0-9]}}[[SUFFIX15:.*;]] mem:ST8[%fifteen]
+; CHECK: [[PREFIX87:MOV32mr.*;]] mem:ST4[%sunkaddr87]
+
+; CHECK: After Merge disjoint stack slots
+; CHECK: [[PREFIX15]]{{[0-9]}}[[SUFFIX15]] mem:ST8[%_Tmp.i39]
+; CHECK: [[PREFIX87]] mem:ST4[<unknown>]
+
+  %fifteen = bitcast i64* %retval.i.i to i32**
+  %sixteen = bitcast i64* %retval.i.i to i8*
+  call void @llvm.lifetime.start(i64 8, i8* %sixteen)
+  store i32* %.ph.i80, i32** %fifteen, align 8, !tbaa !0
+  %sunkaddr = ptrtoint i64* %retval.i.i to i32
+  %sunkaddr86 = add i32 %sunkaddr, 4
+  %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32*
+  store i32 %fourteen, i32* %sunkaddr87, align 4, !tbaa !3
+  %seventeen = load i64* %retval.i.i, align 8
+  call void @llvm.lifetime.end(i64 8, i8* %sixteen)
+  %eighteen = lshr i64 %seventeen, 32
+  %nineteen = trunc i64 %eighteen to i32
+  %shl.i.i.i = shl i32 1, %nineteen
+
+  store i32 %shl.i.i.i, i32* %out.lo, align 8
+  store i32 %nineteen, i32* %out.hi, align 8
+
+  ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
+!4 = metadata !{metadata !"vtable pointer", metadata !2}
diff --git a/test/CodeGen/X86/pr14098.ll b/test/CodeGen/X86/pr14098.ll
new file mode 100644
index 0000000000000..6ce2449ab6a63
--- /dev/null
+++ b/test/CodeGen/X86/pr14098.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple i386-unknown-linux-gnu -relocation-model=pic -verify-machineinstrs < %s
+; We used to crash on this.
+
+declare void @foo()
+declare void @foo3(i1 %x)
+define void @bar(i1 %a1, i16 %a2) nounwind align 2 {
+bb0:
+  %a3 = trunc i16 %a2 to i8
+  %a4 = lshr i16 %a2, 8
+  %a5 = trunc i16 %a4 to i8
+  br i1 %a1, label %bb1, label %bb2
+bb1:
+  br label %bb2
+bb2:
+  %a6 = phi i8 [ 3, %bb0 ], [ %a5, %bb1 ]
+  %a7 = phi i8 [ 9, %bb0 ], [ %a3, %bb1 ]
+  %a8 = icmp eq i8 %a6, 1
+  call void @foo()
+  %a9 = icmp eq i8 %a7, 0
+  call void @foo3(i1 %a9)
+  call void @foo3(i1 %a8)
+  ret void
+}
diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll
new file mode 100644
index 0000000000000..ff4532eac3ac5
--- /dev/null
+++ b/test/CodeGen/X86/pr14161.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=corei7 | FileCheck %s
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>)
+
+define <2 x i16> @good(<4 x i32>*, <4 x i8>*) {
+entry:
+  %2 = load <4 x i32>* %0, align 16
+  %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+  %4 = extractelement <4 x i32> %3, i32 0
+  %5 = extractelement <4 x i32> %3, i32 1
+  %6 = extractelement <4 x i32> %3, i32 2
+  %7 = extractelement <4 x i32> %3, i32 3
+  %8 = bitcast i32 %4 to <2 x i16>
+  %9 = bitcast i32 %5 to <2 x i16>
+  ret <2 x i16> %8
+; CHECK: good
+; CHECK: pminud
+; CHECK-NEXT: pmovzxwq
+; CHECK: ret
+}
+
+define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) {
+entry:
+  %2 = load <4 x i32>* %0, align 16
+  %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+  %4 = extractelement <4 x i32> %3, i32 0
+  %5 = extractelement <4 x i32> %3, i32 1
+  %6 = extractelement <4 x i32> %3, i32 2
+  %7 = extractelement <4 x i32> %3, i32 3
+  %8 = bitcast i32 %4 to <2 x i16>
+  %9 = bitcast i32 %5 to <2 x i16>
+  ret <2 x i16> %9
+; CHECK: bad
+; CHECK: pminud
+; CHECK: pextrd
+; CHECK: pmovzxwq
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr14204.ll b/test/CodeGen/X86/pr14204.ll
new file mode 100644
index 0000000000000..42e362bf3b9b4
--- /dev/null
+++ b/test/CodeGen/X86/pr14204.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=core-avx2 | FileCheck %s
+
+; FIXME: vpmovsxwd should be generated instead of vpmovzxwd followed by
+; SLL/SRA.
+
+define <8 x i32> @foo(<8 x i1> %bar) nounwind readnone {
+entry:
+  %s = sext <8 x i1> %bar to <8 x i32>
+  ret <8 x i32> %s
+; CHECK: foo
+; CHECK: vpmovzxwd
+; CHECK: vpslld
+; CHECK: vpsrad
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr14314.ll b/test/CodeGen/X86/pr14314.ll
new file mode 100644
index 0000000000000..5388a4b01b656
--- /dev/null
+++ b/test/CodeGen/X86/pr14314.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 | FileCheck %s
+
+define i64 @atomicSub(i64* %a, i64 %b) nounwind {
+entry:
+  %0 = atomicrmw sub i64* %a, i64 %b seq_cst
+  ret i64 %0
+; CHECK: atomicSub
+; movl %eax, %ebx
+; subl {{%[a-z]+}}, %ebx
+; movl %edx, %ecx
+; sbbl {{%[a-z]+}}, %ecx
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr14333.ll b/test/CodeGen/X86/pr14333.ll
new file mode 100644
index 0000000000000..86c12ef6b5472
--- /dev/null
+++ b/test/CodeGen/X86/pr14333.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s
+%foo = type { i64, i64 }
+define void @bar(%foo* %zed) {
+  %tmp = getelementptr inbounds %foo* %zed, i64 0, i32 0
+  store i64 0, i64* %tmp, align 8
+  %tmp2 = getelementptr inbounds %foo* %zed, i64 0, i32 1
+  store i64 0, i64* %tmp2, align 8
+  %tmp3 = bitcast %foo* %zed to i8*
+  call void @llvm.memset.p0i8.i64(i8* %tmp3, i8 0, i64 16, i32 8, i1 false)
+  ret void
+}
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
new file mode 100644
index 0000000000000..d048db8a850d5
--- /dev/null
+++ b/test/CodeGen/X86/pr5145.ll
@@ -0,0 +1,35 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s
+@sc8 = external global i8
+
+define void @atomic_maxmin_i8() {
+; CHECK: atomic_maxmin_i8
+  %1 = atomicrmw max  i8* @sc8, i8 5 acquire
+; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovl
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL1]]
+  %2 = atomicrmw min  i8* @sc8, i8 6 acquire
+; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovg
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL3]]
+  %3 = atomicrmw umax i8* @sc8, i8 7 acquire
+; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovb
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL5]]
+  %4 = atomicrmw umin i8* @sc8, i8 8 acquire
+; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmova
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL7]]
+  ret void
+}
diff --git a/test/CodeGen/X86/promote.ll b/test/CodeGen/X86/promote.ll
index 8b30dc718b088..283f48cd37b49 100644
--- a/test/CodeGen/X86/promote.ll
+++ b/test/CodeGen/X86/promote.ll
@@ -20,7 +20,7 @@ entry:
 ; CHECK: shuff_f
 define i32 @shuff_f(<4 x i8>* %A) {
 entry:
-; CHECK: pshufb
+; CHECK: pmovzxbd
 ; CHECK: paddd
 ; CHECK: pshufb
   %0 = load <4 x i8>* %A, align 8
diff --git a/test/CodeGen/X86/ptr-rotate.ll b/test/CodeGen/X86/ptr-rotate.ll
index 6debd16ba5dde..fbd13b5036447 100644
--- a/test/CodeGen/X86/ptr-rotate.ll
+++ b/test/CodeGen/X86/ptr-rotate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-apple-darwin -o - < %s | FileCheck %s
+; RUN: llc -mtriple=i386-apple-darwin -mcpu=corei7 -o - < %s | FileCheck %s
 
 define i32 @func(i8* %A) nounwind readnone {
 entry:
diff --git a/test/CodeGen/X86/red-zone2.ll b/test/CodeGen/X86/red-zone2.ll
index f09216319e8da..3e9c7909a366e 100644
--- a/test/CodeGen/X86/red-zone2.ll
+++ b/test/CodeGen/X86/red-zone2.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 > %t
-; RUN: grep subq %t | count 1
-; RUN: grep addq %t | count 1
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; CHECK: f0:
+; CHECK: subq
+; CHECK: addq
 
 define x86_fp80 @f0(float %f) nounwind readnone noredzone {
 entry:
diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll
index 99602fd64ff59..e95a734e048d8 100644
--- a/test/CodeGen/X86/rot32.ll
+++ b/test/CodeGen/X86/rot32.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
 
 define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
@@ -48,12 +49,25 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK: xfoo:
 ; CHECK: roll $7
+; BMI2: xfoo:
+; BMI2: rorxl $25
 	%0 = lshr i32 %x, 25
 	%1 = shl i32 %x, 7
 	%2 = or i32 %0, %1
 	ret i32 %2
 }
 
+define i32 @xfoop(i32* %p) nounwind readnone {
+entry:
+; BMI2: xfoop:
+; BMI2: rorxl $25, ({{.+}}), %{{.+}}
+	%x = load i32* %p
+	%a = lshr i32 %x, 25
+	%b = shl i32 %x, 7
+	%c = or i32 %a, %b
+	ret i32 %c
+}
+
 define i32 @xbar(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK: xbar:
@@ -68,12 +82,25 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK: xun:
 ; CHECK: roll $25
+; BMI2: xun:
+; BMI2: rorxl $7
 	%0 = lshr i32 %x, 7
 	%1 = shl i32 %x, 25
 	%2 = or i32 %0, %1
 	ret i32 %2
 }
 
+define i32 @xunp(i32* %p) nounwind readnone {
+entry:
+; BMI2: xunp:
+; BMI2: rorxl $7, ({{.+}}), %{{.+}}
+	%x = load i32* %p
+	%a = lshr i32 %x, 7
+	%b = shl i32 %x, 25
+	%c = or i32 %a, %b
+	ret i32 %c
+}
+
 define i32 @xbu(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK: xbu:
diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll
index 4e082bb860b45..7fa982d83b619 100644
--- a/test/CodeGen/X86/rot64.ll
+++ b/test/CodeGen/X86/rot64.ll
@@ -1,8 +1,9 @@
-; RUN: llc < %s -march=x86-64 > %t
-; RUN: grep rol %t | count 3
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 > %t
+; RUN: grep rol %t | count 5
 ; RUN: grep ror %t | count 1
 ; RUN: grep shld %t | count 2
 ; RUN: grep shrd %t | count 2
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
 
 define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
@@ -42,12 +43,25 @@ entry:
 
 define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; BMI2: xfoo:
+; BMI2: rorxq $57
 	%0 = lshr i64 %x, 57
 	%1 = shl i64 %x, 7
 	%2 = or i64 %0, %1
 	ret i64 %2
 }
 
+define i64 @xfoop(i64* %p) nounwind readnone {
+entry:
+; BMI2: xfoop:
+; BMI2: rorxq $57, ({{.+}}), %{{.+}}
+	%x = load i64* %p
+	%a = lshr i64 %x, 57
+	%b = shl i64 %x, 7
+	%c = or i64 %a, %b
+	ret i64 %c
+}
+
 define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
 	%0 = shl i64 %y, 7
@@ -58,12 +72,25 @@ entry:
 
 define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; BMI2: xun:
+; BMI2: rorxq $7
 	%0 = lshr i64 %x, 7
 	%1 = shl i64 %x, 57
 	%2 = or i64 %0, %1
 	ret i64 %2
 }
 
+define i64 @xunp(i64* %p) nounwind readnone {
+entry:
+; BMI2: xunp:
+; BMI2: rorxq $7, ({{.+}}), %{{.+}}
+	%x = load i64* %p
+	%a = lshr i64 %x, 7
+	%b = shl i64 %x, 57
+	%c = or i64 %a, %b
+	ret i64 %c
+}
+
 define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
 	%0 = lshr i64 %y, 7
diff --git a/test/CodeGen/X86/rotate2.ll b/test/CodeGen/X86/rotate2.ll
index 2eea3999e7b8c..2316c708507a6 100644
--- a/test/CodeGen/X86/rotate2.ll
+++ b/test/CodeGen/X86/rotate2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep rol | count 2
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | grep rol | count 2
 
 define i64 @test1(i64 %x) nounwind  {
 entry:
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
new file mode 100644
index 0000000000000..76eb9514f02cc
--- /dev/null
+++ b/test/CodeGen/X86/rtm.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mattr=+rtm -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare i32 @llvm.x86.xbegin() nounwind
+declare void @llvm.x86.xend() nounwind
+declare void @llvm.x86.xabort(i8) noreturn nounwind
+
+define i32 @test_xbegin() nounwind uwtable {
+entry:
+  %0 = tail call i32 @llvm.x86.xbegin() nounwind
+  ret i32 %0
+; CHECK: test_xbegin
+; CHECK: xbegin [[LABEL:.*BB.*]]
+; CHECK: [[LABEL]]:
+}
+
+define void @test_xend() nounwind uwtable {
+entry:
+  tail call void @llvm.x86.xend() nounwind
+  ret void
+; CHECK: test_xend
+; CHECK: xend
+}
+
+define void @test_xabort() nounwind uwtable {
+entry:
+  tail call void @llvm.x86.xabort(i8 2)
+  unreachable
+; CHECK: test_xabort
+; CHECK: xabort $2
+}
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 2e39473057b12..3bec3acdbf765 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -344,3 +344,16 @@ entry:
 ; ATOM: negw
 ; ATOM: sbbw
 }
+
+define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
+  %cmp = icmp slt i32 %x, 15
+  %sel = select i1 %cmp, i8 %a, i8 %b
+  ret i8 %sel
+; CHECK: test18:
+; CHECK: cmpl $15, %edi
+; CHECK: cmovgel %edx
+
+; ATOM: test18:
+; ATOM: cmpl $15, %edi
+; ATOM: cmovgel %edx
+}
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
new file mode 100644
index 0000000000000..5b2409d2396f1
--- /dev/null
+++ b/test/CodeGen/X86/select_const.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7 | FileCheck %s
+
+define i64 @test1(i64 %x) nounwind {
+entry:
+  %cmp = icmp eq i64 %x, 2
+  %add = add i64 %x, 1
+  %retval.0 = select i1 %cmp, i64 2, i64 %add
+  ret i64 %retval.0
+
+; CHECK: test1:
+; CHECK: leaq 1(%rdi), %rax
+; CHECK: cmpq $2, %rdi
+; CHECK: cmoveq %rdi, %rax
+; CHECK: ret
+
+}
diff --git a/test/CodeGen/X86/shift-bmi2.ll b/test/CodeGen/X86/shift-bmi2.ll
new file mode 100644
index 0000000000000..d1f321f17738e
--- /dev/null
+++ b/test/CodeGen/X86/shift-bmi2.ll
@@ -0,0 +1,178 @@
+; RUN: llc -mtriple=i386-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI2 %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI264 %s
+
+define i32 @shl32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = shl i32 %x, %shamt
+; BMI2: shl32
+; BMI2: shlxl
+; BMI2: ret
+; BMI264: shl32
+; BMI264: shlxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @shl32i(i32 %x) nounwind uwtable readnone {
+entry:
+  %shl = shl i32 %x, 5
+; BMI2: shl32i
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32i
+; BMI264-NOT: shlxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @shl32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i32* %p
+  %shl = shl i32 %x, %shamt
+; BMI2: shl32p
+; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: shl32p
+; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @shl32pi(i32* %p) nounwind uwtable readnone {
+entry:
+  %x = load i32* %p
+  %shl = shl i32 %x, 5
+; BMI2: shl32pi
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32pi
+; BMI264-NOT: shlxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i64 @shl64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = shl i64 %x, %shamt
+; BMI264: shl64
+; BMI264: shlxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @shl64i(i64 %x) nounwind uwtable readnone {
+entry:
+  %shl = shl i64 %x, 7
+; BMI264: shl64i
+; BMI264-NOT: shlxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @shl64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i64* %p
+  %shl = shl i64 %x, %shamt
+; BMI264: shl64p
+; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @shl64pi(i64* %p) nounwind uwtable readnone {
+entry:
+  %x = load i64* %p
+  %shl = shl i64 %x, 7
+; BMI264: shl64pi
+; BMI264-NOT: shlxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i32 @lshr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = lshr i32 %x, %shamt
+; BMI2: lshr32
+; BMI2: shrxl
+; BMI2: ret
+; BMI264: lshr32
+; BMI264: shrxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @lshr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i32* %p
+  %shl = lshr i32 %x, %shamt
+; BMI2: lshr32p
+; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: lshr32p
+; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i64 @lshr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = lshr i64 %x, %shamt
+; BMI264: lshr64
+; BMI264: shrxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @lshr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i64* %p
+  %shl = lshr i64 %x, %shamt
+; BMI264: lshr64p
+; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i32 @ashr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = ashr i32 %x, %shamt
+; BMI2: ashr32
+; BMI2: sarxl
+; BMI2: ret
+; BMI264: ashr32
+; BMI264: sarxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @ashr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i32* %p
+  %shl = ashr i32 %x, %shamt
+; BMI2: ashr32p
+; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: ashr32p
+; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i64 @ashr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = ashr i64 %x, %shamt
+; BMI264: ashr64
+; BMI264: sarxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @ashr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i64* %p
+  %shl = ashr i64 %x, %shamt
+; BMI264: ashr64p
+; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i64 %shl
+}
diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll
index 1479be1f56ba5..734f48ae329f5 100644
--- a/test/CodeGen/X86/sincos.ll
+++ b/test/CodeGen/X86/sincos.ll
@@ -1,6 +1,7 @@
 ; Make sure this testcase codegens to the sin and cos instructions, not calls
 ; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math  | FileCheck %s --check-prefix=SIN
 ; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math  | FileCheck %s --check-prefix=COS
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=SAFE
 
 declare float  @sinf(float) readonly
 
@@ -17,6 +18,9 @@ define float @test1(float %X) {
 
 ; SIN-NOT: fsin
 
+; SAFE: test1
+; SAFE-NOT: fsin
+
 ; SIN: test2:
 define double @test2(double %X) {
         %Y = call double @sin(double %X) readonly
@@ -26,6 +30,9 @@ define double @test2(double %X) {
 
 ; SIN-NOT: fsin
 
+; SAFE: test2
+; SAFE-NOT: fsin
+
 ; SIN: test3:
 define x86_fp80 @test3(x86_fp80 %X) {
         %Y = call x86_fp80 @sinl(x86_fp80 %X) readonly
@@ -50,12 +57,18 @@ define float @test4(float %X) {
 }
 ; COS: {{^[ \t]*fcos}}
 
+; SAFE: test4
+; SAFE-NOT: fcos
+
 define double @test5(double %X) {
         %Y = call double @cos(double %X) readonly
         ret double %Y
 }
 ; COS: {{^[ \t]*fcos}}
 
+; SAFE: test5
+; SAFE-NOT: fcos
+
 define x86_fp80 @test6(x86_fp80 %X) {
         %Y = call x86_fp80 @cosl(x86_fp80 %X) readonly
         ret x86_fp80 %Y
diff --git a/test/CodeGen/X86/sjlj.ll b/test/CodeGen/X86/sjlj.ll
new file mode 100644
index 0000000000000..681db0094384a
--- /dev/null
+++ b/test/CodeGen/X86/sjlj.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X86 %s
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC86 %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC64 %s
+
+@buf = internal global [5 x i8*] zeroinitializer
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+
+declare i8* @llvm.stacksave() nounwind
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*) nounwind
+
+declare void @llvm.eh.sjlj.longjmp(i8*) nounwind
+
+define i32 @sj0() nounwind {
+  %fp = tail call i8* @llvm.frameaddress(i32 0)
+  store i8* %fp, i8** getelementptr inbounds ([5 x i8*]* @buf, i64 0, i64 0), align 16
+  %sp = tail call i8* @llvm.stacksave()
+  store i8* %sp, i8** getelementptr inbounds ([5 x i8*]* @buf, i64 0, i64 2), align 16
+  %r = tail call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([5 x i8*]* @buf to i8*))
+  ret i32 %r
+; X86: sj0
+; x86: movl %ebp, buf
+; X86: movl %esp, buf+8
+; x86: movl ${{.*LBB.*}}, buf+4
+; X86: ret
+; PIC86: sj0
+; PIC86: movl %ebp, buf@GOTOFF(%[[GOT:.*]])
+; PIC86: movl %esp, buf@GOTOFF+8(%[[GOT]])
+; PIC86: leal {{.*LBB.*}}@GOTOFF(%[[GOT]]), %[[LREG:.*]]
+; PIC86: movl %[[LREG]], buf@GOTOFF+4
+; PIC86: ret
+; X64: sj0
+; x64: movq %rbp, buf(%rip)
+; x64: movq ${{.*LBB.*}}, buf+8(%rip)
+; X64: movq %rsp, buf+16(%rip)
+; X64: ret
+; PIC64: sj0
+; PIC64: movq %rbp, buf(%rip)
+; PIC64: movq %rsp, buf+16(%rip)
+; PIC64: leaq {{.*LBB.*}}(%rip), %[[LREG:.*]]
+; PIC64: movq %[[LREG]], buf+8(%rip)
+; PIC64: ret
+}
+
+define void @lj0() nounwind {
+  tail call void @llvm.eh.sjlj.longjmp(i8* bitcast ([5 x i8*]* @buf to i8*))
+  unreachable
+; X86: lj0
+; X86: movl buf, %ebp
+; X86: movl buf+4, %[[REG32:.*]]
+; X86: movl buf+8, %esp
+; X86: jmpl *%[[REG32]]
+; X64: lj0
+; X64: movq buf(%rip), %rbp
+; X64: movq buf+8(%rip), %[[REG64:.*]]
+; X64: movq buf+16(%rip), %rsp
+; X64: jmpq *%[[REG64]]
+}
diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll
index 7ac3840482a23..2d0b2f7aa91d0 100644
--- a/test/CodeGen/X86/smul-with-overflow.ll
+++ b/test/CodeGen/X86/smul-with-overflow.ll
@@ -67,3 +67,17 @@ entry:
 ; CHECK: mull
 ; CHECK-NEXT: ret
 }
+
+declare { i63, i1 } @llvm.smul.with.overflow.i63(i63, i63) nounwind readnone
+
+define i1 @test5() nounwind {
+entry:
+  %res = call { i63, i1 } @llvm.smul.with.overflow.i63(i63 4, i63 4611686018427387903)
+  %sum = extractvalue { i63, i1 } %res, 0
+  %overflow = extractvalue { i63, i1 } %res, 1
+  ret i1 %overflow
+; Was returning false, should return true (not constant folded yet though).
+; PR13991
+; CHECK: test5:
+; CHECK-NOT: xorb
+}
diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll
new file mode 100644
index 0000000000000..188505072f051
--- /dev/null
+++ b/test/CodeGen/X86/sse-intel-ocl.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN64: leaq    {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl    %eax, (%esp)
+; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: leaq    {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %2, %1
+  ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved xmm6-xmm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: addps  {{%xmm[6-9]}}, {{.*}}
+; WIN64: addps  {{%xmm[6-9]}}, {{.*}}
+; WIN64: ret
+
+; preserved xmm8-xmm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %1, %b
+  %4 = fadd <16 x float> %2, %3
+  ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
+; NOT_WIN: call
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+   %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+   ret <16 x float> %c
+}
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 3839e875615f6..0ba02155a6570 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -47,8 +47,7 @@ define double @olt(double %x, double %y) nounwind {
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
 ; UNSAFE:      ogt_inverse:
-; UNSAFE-NEXT: minsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ogt_inverse:
 ; FINITE-NEXT: minsd  %xmm0, %xmm1
@@ -65,8 +64,7 @@ define double @ogt_inverse(double %x, double %y) nounwind {
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
 ; UNSAFE:      olt_inverse:
-; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      olt_inverse:
 ; FINITE-NEXT: maxsd  %xmm0, %xmm1
@@ -107,8 +105,7 @@ define double @ole(double %x, double %y) nounwind {
 ; CHECK:      oge_inverse:
 ; CHECK-NEXT: ucomisd %xmm1, %xmm0
 ; UNSAFE:      oge_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      oge_inverse:
 ; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -123,8 +120,7 @@ define double @oge_inverse(double %x, double %y) nounwind {
 ; CHECK:      ole_inverse:
 ; CHECK-NEXT: ucomisd %xmm0, %xmm1
 ; UNSAFE:      ole_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ole_inverse:
 ; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -142,7 +138,8 @@ define double @ole_inverse(double %x, double %y) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE:      ogt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ogt_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -160,7 +157,8 @@ define double @ogt_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE:      olt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      olt_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -218,7 +216,8 @@ define double @olt_inverse_x(double %x) nounwind {
 ; CHECK:      ucomisd %xmm1, %xmm0
 ; UNSAFE:      oge_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      oge_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -234,7 +233,8 @@ define double @oge_x(double %x) nounwind {
 ; CHECK:      ucomisd %xmm0, %xmm1
 ; UNSAFE:      ole_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ole_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -313,8 +313,7 @@ define double @ult(double %x, double %y) nounwind {
 ; CHECK:      ugt_inverse:
 ; CHECK:      ucomisd %xmm0, %xmm1
 ; UNSAFE:      ugt_inverse:
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ugt_inverse:
 ; FINITE-NEXT: minsd   %xmm0, %xmm1
@@ -329,8 +328,7 @@ define double @ugt_inverse(double %x, double %y) nounwind {
 ; CHECK:      ult_inverse:
 ; CHECK:      ucomisd %xmm1, %xmm0
 ; UNSAFE:      ult_inverse:
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ult_inverse:
 ; FINITE-NEXT: maxsd   %xmm0, %xmm1
@@ -378,8 +376,7 @@ define double @ule(double %x, double %y) nounwind {
 ; CHECK-NEXT: minsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
 ; UNSAFE:      uge_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      uge_inverse:
 ; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -395,8 +392,7 @@ define double @uge_inverse(double %x, double %y) nounwind {
 ; CHECK-NEXT: maxsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
 ; UNSAFE:      ule_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ule_inverse:
 ; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -412,7 +408,8 @@ define double @ule_inverse(double %x, double %y) nounwind {
 ; CHECK:      ucomisd %xmm0, %xmm1
 ; UNSAFE:      ugt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ugt_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -428,7 +425,8 @@ define double @ugt_x(double %x) nounwind {
 ; CHECK:      ucomisd %xmm1, %xmm0
 ; UNSAFE:      ult_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: minsd   %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ult_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -483,7 +481,8 @@ define double @ult_inverse_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE:      uge_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      uge_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -502,7 +501,8 @@ define double @uge_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE:      ule_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: minsd  %xmm1, %xmm0
+; UNSAFE-NEXT: minsd  %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ule_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -590,9 +590,7 @@ define double @olt_y(double %x) nounwind {
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
 ; UNSAFE:      ogt_inverse_y:
-; UNSAFE-NEXT: movsd  {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd  {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ogt_inverse_y:
 ; FINITE-NEXT: movsd  {{[^,]*}}, %xmm1
@@ -611,9 +609,7 @@ define double @ogt_inverse_y(double %x) nounwind {
 ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
 ; CHECK-NEXT: ret
 ; UNSAFE:      olt_inverse_y:
-; UNSAFE-NEXT: movsd  {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd  {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      olt_inverse_y:
 ; FINITE-NEXT: movsd  {{[^,]*}}, %xmm1
@@ -657,9 +653,7 @@ define double @ole_y(double %x) nounwind {
 ; CHECK:      oge_inverse_y:
 ; CHECK:      ucomisd %xmm
 ; UNSAFE:      oge_inverse_y:
-; UNSAFE-NEXT: movsd   {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      oge_inverse_y:
 ; FINITE-NEXT: movsd   {{[^,]*}}, %xmm1
@@ -675,9 +669,7 @@ define double @oge_inverse_y(double %x) nounwind {
 ; CHECK:      ole_inverse_y:
 ; CHECK:      ucomisd %xmm
 ; UNSAFE:      ole_inverse_y:
-; UNSAFE-NEXT: movsd   {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ole_inverse_y:
 ; FINITE-NEXT: movsd   {{[^,]*}}, %xmm1
@@ -721,9 +713,7 @@ define double @ult_y(double %x) nounwind {
 ; CHECK:      ugt_inverse_y:
 ; CHECK:      ucomisd %xmm
 ; UNSAFE:      ugt_inverse_y:
-; UNSAFE-NEXT: movsd   {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ugt_inverse_y:
 ; FINITE-NEXT: movsd   {{[^,]*}}, %xmm1
@@ -739,9 +729,7 @@ define double @ugt_inverse_y(double %x) nounwind {
 ; CHECK:      ult_inverse_y:
 ; CHECK:      ucomisd %xmm
 ; UNSAFE:      ult_inverse_y:
-; UNSAFE-NEXT: movsd   {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ult_inverse_y:
 ; FINITE-NEXT: movsd   {{[^,]*}}, %xmm1
@@ -792,9 +780,7 @@ define double @ule_y(double %x) nounwind {
 ; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
 ; CHECK-NEXT: ret
 ; UNSAFE:      uge_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      uge_inverse_y:
 ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -811,9 +797,7 @@ define double @uge_inverse_y(double %x) nounwind {
 ; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
 ; CHECK-NEXT: ret
 ; UNSAFE:      ule_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE:      ule_inverse_y:
 ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
new file mode 100644
index 0000000000000..655f75800cffa
--- /dev/null
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
+
+; rdar: 12558838
+; PR14221
+; There is a mismatch between the intrinsic and the actual instruction.
+; The actual instruction has a partial update of dest, while the intrinsic
+; passes through the upper FP values. Here, we make sure the source and
+; destination of rsqrtss are the same.
+define void @t1(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t1:
+; CHECK: rsqrtss %xmm0, %xmm0
+  %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
+  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+  %conv = fpext float %a.addr.0.extract to double
+  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+  %conv3 = fpext float %a.addr.4.extract to double
+  tail call void @callee(double %conv, double %conv3) nounwind
+  ret void
+}
+declare void @callee(double, double)
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define void @t2(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t2:
+; CHECK: rcpss %xmm0, %xmm0
+  %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
+  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+  %conv = fpext float %a.addr.0.extract to double
+  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+  %conv3 = fpext float %a.addr.4.extract to double
+  tail call void @callee(double %conv, double %conv3) nounwind
+  ret void
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
index 70307534156e0..ecc253ba587e9 100644
--- a/test/CodeGen/X86/tailcall-64.ll
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin11.4.0"
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=core2 < %s | FileCheck %s
 
 declare i64 @testi()
 
@@ -93,4 +91,67 @@ define { i64, i64 } @crash(i8* %this) {
   ret { i64, i64 } %mrv7
 }
 
+; Check that we can fold an indexed load into a tail call instruction.
+; CHECK: fold_indexed_load
+; CHECK: leaq (%rsi,%rsi,4), %[[RAX:r..]]
+; CHECK: jmpq *16(%{{r..}},%[[RAX]],8)  # TAILCALL
+%struct.funcs = type { i32 (i8*, i32*, i32)*, i32 (i8*)*, i32 (i8*)*, i32 (i8*, i32)*, i32 }
+@func_table = external global [0 x %struct.funcs]
+define void @fold_indexed_load(i8* %mbstr, i64 %idxprom) nounwind uwtable ssp {
+entry:
+  %dsplen = getelementptr inbounds [0 x %struct.funcs]* @func_table, i64 0, i64 %idxprom, i32 2
+  %x1 = load i32 (i8*)** %dsplen, align 8
+  %call = tail call i32 %x1(i8* %mbstr) nounwind
+  ret void
+}
+
+; <rdar://problem/12282281> Fold an indexed load into the tail call instruction.
+; Calling a varargs function with 6 arguments requires 7 registers (%al is the
+; vector count for varargs functions). This leaves %r11 as the only available
+; scratch register.
+;
+; It is not possible to fold an indexed load into TCRETURNmi64 in that case.
+;
+; typedef int (*funcptr)(void*, ...);
+; extern const funcptr funcs[];
+; int f(int n) {
+;   return funcs[n](0, 0, 0, 0, 0, 0);
+; }
+;
+; CHECK: rdar12282281
+; CHECK: jmpq *%r11 # TAILCALL
+@funcs = external constant [0 x i32 (i8*, ...)*]
+
+define i32 @rdar12282281(i32 %n) nounwind uwtable ssp {
+entry:
+  %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds [0 x i32 (i8*, ...)*]* @funcs, i64 0, i64 %idxprom
+  %0 = load i32 (i8*, ...)** %arrayidx, align 8
+  %call = tail call i32 (i8*, ...)* %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind
+  ret i32 %call
+}
+
+define x86_fp80 @fp80_call(x86_fp80 %x) nounwind  {
+entry:
+; CHECK: fp80_call:
+; CHECK: jmp _fp80_callee
+  %call = tail call x86_fp80 @fp80_callee(x86_fp80 %x) nounwind
+  ret x86_fp80 %call
+}
+
+declare x86_fp80 @fp80_callee(x86_fp80)
+
+; rdar://12229511
+define x86_fp80 @trunc_fp80(x86_fp80 %x) nounwind  {
+entry:
+; CHECK: trunc_fp80
+; CHECK: callq _trunc
+; CHECK-NOT: jmp _trunc
+; CHECK: ret
+  %conv = fptrunc x86_fp80 %x to double
+  %call = tail call double @trunc(double %conv) nounwind readnone
+  %conv1 = fpext double %call to x86_fp80
+  ret x86_fp80 %conv1
+}
 
+declare double @trunc(double) nounwind readnone
diff --git a/test/CodeGen/X86/targetLoweringGeneric.ll b/test/CodeGen/X86/targetLoweringGeneric.ll
index ba5f8f83619fa..a773e9daeff89 100644
--- a/test/CodeGen/X86/targetLoweringGeneric.ll
+++ b/test/CodeGen/X86/targetLoweringGeneric.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=corei7 -fast-isel=false -O0 < %s | FileCheck %s
 
 ; Gather non-machine specific tests for the transformations in
 ; CodeGen/SelectionDAG/TargetLowering.  Currently, these
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index 51c3d2363f8bf..b823f0af2cdf5 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -76,12 +76,12 @@ entry:
 
 ; X32:    f5:
 ; X32:      leal {{[jk]}}@TLSLDM(%ebx)
-; X32-NEXT: calll ___tls_get_addr@PLT
-; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
-; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+; X32: calll ___tls_get_addr@PLT
+; X32: movl {{[jk]}}@DTPOFF(%e
+; X32: addl {{[jk]}}@DTPOFF(%e
 
 ; X64:    f5:
 ; X64:      leaq {{[jk]}}@TLSLD(%rip), %rdi
-; X64-NEXT: callq	__tls_get_addr@PLT
-; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
-; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
+; X64: callq	__tls_get_addr@PLT
+; X64: movl {{[jk]}}@DTPOFF(%r
+; X64: addl {{[jk]}}@DTPOFF(%r
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 9877d7be169b2..1d22a185def38 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -2,8 +2,7 @@
 
 ;CHECK: load_2_i8
 ; A single 16-bit load
-;CHECK: movzwl
-;CHECK: pshufb
+;CHECK: pmovzxbq
 ;CHECK: paddq
 ;CHECK: pshufb
 ; A single 16-bit store
@@ -19,8 +18,7 @@ define void @load_2_i8(<2 x i8>* %A)  {
 
 ;CHECK: load_2_i16
 ; Read 32-bits
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxwq
 ;CHECK: paddq
 ;CHECK: pshufb
 ;CHECK: movd
@@ -33,7 +31,7 @@ define void @load_2_i16(<2 x i16>* %A)  {
 } 
 
 ;CHECK: load_2_i32
-;CHECK: pshufd
+;CHECK: pmovzxdq
 ;CHECK: paddq
 ;CHECK: pshufd
 ;CHECK: ret
@@ -45,8 +43,7 @@ define void @load_2_i32(<2 x i32>* %A)  {
 } 
 
 ;CHECK: load_4_i8
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxbd
 ;CHECK: paddd
 ;CHECK: pshufb
 ;CHECK: ret
@@ -58,7 +55,7 @@ define void @load_4_i8(<4 x i8>* %A)  {
 } 
 
 ;CHECK: load_4_i16
-;CHECK: punpcklwd
+;CHECK: pmovzxwd
 ;CHECK: paddd
 ;CHECK: pshufb
 ;CHECK: ret
@@ -70,7 +67,7 @@ define void @load_4_i16(<4 x i16>* %A)  {
 } 
 
 ;CHECK: load_8_i8
-;CHECK: punpcklbw
+;CHECK: pmovzxbw
 ;CHECK: paddw
 ;CHECK: pshufb
 ;CHECK: ret
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
index 46d6a23554f4a..4da79538dbf69 100644
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
 entry:
 ; CHECK: cfi_def_cfa_offset
 ; CHECK-NOT: set
-; CHECK: punpcklwd
-; CHECK: pshufd
+; CHECK: pmovzxwq
 ; CHECK: pshufb
   %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
   %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
new file mode 100644
index 0000000000000..82517cb9a5a07
--- /dev/null
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; Each function applies the llvm.fabs intrinsic to one vector type; the check lines expect a vandps in every one of them.
+; NOTE(review): the run line passes -march=x86 together with an x86_64 triple -- confirm which target is intended.
+define <2 x double> @fabs_v2f64(<2 x double> %p)
+{
+  ; CHECK: fabs_v2f64
+  ; CHECK: vandps
+  %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p) ; 2 x double, 128 bits in total
+  ret <2 x double> %t
+}
+declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+
+define <4 x float> @fabs_v4f32(<4 x float> %p)
+{
+  ; CHECK: fabs_v4f32
+  ; CHECK: vandps
+  %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p) ; 4 x float, 128 bits in total
+  ret <4 x float> %t
+}
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+
+define <4 x double> @fabs_v4f64(<4 x double> %p)
+{
+  ; CHECK: fabs_v4f64
+  ; CHECK: vandps
+  %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p) ; 4 x double, 256 bits in total
+  ret <4 x double> %t
+}
+declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+
+define <8 x float> @fabs_v8f32(<8 x float> %p)
+{
+  ; CHECK: fabs_v8f32
+  ; CHECK: vandps
+  %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p) ; 8 x float, 256 bits in total
+  ret <8 x float> %t
+}
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
new file mode 100644
index 0000000000000..5e0160bd2856c
--- /dev/null
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; Each function applies the llvm.floor intrinsic to one vector type; the check lines expect vroundpd for double and vroundps for float vectors.
+; NOTE(review): the run line passes -march=x86 together with an x86_64 triple -- confirm which target is intended.
+define <2 x double> @floor_v2f64(<2 x double> %p)
+{
+  ; CHECK: floor_v2f64
+  ; CHECK: vroundpd
+  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p) ; 2 x double, 128 bits in total
+  ret <2 x double> %t
+}
+declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+
+define <4 x float> @floor_v4f32(<4 x float> %p)
+{
+  ; CHECK: floor_v4f32
+  ; CHECK: vroundps
+  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p) ; 4 x float, 128 bits in total
+  ret <4 x float> %t
+}
+declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+
+define <4 x double> @floor_v4f64(<4 x double> %p)
+{
+  ; CHECK: floor_v4f64
+  ; CHECK: vroundpd
+  %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p) ; 4 x double, 256 bits in total
+  ret <4 x double> %t
+}
+declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+
+define <8 x float> @floor_v8f32(<8 x float> %p)
+{
+  ; CHECK: floor_v8f32
+  ; CHECK: vroundps
+  %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p) ; 8 x float, 256 bits in total
+  ret <8 x float> %t
+}
+declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 05b263e2e0c4f..dc0464ff9e0f5 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,14 +1,38 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck --check-prefix=AVX %s
 
 ; PR11674
 define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
 entry:
-; TODO: We should be able to generate cvtps2pd for the load.
-; For now, just check that we generate something sane.
-; CHECK: cvtss2sd
-; CHECK: cvtss2sd
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
   %0 = load <2 x float>* %in, align 8
   %1 = fpext <2 x float> %0 to <2 x double>
   store <2 x double> %1, <2 x double>* %out, align 1
   ret void
 }
+
+define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+  %0 = load <4 x float>* %in ; 16 bytes of input: the non-AVX check lines expect two xmm conversions from memory (offsets 0 and 8), the AVX line a single ymm one
+  %1 = fpext <4 x float> %0 to <4 x double> ; widen every lane from float to double
+  store <4 x double> %1, <4 x double>* %out, align 1 ; align 1: no alignment is assumed for the destination
+  ret void
+}
+
+define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+  %0 = load <8 x float>* %in ; 32 bytes of input: four xmm conversions in 8-byte steps without AVX, two ymm conversions in 16-byte steps with it
+  %1 = fpext <8 x float> %0 to <8 x double> ; widen every lane from float to double
+  store <8 x double> %1, <8 x double>* %out, align 1 ; align 1: no alignment is assumed for the destination
+  ret void
+}
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 086af6bb114b7..4c56f848dedb4 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep unpcklps %t | count 1
-; RUN: grep unpckhps %t | count 3
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
 
 ; Transpose example using the more generic vector shuffle. Return float8
 ; instead of float16
@@ -14,6 +13,17 @@ target triple = "i386-apple-cl.1.0"
 
 define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
 entry:
+; CHECK: transpose2
+; CHECK: unpckhps
+; CHECK: unpckhps
+; CHECK: unpcklps
+; CHECK: unpckhps
+; Different instruction order for Atom.
+; ATOM: transpose2
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpcklps
 	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
 	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
 	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
@@ -27,3 +37,32 @@ entry:
 ;       %r3 = shufflevector <8 x float> %r1,  <8 x float> %r2,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; 
 	ret <8 x float> %r2
 }
+
+define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
+entry:
+; movhps should happen before extractps to ensure it gets the correct value.
+; CHECK: lo_hi_shift
+; CHECK: movhps ([[BASEREG:%[a-z]+]]),
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: lo_hi_shift
+; ATOM: movhps ([[BASEREG:%[a-z]+]]),
+; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+  %v.i = bitcast float* %y to <4 x float>*
+  %0 = load <4 x float>* %v.i, align 1 ; four floats read from %y, no alignment assumed
+  %1 = bitcast float* %x to <1 x i64>*
+  %.val = load <1 x i64>* %1, align 1 ; 8 bytes read from %x; this read must see the old contents, before the store below
+  %2 = bitcast <1 x i64> %.val to <2 x float> ; reinterpret those 8 bytes as two floats
+  %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; widen to four lanes, upper two undefined
+  %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; low half from %y, high half from the %x data
+  %cast.i = bitcast <4 x float> %0 to <2 x i64>
+  %extract.i = extractelement <2 x i64> %cast.i, i32 1 ; upper 64 bits of the %y vector
+  %3 = bitcast float* %x to i64*
+  store i64 %extract.i, i64* %3, align 4 ; overwrites the same 8 bytes at %x that were loaded above
+  %4 = bitcast <4 x float> %0 to <16 x i8>
+  %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
+  %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> ; bytes 8-15 of %5 followed by bytes 0-7 of %4
+  %6 = bitcast <16 x i8> %palignr to <2 x i64>
+  ret <2 x i64> %6
+}
diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll
index 1651c4cdace27..f5f88426058c6 100644
--- a/test/CodeGen/X86/vec_shuffle-30.ll
+++ b/test/CodeGen/X86/vec_shuffle-30.ll
@@ -1,21 +1,25 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep pshufhw %t | grep -- -95 | count 1
-; RUN: grep shufps %t | count 1
-; RUN: not grep pslldq %t
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
 
+; CHECK: test
 ; Test case when creating pshufhw, we incorrectly set the higher order bit
 ; for an undef,
 define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind {
 entry:
+; CHECK-NOT: vmovaps
+; CHECK: vmovlpd
+; CHECK: vpshufhw        $-95
   %0 = load <8 x i16>* %dest
   %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14>
   store <8 x i16> %1, <8 x i16>* %dest
   ret void
-}                              
+}
 
+; CHECK: test2
 ; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq
 define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind {
 entry:
+; CHECK-NOT: pslldq
+; CHECK: shufps
   %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2>
   store <4 x i32> %0, <4 x i32>* %dest
   ret void
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index ebdfea9a37f7b..56c63644e02ef 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=atom -mattr=+sse42 < %s | FileCheck -check-prefix=ATOM %s
+; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK: paddd
 ; CHECK: movl
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 9705d149ddcc3..dfaa3d6dc91a0 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,12 +1,17 @@
-; RUN: llc %s -o - -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
 ; PR4891
 ; PR5626
 
 ; This load should be before the call, not after.
 
-; CHECK: movaps    compl+128(%rip), %xmm0
-; CHECK: movaps  %xmm0, (%rsp)
-; CHECK: callq   killcommon
+; SSE: movaps    compl+128(%rip), %xmm0
+; SSE: movaps  %xmm0, (%rsp)
+; SSE: callq   killcommon
+
+; AVX: vmovapd    compl+128(%rip), %xmm0
+; AVX: vmovapd  %xmm0, (%rsp)
+; AVX: callq   killcommon
 
 @compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1]
 
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 79aa00050254d..224898c1a3e50 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -170,7 +170,7 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
 ; CHECK: rot
 %i8vec3pack = type { <3 x i8>, i8 }
 define %i8vec3pack  @rot() nounwind {
-; CHECK: movd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
 entry:
   %X = alloca %i8vec3pack, align 4
   %rot = alloca %i8vec3pack, align 4
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
new file mode 100644
index 0000000000000..486dafeb5a242
--- /dev/null
+++ b/test/CodeGen/X86/xmulo.ll
@@ -0,0 +1,50 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+declare i32 @printf(i8*, ...)
+
+@.str = private unnamed_addr constant [10 x i8] c"%llx, %d\0A\00", align 1
+
+define i32 @t1() nounwind {
+; CHECK: t1:
+; CHECK:  movl $0, 12(%esp)
+; CHECK:  movl $0, 8(%esp)
+; CHECK:  movl $72, 4(%esp)
+; 9 * 8 = 72 does not overflow: the stores above are expected to hold the overflow flag 0, the high half 0 and the low half 72.
+    %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8) ; unsigned multiply of two constants
+    %2 = extractvalue {i64, i1} %1, 0 ; the 64-bit product
+    %3 = extractvalue {i64, i1} %1, 1 ; the overflow bit
+    %4 = zext i1 %3 to i32 ; widened to i32 for the %d conversion in the format string
+    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4) ; passes product and flag as variadic arguments
+    ret i32 0
+}
+
+define i32 @t2() nounwind {
+; CHECK: t2:
+; CHECK:  movl $0, 12(%esp)
+; CHECK:  movl $0, 8(%esp)
+; CHECK:  movl $0, 4(%esp)
+; 9 * 0 = 0 does not overflow: the stores above are expected to hold 0 for the overflow flag and for both halves of the product.
+    %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0) ; unsigned multiply by the constant zero
+    %2 = extractvalue {i64, i1} %1, 0 ; the 64-bit product
+    %3 = extractvalue {i64, i1} %1, 1 ; the overflow bit
+    %4 = zext i1 %3 to i32 ; widened to i32 for the %d conversion in the format string
+    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4) ; passes product and flag as variadic arguments
+    ret i32 0
+}
+
+define i32 @t3() nounwind {
+; CHECK: t3:
+; CHECK:  movl $1, 12(%esp)
+; CHECK:  movl $-1, 8(%esp)
+; CHECK:  movl $-9, 4(%esp)
+; 9 * (2^64 - 1) overflows: the flag is 1 and the truncated product is 2^64 - 9, i.e. high half -1 and low half -9.
+    %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1) ; -1 is the all-ones value, 2^64 - 1 when read as unsigned
+    %2 = extractvalue {i64, i1} %1, 0 ; the 64-bit (truncated) product
+    %3 = extractvalue {i64, i1} %1, 1 ; the overflow bit
+    %4 = zext i1 %3 to i32 ; widened to i32 for the %d conversion in the format string
+    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4) ; passes product and flag as variadic arguments
+    ret i32 0
+}
diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
new file mode 100755
index 0000000000000..9a1d5383caacc
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
diff --git a/test/DebugInfo/2010-04-13-PubType.ll b/test/DebugInfo/X86/2010-04-13-PubType.ll
index db7bb0ad60302..559f032cb3a66 100644
--- a/test/DebugInfo/2010-04-13-PubType.ll
+++ b/test/DebugInfo/X86/2010-04-13-PubType.ll
@@ -1,6 +1,6 @@
-; RUN: llc -O0 -asm-verbose < %s > %t
-; RUN: grep "External Name" %t | grep -v X
-; RUN: grep "External Name" %t | grep Y | count 1
+; RUN: llc -O0 -asm-verbose -mtriple=x86_64-macosx < %s | FileCheck %s
+; CHECK-NOT: .asciz "X" ## External Name
+; CHECK: .asciz "Y" ## External Name
 ; Test to check type with no definition is listed in pubtypes section.
 %struct.X = type opaque
 %struct.Y = type { i32 }
diff --git a/test/DebugInfo/X86/DW_AT_byte_size.ll b/test/DebugInfo/X86/DW_AT_byte_size.ll
index 59280e027f351..25b5f00c6af64 100644
--- a/test/DebugInfo/X86/DW_AT_byte_size.ll
+++ b/test/DebugInfo/X86/DW_AT_byte_size.ll
@@ -4,7 +4,8 @@
 ; Checks that we don't emit a size for a pointer type.
 ; CHECK: DW_TAG_pointer_type
 ; CHECK-NEXT: DW_AT_type
-; CHECK-NOT-NEXT: DW_AT_byte_size
+; CHECK-NOT: DW_AT_byte_size
+; CHECK: .debug_info contents
 
 %struct.A = type { i32 }
 
diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll
new file mode 100644
index 0000000000000..163a1e7cec731
--- /dev/null
+++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x00bf => {0x000000bf})
+; CHECK: 0x000000bf:     DW_TAG_formal_parameter [12]
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]     ( .debug_str[0x00000085] = "this")
+
+%class.A = type { i32 }
+
+define i32 @_Z3foov() nounwind uwtable ssp {
+entry:
+  %a = alloca %class.A, align 4 ; stack slot for the local object
+  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !21), !dbg !23 ; !21 is the auto variable "a"
+  call void @_ZN1AC1Ev(%class.A* %a), !dbg !24 ; %a is passed as the callee's %this argument
+  %m_a = getelementptr inbounds %class.A* %a, i32 0, i32 0, !dbg !25 ; address of field 0, the member named m_a in the metadata
+  %0 = load i32* %m_a, align 4, !dbg !25
+  ret i32 %0, !dbg !25 ; returns the value of that field
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define linkonce_odr void @_ZN1AC1Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.A*, align 8 ; stack copy of the incoming pointer
+  store %class.A* %this, %class.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !26), !dbg !28 ; !26 is the arg variable "this" of this function
+  %this1 = load %class.A** %this.addr
+  call void @_ZN1AC2Ev(%class.A* %this1), !dbg !29 ; forwards the same pointer to _ZN1AC2Ev
+  ret void, !dbg !29
+}
+
+define linkonce_odr void @_ZN1AC2Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.A*, align 8 ; stack copy of the incoming pointer
+  store %class.A* %this, %class.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30), !dbg !31 ; !30 is the arg variable "this" of this function
+  %this1 = load %class.A** %this.addr
+  %m_a = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !32 ; address of field 0, the member named m_a in the metadata
+  store i32 0, i32* %m_a, align 4, !dbg !32 ; zero-initialise that field
+  ret void, !dbg !34
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", metadata !"clang version 3.2 (trunk 163586) (llvm/trunk 163570)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !10, metadata !20}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !6, i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
+!6 = metadata !{i32 786473, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata !"_ZN1AC1Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC1Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{null, metadata !13}
+!13 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!14 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [from ]
+!15 = metadata !{metadata !16, metadata !17}
+!16 = metadata !{i32 786445, metadata !14, metadata !"m_a", metadata !6, i32 4, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int]
+!17 = metadata !{i32 786478, i32 0, metadata !14, metadata !"A", metadata !"A", metadata !"", metadata !6, i32 3, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A]
+!18 = metadata !{metadata !19}
+!19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata !"_ZN1AC2Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC2Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!21 = metadata !{i32 786688, metadata !22, metadata !"a", metadata !6, i32 8, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 8]
+!22 = metadata !{i32 786443, metadata !5, i32 7, i32 11, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!23 = metadata !{i32 8, i32 5, metadata !22, null}
+!24 = metadata !{i32 8, i32 6, metadata !22, null}
+!25 = metadata !{i32 9, i32 3, metadata !22, null}
+!26 = metadata !{i32 786689, metadata !10, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!27 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!28 = metadata !{i32 3, i32 3, metadata !10, null}
+!29 = metadata !{i32 3, i32 18, metadata !10, null}
+!30 = metadata !{i32 786689, metadata !20, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!31 = metadata !{i32 3, i32 3, metadata !20, null}
+!32 = metadata !{i32 3, i32 9, metadata !33, null}
+!33 = metadata !{i32 786443, metadata !20, i32 3, i32 7, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!34 = metadata !{i32 3, i32 18, metadata !33, null}
diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll
index a22707189b086..58fb05573670f 100644
--- a/test/DebugInfo/X86/concrete_out_of_line.ll
+++ b/test/DebugInfo/X86/concrete_out_of_line.ll
@@ -7,16 +7,15 @@
 ; first check that we have a TAG_subprogram at a given offset and it has
 ; AT_inline.
 
-; CHECK: 0x00000134:   DW_TAG_subprogram [18]
-; CHECK-NEXT:     DW_AT_MIPS_linkage_name
+; CHECK: 0x0000011e:   DW_TAG_subprogram [18]
 ; CHECK-NEXT:     DW_AT_specification
 ; CHECK-NEXT:     DW_AT_inline
 
 
 ; and then that a TAG_subprogram refers to it with AT_abstract_origin.
 
-; CHECK: 0x00000184:   DW_TAG_subprogram [20]
-; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4]    (cu + 0x0134 => {0x00000134})
+; CHECK: 0x0000015f:   DW_TAG_subprogram [20]
+; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4]    (cu + 0x011e => {0x0000011e})
 
 define i32 @_ZN17nsAutoRefCnt7ReleaseEv() {
 entry:
diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll
new file mode 100644
index 0000000000000..b908bcefe4781
--- /dev/null
+++ b/test/DebugInfo/X86/elf-names.ll
@@ -0,0 +1,109 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: 0x0000000b: DW_TAG_compile_unit
+; CHECK: 0x00000012:   DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000035] = "foo.cpp")
+; CHECK: 0x0000003c:   DW_TAG_class_type
+; CHECK: 0x0000003d:     DW_AT_name [DW_FORM_strp]       ( .debug_str[0x0000006d] = "D")
+; CHECK: 0x00000044:     DW_TAG_member
+; CHECK: 0x00000045:       DW_AT_name [DW_FORM_strp]     ( .debug_str[0x0000005d] = "c1")
+
+%class.D = type { i32, i32, i32, i32 }
+
+@_ZN1DC1Ev = alias void (%class.D*)* @_ZN1DC2Ev
+@_ZN1DC1ERKS_ = alias void (%class.D*, %class.D*)* @_ZN1DC2ERKS_
+
+define void @_ZN1DC2Ev(%class.D* nocapture %this) unnamed_addr nounwind uwtable align 2 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !29), !dbg !36 ; !29 is the arg variable "this" of this function
+  %c1 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !37 ; field 0
+  store i32 1, i32* %c1, align 4, !dbg !37, !tbaa !39 ; field 0 = 1
+  %c2 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !42 ; field 1
+  store i32 2, i32* %c2, align 4, !dbg !42, !tbaa !39 ; field 1 = 2
+  %c3 = getelementptr inbounds %class.D* %this, i64 0, i32 2, !dbg !43 ; field 2
+  store i32 3, i32* %c3, align 4, !dbg !43, !tbaa !39 ; field 2 = 3
+  %c4 = getelementptr inbounds %class.D* %this, i64 0, i32 3, !dbg !44 ; field 3
+  store i32 4, i32* %c4, align 4, !dbg !44, !tbaa !39 ; field 3 = 4
+  ret void, !dbg !45
+}
+
+define void @_ZN1DC2ERKS_(%class.D* nocapture %this, %class.D* nocapture %d) unnamed_addr nounwind uwtable align 2 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !34), !dbg !46 ; !34 is the arg variable "this"
+  tail call void @llvm.dbg.value(metadata !{%class.D* %d}, i64 0, metadata !35), !dbg !46 ; !35 is the arg variable "d"
+  %c1 = getelementptr inbounds %class.D* %d, i64 0, i32 0, !dbg !47 ; source field 0
+  %0 = load i32* %c1, align 4, !dbg !47, !tbaa !39
+  %c12 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !47 ; destination field 0
+  store i32 %0, i32* %c12, align 4, !dbg !47, !tbaa !39 ; copy field 0 from %d to %this
+  %c2 = getelementptr inbounds %class.D* %d, i64 0, i32 1, !dbg !49 ; source field 1
+  %1 = load i32* %c2, align 4, !dbg !49, !tbaa !39
+  %c23 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !49 ; destination field 1
+  store i32 %1, i32* %c23, align 4, !dbg !49, !tbaa !39 ; copy field 1 from %d to %this
+  %c3 = getelementptr inbounds %class.D* %d, i64 0, i32 2, !dbg !50 ; source field 2
+  %2 = load i32* %c3, align 4, !dbg !50, !tbaa !39
+  %c34 = getelementptr inbounds %class.D* %this, i64 0, i32 2, !dbg !50 ; destination field 2
+  store i32 %2, i32* %c34, align 4, !dbg !50, !tbaa !39 ; copy field 2 from %d to %this
+  %c4 = getelementptr inbounds %class.D* %d, i64 0, i32 3, !dbg !51 ; source field 3
+  %3 = load i32* %c4, align 4, !dbg !51, !tbaa !39
+  %c45 = getelementptr inbounds %class.D* %this, i64 0, i32 3, !dbg !51 ; destination field 3
+  store i32 %3, i32* %c45, align 4, !dbg !51, !tbaa !39 ; copy field 3 from %d to %this
+  ret void, !dbg !52
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", metadata !"clang version 3.2 (trunk 167506) (llvm/trunk 167505)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !31}
+!5 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2Ev", metadata !6, i32 12, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, metadata !27, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [D]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9}
+!9 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!10 = metadata !{i32 786434, null, metadata !"D", metadata !6, i32 1, i64 128, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [from ]
+!11 = metadata !{metadata !12, metadata !14, metadata !15, metadata !16, metadata !17, metadata !20}
+!12 = metadata !{i32 786445, metadata !10, metadata !"c1", metadata !6, i32 6, i64 32, i64 32, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int]
+!13 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{i32 786445, metadata !10, metadata !"c2", metadata !6, i32 7, i64 32, i64 32, i64 32, i32 1, metadata !13} ; [ DW_TAG_member ] [c2] [line 7, size 32, align 32, offset 32] [private] [from int]
+!15 = metadata !{i32 786445, metadata !10, metadata !"c3", metadata !6, i32 8, i64 32, i64 32, i64 64, i32 1, metadata !13} ; [ DW_TAG_member ] [c3] [line 8, size 32, align 32, offset 64] [private] [from int]
+!16 = metadata !{i32 786445, metadata !10, metadata !"c4", metadata !6, i32 9, i64 32, i64 32, i64 96, i32 1, metadata !13} ; [ DW_TAG_member ] [c4] [line 9, size 32, align 32, offset 96] [private] [from int]
+!17 = metadata !{i32 786478, i32 0, metadata !10, metadata !"D", metadata !"D", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [D]
+!18 = metadata !{metadata !19}
+!19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = metadata !{i32 786478, i32 0, metadata !10, metadata !"D", metadata !"D", metadata !"", metadata !6, i32 4, metadata !21, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !25, i32 4} ; [ DW_TAG_subprogram ] [line 4] [D]
+!21 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{null, metadata !9, metadata !23}
+!23 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D]
+!25 = metadata !{metadata !26}
+!26 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!27 = metadata !{metadata !28}
+!28 = metadata !{metadata !29}
+!29 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777228, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 12]
+!30 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!31 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2ERKS_", metadata !6, i32 19, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*, %class.D*)* @_ZN1DC2ERKS_, null, metadata !20, metadata !32, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [D]
+!32 = metadata !{metadata !33}
+!33 = metadata !{metadata !34, metadata !35}
+!34 = metadata !{i32 786689, metadata !31, metadata !"this", metadata !6, i32 16777235, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 19]
+!35 = metadata !{i32 786689, metadata !31, metadata !"d", metadata !6, i32 33554451, metadata !23, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 19]
+!36 = metadata !{i32 12, i32 0, metadata !5, null}
+!37 = metadata !{i32 13, i32 0, metadata !38, null}
+!38 = metadata !{i32 786443, metadata !5, i32 12, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!39 = metadata !{metadata !"int", metadata !40}
+!40 = metadata !{metadata !"omnipotent char", metadata !41}
+!41 = metadata !{metadata !"Simple C/C++ TBAA"}
+!42 = metadata !{i32 14, i32 0, metadata !38, null}
+!43 = metadata !{i32 15, i32 0, metadata !38, null}
+!44 = metadata !{i32 16, i32 0, metadata !38, null}
+!45 = metadata !{i32 17, i32 0, metadata !38, null}
+!46 = metadata !{i32 19, i32 0, metadata !31, null}
+!47 = metadata !{i32 20, i32 0, metadata !48, null}
+!48 = metadata !{i32 786443, metadata !31, i32 19, i32 0, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!49 = metadata !{i32 21, i32 0, metadata !48, null}
+!50 = metadata !{i32 22, i32 0, metadata !48, null}
+!51 = metadata !{i32 23, i32 0, metadata !48, null}
+!52 = metadata !{i32 24, i32 0, metadata !48, null}
diff --git a/test/DebugInfo/X86/enum-fwd-decl.ll b/test/DebugInfo/X86/enum-fwd-decl.ll
index c2dacea4839a6..0902430008c18 100644
--- a/test/DebugInfo/X86/enum-fwd-decl.ll
+++ b/test/DebugInfo/X86/enum-fwd-decl.ll
@@ -5,16 +5,14 @@
 
 !llvm.dbg.cu = !{!0}
 
-!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 157772) (llvm/trunk 157761)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !6, metadata !6, metadata !7} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/tmp", metadata !"clang version 3.2 (trunk 165274) (llvm/trunk 165272)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !2}
-!2 = metadata !{metadata !3}
-!3 = metadata !{i32 786436, null, metadata !"E", metadata !4, i32 1, i64 16, i64 16, i32 0, i32 4, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
-!4 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 0}
-!6 = metadata !{metadata !5}
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !9}
-!9 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !4, i32 2, metadata !3, i32 0, i32 1, i16* @e} ; [ DW_TAG_variable ]
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i16* @e} ; [ DW_TAG_variable ] [e] [line 2] [def]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/tmp", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786436, null, metadata !"E", metadata !6, i32 1, i64 16, i64 16, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [fwd] [from ]
 
 ; CHECK: DW_TAG_enumeration_type
 ; CHECK-NEXT: DW_AT_name
diff --git a/test/DebugInfo/X86/linkage-name.ll b/test/DebugInfo/X86/linkage-name.ll
new file mode 100644
index 0000000000000..b98492383ac30
--- /dev/null
+++ b/test/DebugInfo/X86/linkage-name.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=x86_64-macosx -darwin-gdb-compat=Disable %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: DW_TAG_subprogram [9] *
+; CHECK-NOT: DW_AT_MIPS_linkage_name
+; CHECK: DW_AT_specification
+
+%class.A = type { i8 }
+
+@a = global %class.A zeroinitializer, align 1
+
+define i32 @_ZN1A1aEi(%class.A* %this, i32 %b) nounwind uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.A*, align 8
+  %b.addr = alloca i32, align 4
+  store %class.A* %this, %class.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !21), !dbg !23
+  store i32 %b, i32* %b.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !24), !dbg !25
+  %this1 = load %class.A** %this.addr
+  %0 = load i32* %b.addr, align 4, !dbg !26
+  ret i32 %0, !dbg !26
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo", metadata !"clang version 3.1 (trunk 152691) (llvm/trunk 152692)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, null, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, metadata !16} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{metadata !9, metadata !10, metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!10 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !12, i32 0, null, null} ; [ DW_TAG_class_type ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786478, i32 0, metadata !11, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !14} ; [ DW_TAG_subprogram ]
+!14 = metadata !{metadata !15}
+!15 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
+!16 = metadata !{metadata !17}
+!17 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
+!18 = metadata !{metadata !19}
+!19 = metadata !{metadata !20}
+!20 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 9, metadata !11, i32 0, i32 1, %class.A* @a} ; [ DW_TAG_variable ]
+!21 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777221, metadata !22, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{i32 5, i32 8, metadata !5, null}
+!24 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554437, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{i32 5, i32 14, metadata !5, null}
+!26 = metadata !{i32 6, i32 4, metadata !27, null}
+!27 = metadata !{i32 786443, metadata !5, i32 5, i32 17, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
diff --git a/test/DebugInfo/X86/pr13303.ll b/test/DebugInfo/X86/pr13303.ll
deleted file mode 100644
index e820cb564cf01..0000000000000
--- a/test/DebugInfo/X86/pr13303.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc %s -o %t -filetype=obj -mtriple=x86_64-unknown-linux-gnu
-; RUN: llvm-dwarfdump %t | FileCheck %s
-; PR13303
-
-; Check that the prologue ends with is_stmt here.
-; CHECK: 0x0000000000000000 {{.*}} is_stmt
-
-define i32 @main() nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  ret i32 0, !dbg !10
-}
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"PR13303.c", metadata !"/home/probinson", metadata !"clang version 3.2 (trunk 160143)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!6 = metadata !{i32 786473, metadata !"PR13303.c", metadata !"/home/probinson", null} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 1, i32 14, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !5, i32 1, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c]
diff --git a/test/DebugInfo/X86/prologue-stack.ll b/test/DebugInfo/X86/prologue-stack.ll
new file mode 100644
index 0000000000000..929db51902670
--- /dev/null
+++ b/test/DebugInfo/X86/prologue-stack.ll
@@ -0,0 +1,35 @@
+; RUN: llc -disable-fp-elim -O0 %s -mtriple x86_64-unknown-linux-gnu -o - | FileCheck %s
+
+; int callme(int);
+; int isel_line_test2() {
+;   callme(400);
+;   return 0;
+; }
+
+define i32 @isel_line_test2() nounwind uwtable {
+  ; The stack adjustment should be part of the prologue.
+  ; CHECK: isel_line_test2:
+  ; CHECK: {{subq|leaq}} {{.*}}, %rsp
+  ; CHECK: .loc 1 5 3 prologue_end
+entry:
+  %call = call i32 @callme(i32 400), !dbg !10
+  ret i32 0, !dbg !12
+}
+
+declare i32 @callme(i32)
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp", metadata !"clang version 3.2 (trunk 164980) (llvm/trunk 164979)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"isel_line_test2", metadata !"isel_line_test2", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @isel_line_test2, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2]
+!6 = metadata !{i32 786473, metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 5, i32 3, metadata !11, null}
+!11 = metadata !{i32 786443, metadata !5, i32 4, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/bar.c]
+!12 = metadata !{i32 6, i32 3, metadata !11, null}
diff --git a/test/DebugInfo/X86/stringpool.ll b/test/DebugInfo/X86/stringpool.ll
index 2cd100156aada..caf12c2756e0f 100644
--- a/test/DebugInfo/X86/stringpool.ll
+++ b/test/DebugInfo/X86/stringpool.ll
@@ -16,8 +16,8 @@
 
 ; Verify that we refer to 'yyyy' with a relocation.
 ; LINUX:      .long   .Lstring3               # DW_AT_name
-; LINUX-NEXT: .long   39                      # DW_AT_type
-; LINUX-NEXT: .byte   1                       # DW_AT_external
+; LINUX-NEXT: .long   38                      # DW_AT_type
+; LINUX-NEXT:                                 # DW_AT_external
 ; LINUX-NEXT: .byte   1                       # DW_AT_decl_file
 ; LINUX-NEXT: .byte   1                       # DW_AT_decl_line
 ; LINUX-NEXT: .byte   9                       # DW_AT_location
diff --git a/test/DebugInfo/bug_null_debuginfo.ll b/test/DebugInfo/bug_null_debuginfo.ll
index a7fdf70d71c79..b17affed893cb 100644
--- a/test/DebugInfo/bug_null_debuginfo.ll
+++ b/test/DebugInfo/bug_null_debuginfo.ll
@@ -1,5 +1,4 @@
-; RUN: llc
-
+; RUN: llc < %s
 
 !llvm.dbg.cu = !{!0}
 
diff --git a/test/DebugInfo/dwarfdump-inlining.test b/test/DebugInfo/dwarfdump-inlining.test
new file mode 100644
index 0000000000000..d3a7e12a87036
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-inlining.test
@@ -0,0 +1,28 @@
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x613 \
+RUN:   --inlining --functions | FileCheck %s -check-prefix DEEP_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x6de \
+RUN:   --inlining | FileCheck %s -check-prefix SHORTER_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x685 \
+RUN:   --inlining | FileCheck %s -check-prefix SHORT_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x640 \
+RUN:   --functions | FileCheck %s -check-prefix INL_FUNC_NAME
+
+DEEP_STACK:      inlined_h
+DEEP_STACK-NEXT: header.h:2:21
+DEEP_STACK-NEXT: inlined_g
+DEEP_STACK-NEXT: header.h:7
+DEEP_STACK-NEXT: inlined_f
+DEEP_STACK-NEXT: main.cc:3
+DEEP_STACK-NEXT: main
+DEEP_STACK-NEXT: main.cc:8
+
+SHORTER_STACK:      header.h:7:20
+SHORTER_STACK-NEXT: main.cc:3
+SHORTER_STACK-NEXT: main.cc:8
+
+SHORT_STACK:      main.cc:3:20
+SHORT_STACK-NEXT: main.cc:8
+
+INL_FUNC_NAME:      inlined_g
+INL_FUNC_NAME-NEXT: header.h:7:20
+
diff --git a/test/DebugInfo/dwarfdump-test.test b/test/DebugInfo/dwarfdump-test.test
index de23dcd9c2786..973c3447e340a 100644
--- a/test/DebugInfo/dwarfdump-test.test
+++ b/test/DebugInfo/dwarfdump-test.test
@@ -17,6 +17,8 @@ RUN:   --address=0x56d --functions | FileCheck %s -check-prefix INCLUDE_TEST_2
 RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \
 RUN:   --address=0x55c --functions \
 RUN:   | FileCheck %s -check-prefix MANY_SEQ_IN_LINE_TABLE
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \
+RUN:   | FileCheck %s -check-prefix DEBUG_RANGES
 
 MAIN: main
 MAIN-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16:10
@@ -44,3 +46,11 @@ INCLUDE_TEST_2-NEXT: /tmp/include{{[/\\]}}decl.h:5:0
 
 MANY_SEQ_IN_LINE_TABLE: _Z1cv
 MANY_SEQ_IN_LINE_TABLE-NEXT: /tmp/dbginfo/sequences{{[/\\]}}c.cc:2:0
+
+DEBUG_RANGES:      .debug_ranges contents:
+DEBUG_RANGES-NEXT: 00000000 000000000000055c 0000000000000567
+DEBUG_RANGES-NEXT: 00000000 0000000000000567 000000000000056d
+DEBUG_RANGES-NEXT: 00000000 <End of list>
+DEBUG_RANGES-NEXT: 00000030 0000000000000570 000000000000057b
+DEBUG_RANGES-NEXT: 00000030 0000000000000567 000000000000056d
+DEBUG_RANGES-NEXT: 00000030 <End of list>
diff --git a/test/ExecutionEngine/2002-12-16-ArgTest.ll b/test/ExecutionEngine/2002-12-16-ArgTest.ll
index eb2fe8c048320..4c03519a85afa 100644
--- a/test/ExecutionEngine/2002-12-16-ArgTest.ll
+++ b/test/ExecutionEngine/2002-12-16-ArgTest.ll
@@ -1,4 +1,5 @@
 ; RUN: %lli %s > /dev/null
+; XFAIL: arm
 
 @.LC0 = internal global [10 x i8] c"argc: %d\0A\00"		; <[10 x i8]*> [#uses=1]
 
diff --git a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
index 46273d3400952..28cc54a868061 100644
--- a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
+++ b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 @.LC0 = internal global [10 x i8] c"argc: %d\0A\00"		; <[10 x i8]*> [#uses=1]
 
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
index 88bfbb3c09bb2..9f895983fdb16 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @foo(i32 %X, i32 %Y, double %A) {
 	%cond212 = fcmp une double %A, 1.000000e+00		; <i1> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
index d5f860d17048d..997b2a9037ee3 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 	call i32 @mylog( i32 4 )		; <i32>:1 [#uses=0]
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
index 721f2e8859dcd..ba35b5bcc436e 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 ; <label>:0
diff --git a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
index d17df997c817c..f3c88adf84355 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; We were accidentally inverting the signedness of right shifts.  Whoops.
 
diff --git a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
index e55cb06aa1e6b..f925e79f24849 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 	%X = fadd double 0.000000e+00, 1.000000e+00		; <double> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
index 663dc40010795..5b426f6c330b5 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @bar(i8* %X) {
         ; pointer should be 4 byte aligned!
diff --git a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
index e95294be74a41..c0a7393f8244f 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
@@ -1,6 +1,6 @@
 ; This testcase should return with an exit code of 1.
 ;
-; RUN: not %lli -use-mcjit %s
+; RUN: not %lli -mtriple=%mcjit_triple -use-mcjit %s
 
 @test = global i64 0		; <i64*> [#uses=1]
 
diff --git a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
index a237194ea48f3..d3e6204a85be1 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s test
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s test
 
 declare i32 @puts(i8*)
 
diff --git a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
index 70464a3ffcb79..55a1697541044 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 target datalayout = "e-p:32:32"
 
diff --git a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
index 58d423f924418..79c6e7fe4caee 100644
--- a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; Testcase distilled from 256.bzip2.
 
diff --git a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
index a22fe07b08599..ffd6df6e5e254 100644
--- a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; Testcase distilled from 256.bzip2.
 
diff --git a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
index b3c6d8abbc064..90839e96986ff 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; This testcase failed to work because two variable sized allocas confused the
 ; local register allocator.
diff --git a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
index bd32f3037ddcf..29ef2c556cd01 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ;
 ; Regression Test: EnvironmentTest.ll
diff --git a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
index 1959534b877af..2adb608acbb18 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; This testcase exposes a bug in the local register allocator where it runs out
 ; of registers (due to too many overlapping live ranges), but then attempts to
diff --git a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
index 1f8343fc43f55..91bde46903614 100644
--- a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
+++ b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 @A = global i32 0		; <i32*> [#uses=1]
 
diff --git a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
index 79a7d583ce616..a7462d9e698a7 100644
--- a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
+++ b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
@@ -1,5 +1,5 @@
 ; PR672
-; RUN: %lli -use-mcjit %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s
 ; XFAIL: mcjit-ia32
 
 define i32 @main() {
diff --git a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
index 52cef4d35ca65..2406596602522 100644
--- a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
+++ b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -force-interpreter %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter %s
 ; PR1836
 
 define i32 @main() {
diff --git a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
index a6e917f457b3c..d429d519e04f0 100644
--- a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
+++ b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -force-interpreter=true %s | grep 1
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s | grep 1
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
 target triple = "i686-pc-linux-gnu"
diff --git a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
index 524a724c474ba..a6d18e7919ccb 100644
--- a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
+++ b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -force-interpreter=true %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s > /dev/null
 
 define i32 @main() {
        %a = add i32 0, undef
diff --git a/test/ExecutionEngine/MCJIT/fpbitcast.ll b/test/ExecutionEngine/MCJIT/fpbitcast.ll
index 9da908f8cff1a..bb4957e9e66ed 100644
--- a/test/ExecutionEngine/MCJIT/fpbitcast.ll
+++ b/test/ExecutionEngine/MCJIT/fpbitcast.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -force-interpreter=true %s | grep 40091eb8
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s | grep 40091eb8
 ;
 define i32 @test(double %x) {
 entry:
diff --git a/test/ExecutionEngine/MCJIT/hello.ll b/test/ExecutionEngine/MCJIT/hello.ll
index a52b6d48af293..ceb9c12ab4bd8 100644
--- a/test/ExecutionEngine/MCJIT/hello.ll
+++ b/test/ExecutionEngine/MCJIT/hello.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 @.LC0 = internal global [12 x i8] c"Hello World\00"		; <[12 x i8]*> [#uses=1]
 
diff --git a/test/ExecutionEngine/MCJIT/hello2.ll b/test/ExecutionEngine/MCJIT/hello2.ll
index 670a6dd671cec..756fcadb1cafa 100644
--- a/test/ExecutionEngine/MCJIT/hello2.ll
+++ b/test/ExecutionEngine/MCJIT/hello2.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 @X = global i32 7		; <i32*> [#uses=0]
 @msg = internal global [13 x i8] c"Hello World\0A\00"		; <[13 x i8]*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/lit.local.cfg b/test/ExecutionEngine/MCJIT/lit.local.cfg
index 2980ce70811c1..fc29f651aa1f4 100644
--- a/test/ExecutionEngine/MCJIT/lit.local.cfg
+++ b/test/ExecutionEngine/MCJIT/lit.local.cfg
@@ -8,13 +8,17 @@ def getRoot(config):
 root = getRoot(config)
 
 targets = set(root.targets_to_build.split())
-if ('X86' in targets) | ('ARM' in targets) | ('Mips' in targets):
+if ('X86' in targets) | ('ARM' in targets) | ('Mips' in targets) | \
+   ('PowerPC' in targets):
     config.unsupported = False
 else:
     config.unsupported = True
 
-if root.host_arch not in ['x86', 'x86_64', 'ARM', 'Mips']:
+if root.host_arch not in ['x86', 'x86_64', 'ARM', 'Mips', 'PowerPC']:
     config.unsupported = True
 
-if root.host_os in ['Win32', 'Cygwin', 'MingW', 'Windows', 'Darwin']:
+if root.host_os in ['Darwin']:
+    config.unsupported = True
+
+if 'powerpc' in root.target_triple and not 'powerpc64' in root.target_triple:
     config.unsupported = True
diff --git a/test/ExecutionEngine/MCJIT/pr13727.ll b/test/ExecutionEngine/MCJIT/pr13727.ll
new file mode 100644
index 0000000000000..c33bf32810872
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/pr13727.ll
@@ -0,0 +1,88 @@
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 -disable-lazy-compilation=false %s
+
+; The intention of this test is to verify that symbols mapped to COMMON in ELF
+; work as expected.
+;
+; Compiled from this C code:
+;
+; int zero_int;
+; double zero_double;
+; int zero_arr[10];
+; 
+; int main()
+; {
+;     zero_arr[zero_int + 5] = 40;
+; 
+;     if (zero_double < 1.1)
+;         zero_arr[zero_int + 2] = 70;
+; 
+;     for (int i = 1; i < 10; ++i) {
+;         zero_arr[i] = zero_arr[i - 1] + zero_arr[i];
+;     }
+;     return zero_arr[9] == 110 ? 0 : -1;
+; }
+
+@zero_int = common global i32 0, align 4
+@zero_arr = common global [10 x i32] zeroinitializer, align 16
+@zero_double = common global double 0.000000e+00, align 8
+
+define i32 @main() nounwind {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* @zero_int, align 4
+  %add = add nsw i32 %0, 5
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom
+  store i32 40, i32* %arrayidx, align 4
+  %1 = load double* @zero_double, align 8
+  %cmp = fcmp olt double %1, 1.100000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %2 = load i32* @zero_int, align 4
+  %add1 = add nsw i32 %2, 2
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2
+  store i32 70, i32* %arrayidx3, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.end
+  %3 = load i32* %i, align 4
+  %cmp4 = icmp slt i32 %3, 10
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %4 = load i32* %i, align 4
+  %sub = sub nsw i32 %4, 1
+  %idxprom5 = sext i32 %sub to i64
+  %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5
+  %5 = load i32* %arrayidx6, align 4
+  %6 = load i32* %i, align 4
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7
+  %7 = load i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %5, %7
+  %8 = load i32* %i, align 4
+  %idxprom10 = sext i32 %8 to i64
+  %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10
+  store i32 %add9, i32* %arrayidx11, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %9 = load i32* %i, align 4
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4
+  %cmp12 = icmp eq i32 %10, 110
+  %cond = select i1 %cmp12, i32 0, i32 -1
+  ret i32 %cond
+}
diff --git a/test/ExecutionEngine/MCJIT/simplesttest.ll b/test/ExecutionEngine/MCJIT/simplesttest.ll
index a6688c237c0e5..02ad0061fd13f 100644
--- a/test/ExecutionEngine/MCJIT/simplesttest.ll
+++ b/test/ExecutionEngine/MCJIT/simplesttest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 	ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/simpletest.ll b/test/ExecutionEngine/MCJIT/simpletest.ll
index 4562aa6012ef6..958b783067e48 100644
--- a/test/ExecutionEngine/MCJIT/simpletest.ll
+++ b/test/ExecutionEngine/MCJIT/simpletest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @bar() {
 	ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/stubs.ll b/test/ExecutionEngine/MCJIT/stubs.ll
index b285b0eadb3fc..9e5d5b2e4186a 100644
--- a/test/ExecutionEngine/MCJIT/stubs.ll
+++ b/test/ExecutionEngine/MCJIT/stubs.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -disable-lazy-compilation=false %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -disable-lazy-compilation=false %s
 
 define i32 @main() nounwind {
 entry:
diff --git a/test/ExecutionEngine/MCJIT/test-arith.ll b/test/ExecutionEngine/MCJIT/test-arith.ll
index 31777604d577f..b73227fe635ea 100644
--- a/test/ExecutionEngine/MCJIT/test-arith.ll
+++ b/test/ExecutionEngine/MCJIT/test-arith.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 	%A = add i8 0, 12		; <i8> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-branch.ll b/test/ExecutionEngine/MCJIT/test-branch.ll
index 702c11022094f..8f3c7279051ee 100644
--- a/test/ExecutionEngine/MCJIT/test-branch.ll
+++ b/test/ExecutionEngine/MCJIT/test-branch.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; test unconditional branch
 define i32 @main() {
diff --git a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
index 6f284055fd92f..20150b2de626c 100644
--- a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
+++ b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @_Z14func_exit_codev() nounwind uwtable {
 entry:
diff --git a/test/ExecutionEngine/MCJIT/test-call.ll b/test/ExecutionEngine/MCJIT/test-call.ll
index 7a244ee505813..51d19fe991787 100644
--- a/test/ExecutionEngine/MCJIT/test-call.ll
+++ b/test/ExecutionEngine/MCJIT/test-call.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 declare void @exit(i32)
 
diff --git a/test/ExecutionEngine/MCJIT/test-cast.ll b/test/ExecutionEngine/MCJIT/test-cast.ll
index 75e7d1b423f39..dcc97f466568c 100644
--- a/test/ExecutionEngine/MCJIT/test-cast.ll
+++ b/test/ExecutionEngine/MCJIT/test-cast.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @foo() {
 	ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll
new file mode 100644
index 0000000000000..d666a2aa4aa35
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll
@@ -0,0 +1,32 @@
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s
+
+; This test checks that common symbols have been allocated addresses honouring
+; the alignment requirement.
+
+@CS1 = common global i32 0, align 16
+@CS2 = common global i8 0, align 1
+@CS3 = common global i32 0, align 16
+
+define i32 @main() nounwind {
+entry:
+    %retval = alloca i32, align 4
+    %ptr = alloca i32, align 4
+    store i32 0, i32* %retval
+    store i32 ptrtoint (i32* @CS3 to i32), i32* %ptr, align 4
+    %0 = load i32* %ptr, align 4
+    %and = and i32 %0, 15
+    %tobool = icmp ne i32 %and, 0
+    br i1 %tobool, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+    store i32 1, i32* %retval
+    br label %return
+
+if.else:                                          ; preds = %entry
+    store i32 0, i32* %retval
+    br label %return
+
+return:                                           ; preds = %if.else, %if.then
+    %1 = load i32* %retval
+    ret i32 %1
+}
diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols.ll b/test/ExecutionEngine/MCJIT/test-common-symbols.ll
index ac1d9acd954ef..8c8190291f18f 100644
--- a/test/ExecutionEngine/MCJIT/test-common-symbols.ll
+++ b/test/ExecutionEngine/MCJIT/test-common-symbols.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -O0 -disable-lazy-compilation=false %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 -disable-lazy-compilation=false %s
 
 ; The intention of this test is to verify that symbols mapped to COMMON in ELF
 ; work as expected.
diff --git a/test/ExecutionEngine/MCJIT/test-constantexpr.ll b/test/ExecutionEngine/MCJIT/test-constantexpr.ll
index 6b46639c51f9e..56c1290448ade 100644
--- a/test/ExecutionEngine/MCJIT/test-constantexpr.ll
+++ b/test/ExecutionEngine/MCJIT/test-constantexpr.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; This tests to make sure that we can evaluate weird constant expressions
 
diff --git a/test/ExecutionEngine/MCJIT/test-data-align.ll b/test/ExecutionEngine/MCJIT/test-data-align.ll
new file mode 100644
index 0000000000000..0493cba87fdbe
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-data-align.ll
@@ -0,0 +1,15 @@
+; RUN:  %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s
+
+; Check that a variable is always aligned as specified.
+
+@var = global i32 0, align 32
+define i32 @main() {
+  %addr = ptrtoint i32* @var to i64
+  %mask = and i64 %addr, 31
+  %tst = icmp eq i64 %mask, 0
+  br i1 %tst, label %good, label %bad
+good:
+  ret i32 0
+bad:
+  ret i32 1
+}
diff --git a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
index 35491df791770..7af1d8b539107 100644
--- a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
+++ b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-fp.ll b/test/ExecutionEngine/MCJIT/test-fp.ll
index 6fc5a501f6e64..f7e6fb9ba18e2 100644
--- a/test/ExecutionEngine/MCJIT/test-fp.ll
+++ b/test/ExecutionEngine/MCJIT/test-fp.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
index 4a790c6ff1743..ec6cbad2f14ec 100644
--- a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
+++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 @count = global i32 1, align 4
 
diff --git a/test/ExecutionEngine/MCJIT/test-global.ll b/test/ExecutionEngine/MCJIT/test-global.ll
index 94e0250769ec0..e7972f978e95c 100644
--- a/test/ExecutionEngine/MCJIT/test-global.ll
+++ b/test/ExecutionEngine/MCJIT/test-global.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 @count = global i32 0, align 4
 
diff --git a/test/ExecutionEngine/MCJIT/test-loadstore.ll b/test/ExecutionEngine/MCJIT/test-loadstore.ll
index e9171490e3523..f450d0ab528b1 100644
--- a/test/ExecutionEngine/MCJIT/test-loadstore.ll
+++ b/test/ExecutionEngine/MCJIT/test-loadstore.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) {
 	%V = load i8* %P		; <i8> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-local.ll b/test/ExecutionEngine/MCJIT/test-local.ll
index 4f5ae47dd0480..d4e9f444e4269 100644
--- a/test/ExecutionEngine/MCJIT/test-local.ll
+++ b/test/ExecutionEngine/MCJIT/test-local.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() nounwind uwtable {
 entry:
diff --git a/test/ExecutionEngine/MCJIT/test-logical.ll b/test/ExecutionEngine/MCJIT/test-logical.ll
index 0540c22fc6297..32f45ef119e6d 100644
--- a/test/ExecutionEngine/MCJIT/test-logical.ll
+++ b/test/ExecutionEngine/MCJIT/test-logical.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 	%A = and i8 4, 8		; <i8> [#uses=2]
diff --git a/test/ExecutionEngine/MCJIT/test-loop.ll b/test/ExecutionEngine/MCJIT/test-loop.ll
index b1dbf408996be..ebc689664d655 100644
--- a/test/ExecutionEngine/MCJIT/test-loop.ll
+++ b/test/ExecutionEngine/MCJIT/test-loop.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 ; <label>:0
diff --git a/test/ExecutionEngine/MCJIT/test-phi.ll b/test/ExecutionEngine/MCJIT/test-phi.ll
index fbc080862c835..1408533d7ae90 100644
--- a/test/ExecutionEngine/MCJIT/test-phi.ll
+++ b/test/ExecutionEngine/MCJIT/test-phi.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; test phi node
 @Y = global i32 6		; <i32*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll
new file mode 100644
index 0000000000000..93b6a6deffd11
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll
@@ -0,0 +1,16 @@
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s
+
+@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
+@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1
+@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly {
+entry:
+  %0 = load i8** @ptr, align 4
+  %1 = load i8** @ptr2, align 4
+  %cmp = icmp eq i8* %0, %1
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-ret.ll b/test/ExecutionEngine/MCJIT/test-ret.ll
index 1b90ee0750691..af282926907fb 100644
--- a/test/ExecutionEngine/MCJIT/test-ret.ll
+++ b/test/ExecutionEngine/MCJIT/test-ret.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 ; test return instructions
 define void @test1() {
diff --git a/test/ExecutionEngine/MCJIT/test-return.ll b/test/ExecutionEngine/MCJIT/test-return.ll
index 9c399cab38d9c..67f7107c3d7dd 100644
--- a/test/ExecutionEngine/MCJIT/test-return.ll
+++ b/test/ExecutionEngine/MCJIT/test-return.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() nounwind uwtable {
 entry:
diff --git a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
index 030ff317560b5..a8f4bd8529f8b 100644
--- a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
+++ b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 
 define i32 @main() {
diff --git a/test/ExecutionEngine/MCJIT/test-setcond-int.ll b/test/ExecutionEngine/MCJIT/test-setcond-int.ll
index 1113efee510fe..ed52b5065c844 100644
--- a/test/ExecutionEngine/MCJIT/test-setcond-int.ll
+++ b/test/ExecutionEngine/MCJIT/test-setcond-int.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 	%int1 = add i32 0, 0		; <i32> [#uses=6]
diff --git a/test/ExecutionEngine/MCJIT/test-shift.ll b/test/ExecutionEngine/MCJIT/test-shift.ll
index 2da824fecce98..5a5c10d560506 100644
--- a/test/ExecutionEngine/MCJIT/test-shift.ll
+++ b/test/ExecutionEngine/MCJIT/test-shift.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
 
 define i32 @main() {
 	%shamt = add i8 0, 1		; <i8> [#uses=8]
diff --git a/test/ExecutionEngine/lit.local.cfg b/test/ExecutionEngine/lit.local.cfg
index 19eebc0ac7ac3..f0343263dba61 100644
--- a/test/ExecutionEngine/lit.local.cfg
+++ b/test/ExecutionEngine/lit.local.cfg
@@ -1 +1,12 @@
 config.suffixes = ['.ll', '.c', '.cpp']
+
+def getRoot(config):
+    if not config.parent:
+        return config
+    return getRoot(config.parent)
+
+root = getRoot(config)
+
+if root.host_arch in ['PowerPC']:
+    config.unsupported = True
+
diff --git a/test/ExecutionEngine/test-fp-no-external-funcs.ll b/test/ExecutionEngine/test-fp-no-external-funcs.ll
index 61b12c2abeb72..139b2efea57fe 100644
--- a/test/ExecutionEngine/test-fp-no-external-funcs.ll
+++ b/test/ExecutionEngine/test-fp-no-external-funcs.ll
@@ -1,4 +1,5 @@
 ; RUN: %lli  %s > /dev/null
+; XFAIL: arm
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]
diff --git a/test/ExecutionEngine/test-fp.ll b/test/ExecutionEngine/test-fp.ll
index 2bf0210d8b002..c9064500d475a 100644
--- a/test/ExecutionEngine/test-fp.ll
+++ b/test/ExecutionEngine/test-fp.ll
@@ -1,4 +1,5 @@
 ; RUN: %lli %s > /dev/null
+; XFAIL: arm
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]
diff --git a/test/Feature/linker_private_linkages.ll b/test/Feature/linker_private_linkages.ll
index f9f2908756451..19bcbb40aa017 100644
--- a/test/Feature/linker_private_linkages.ll
+++ b/test/Feature/linker_private_linkages.ll
@@ -4,4 +4,3 @@
 
 @foo = linker_private hidden global i32 0
 @bar = linker_private_weak hidden global i32 0
-@qux = linker_private_weak_def_auto global i32 0
diff --git a/test/Feature/minsize_attr.ll b/test/Feature/minsize_attr.ll
new file mode 100644
index 0000000000000..51b133c4bdb75
--- /dev/null
+++ b/test/Feature/minsize_attr.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+define void @test1() minsize {
+; CHECK: define void @test1() minsize
+        ret void
+}
+
diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll
index d190001870600..655f69c16fdf3 100644
--- a/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/test/Instrumentation/AddressSanitizer/basic.ll
@@ -69,3 +69,23 @@ entry:
   store i32 42, i32* %a
   ret void
 }
+
+; Check that asan leaves just one alloca.
+
+declare void @alloca_test_use([10 x i8]*)
+define void @alloca_test() address_safety {
+entry:
+  %x = alloca [10 x i8], align 1
+  %y = alloca [10 x i8], align 1
+  %z = alloca [10 x i8], align 1
+  call void @alloca_test_use([10 x i8]* %x)
+  call void @alloca_test_use([10 x i8]* %y)
+  call void @alloca_test_use([10 x i8]* %z)
+  ret void
+}
+
+; CHECK: define void @alloca_test()
+; CHECK: = alloca
+; CHECK-NOT: = alloca
+; CHECK: ret void
+
diff --git a/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll b/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll
new file mode 100644
index 0000000000000..28d4ac0c0f589
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll
@@ -0,0 +1,19 @@
+; This test checks that we are not instrumenting globals
+; that we created ourselves.
+; RUN: opt < %s -asan -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @_Z3barv() uwtable address_safety {
+entry:
+  %a = alloca i32, align 4
+  call void @_Z3fooPi(i32* %a)
+  ret void
+}
+
+declare void @_Z3fooPi(i32*)
+; We create one global string constant for the stack frame above.
+; Make sure we don't create any other global constants.
+; CHECK: = private constant
+; CHECK-NOT: = private constant
diff --git a/test/Instrumentation/AddressSanitizer/instrument_global.ll b/test/Instrumentation/AddressSanitizer/instrument_global.ll
index ba8d65a4fa4bf..3d92946087ecf 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_global.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_global.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; If a global is present, __asan_[un]register_globals should be called from
 ; module ctor/dtor
 
-; CHECK: llvm.global_dtors
 ; CHECK: llvm.global_ctors
+; CHECK: llvm.global_dtors
 
 ; CHECK: define internal void @asan.module_ctor
 ; CHECK-NOT: ret
diff --git a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
new file mode 100644
index 0000000000000..472551654e539
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -asan -asan-initialization-order -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+@xxx = global i32 0, align 4
+; Clang will emit the following metadata identifying @xxx as dynamically
+; initialized.
+!0 = metadata !{i32* @xxx}
+!llvm.asan.dynamically_initialized_globals = !{!0}
+
+define i32 @initializer() uwtable {
+entry:
+  ret i32 42
+}
+
+define internal void @__cxx_global_var_init() section ".text.startup" {
+entry:
+  %call = call i32 @initializer()
+  store i32 %call, i32* @xxx, align 4
+  ret void
+}
+
+define internal void @_GLOBAL__I_a() address_safety section ".text.startup" {
+entry:
+  call void @__cxx_global_var_init()
+  ret void
+}
+
+; Clang indicated that @xxx was dynamically initialized.
+; __asan_{before,after}_dynamic_init should be called from _GLOBAL__I_a
+
+; CHECK: define internal void @_GLOBAL__I_a
+; CHECK-NOT: ret
+; CHECK: call void @__asan_before_dynamic_init
+; CHECK: call void @__cxx_global_var_init
+; CHECK: call void @__asan_after_dynamic_init
+; CHECK: ret
diff --git a/test/Instrumentation/ThreadSanitizer/atomic.ll b/test/Instrumentation/ThreadSanitizer/atomic.ll
index 02bf215c6bffd..107dbdc0f227b 100644
--- a/test/Instrumentation/ThreadSanitizer/atomic.ll
+++ b/test/Instrumentation/ThreadSanitizer/atomic.ll
@@ -8,7 +8,7 @@ entry:
   ret i8 %0
 }
 ; CHECK: atomic8_load_unordered
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 1)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 0)
 
 define i8 @atomic8_load_monotonic(i8* %a) nounwind uwtable {
 entry:
@@ -16,7 +16,7 @@ entry:
   ret i8 %0
 }
 ; CHECK: atomic8_load_monotonic
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 1)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 0)
 
 define i8 @atomic8_load_acquire(i8* %a) nounwind uwtable {
 entry:
@@ -24,7 +24,7 @@ entry:
   ret i8 %0
 }
 ; CHECK: atomic8_load_acquire
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 4)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 2)
 
 define i8 @atomic8_load_seq_cst(i8* %a) nounwind uwtable {
 entry:
@@ -32,7 +32,7 @@ entry:
   ret i8 %0
 }
 ; CHECK: atomic8_load_seq_cst
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 32)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 5)
 
 define void @atomic8_store_unordered(i8* %a) nounwind uwtable {
 entry:
@@ -40,7 +40,7 @@ entry:
   ret void
 }
 ; CHECK: atomic8_store_unordered
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 1)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 0)
 
 define void @atomic8_store_monotonic(i8* %a) nounwind uwtable {
 entry:
@@ -48,7 +48,7 @@ entry:
   ret void
 }
 ; CHECK: atomic8_store_monotonic
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 1)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 0)
 
 define void @atomic8_store_release(i8* %a) nounwind uwtable {
 entry:
@@ -56,7 +56,7 @@ entry:
   ret void
 }
 ; CHECK: atomic8_store_release
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 8)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 3)
 
 define void @atomic8_store_seq_cst(i8* %a) nounwind uwtable {
 entry:
@@ -64,7 +64,287 @@ entry:
   ret void
 }
 ; CHECK: atomic8_store_seq_cst
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 32)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 5)
+
+define void @atomic8_xchg_monotonic(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i8* %a, i8 0 monotonic
+  ret void
+}
+; CHECK: atomic8_xchg_monotonic
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 0)
+
+define void @atomic8_add_monotonic(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw add i8* %a, i8 0 monotonic
+  ret void
+}
+; CHECK: atomic8_add_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 0)
+
+define void @atomic8_sub_monotonic(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i8* %a, i8 0 monotonic
+  ret void
+}
+; CHECK: atomic8_sub_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 0)
+
+define void @atomic8_and_monotonic(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw and i8* %a, i8 0 monotonic
+  ret void
+}
+; CHECK: atomic8_and_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 0)
+
+define void @atomic8_or_monotonic(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw or i8* %a, i8 0 monotonic
+  ret void
+}
+; CHECK: atomic8_or_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 0)
+
+define void @atomic8_xor_monotonic(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i8* %a, i8 0 monotonic
+  ret void
+}
+; CHECK: atomic8_xor_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 0)
+
+define void @atomic8_xchg_acquire(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i8* %a, i8 0 acquire
+  ret void
+}
+; CHECK: atomic8_xchg_acquire
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 2)
+
+define void @atomic8_add_acquire(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw add i8* %a, i8 0 acquire
+  ret void
+}
+; CHECK: atomic8_add_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 2)
+
+define void @atomic8_sub_acquire(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i8* %a, i8 0 acquire
+  ret void
+}
+; CHECK: atomic8_sub_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 2)
+
+define void @atomic8_and_acquire(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw and i8* %a, i8 0 acquire
+  ret void
+}
+; CHECK: atomic8_and_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 2)
+
+define void @atomic8_or_acquire(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw or i8* %a, i8 0 acquire
+  ret void
+}
+; CHECK: atomic8_or_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 2)
+
+define void @atomic8_xor_acquire(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i8* %a, i8 0 acquire
+  ret void
+}
+; CHECK: atomic8_xor_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 2)
+
+define void @atomic8_xchg_release(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i8* %a, i8 0 release
+  ret void
+}
+; CHECK: atomic8_xchg_release
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 3)
+
+define void @atomic8_add_release(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw add i8* %a, i8 0 release
+  ret void
+}
+; CHECK: atomic8_add_release
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 3)
+
+define void @atomic8_sub_release(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i8* %a, i8 0 release
+  ret void
+}
+; CHECK: atomic8_sub_release
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 3)
+
+define void @atomic8_and_release(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw and i8* %a, i8 0 release
+  ret void
+}
+; CHECK: atomic8_and_release
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 3)
+
+define void @atomic8_or_release(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw or i8* %a, i8 0 release
+  ret void
+}
+; CHECK: atomic8_or_release
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 3)
+
+define void @atomic8_xor_release(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i8* %a, i8 0 release
+  ret void
+}
+; CHECK: atomic8_xor_release
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 3)
+
+define void @atomic8_xchg_acq_rel(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i8* %a, i8 0 acq_rel
+  ret void
+}
+; CHECK: atomic8_xchg_acq_rel
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 4)
+
+define void @atomic8_add_acq_rel(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw add i8* %a, i8 0 acq_rel
+  ret void
+}
+; CHECK: atomic8_add_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 4)
+
+define void @atomic8_sub_acq_rel(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i8* %a, i8 0 acq_rel
+  ret void
+}
+; CHECK: atomic8_sub_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 4)
+
+define void @atomic8_and_acq_rel(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw and i8* %a, i8 0 acq_rel
+  ret void
+}
+; CHECK: atomic8_and_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 4)
+
+define void @atomic8_or_acq_rel(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw or i8* %a, i8 0 acq_rel
+  ret void
+}
+; CHECK: atomic8_or_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 4)
+
+define void @atomic8_xor_acq_rel(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i8* %a, i8 0 acq_rel
+  ret void
+}
+; CHECK: atomic8_xor_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 4)
+
+define void @atomic8_xchg_seq_cst(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i8* %a, i8 0 seq_cst
+  ret void
+}
+; CHECK: atomic8_xchg_seq_cst
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 5)
+
+define void @atomic8_add_seq_cst(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw add i8* %a, i8 0 seq_cst
+  ret void
+}
+; CHECK: atomic8_add_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 5)
+
+define void @atomic8_sub_seq_cst(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i8* %a, i8 0 seq_cst
+  ret void
+}
+; CHECK: atomic8_sub_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 5)
+
+define void @atomic8_and_seq_cst(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw and i8* %a, i8 0 seq_cst
+  ret void
+}
+; CHECK: atomic8_and_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 5)
+
+define void @atomic8_or_seq_cst(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw or i8* %a, i8 0 seq_cst
+  ret void
+}
+; CHECK: atomic8_or_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 5)
+
+define void @atomic8_xor_seq_cst(i8* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i8* %a, i8 0 seq_cst
+  ret void
+}
+; CHECK: atomic8_xor_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 5)
+
+define void @atomic8_cas_monotonic(i8* %a) nounwind uwtable {
+entry:
+  cmpxchg i8* %a, i8 0, i8 1 monotonic
+  ret void
+}
+; CHECK: atomic8_cas_monotonic
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 0)
+
+define void @atomic8_cas_acquire(i8* %a) nounwind uwtable {
+entry:
+  cmpxchg i8* %a, i8 0, i8 1 acquire
+  ret void
+}
+; CHECK: atomic8_cas_acquire
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 2)
+
+define void @atomic8_cas_release(i8* %a) nounwind uwtable {
+entry:
+  cmpxchg i8* %a, i8 0, i8 1 release
+  ret void
+}
+; CHECK: atomic8_cas_release
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 3)
+
+define void @atomic8_cas_acq_rel(i8* %a) nounwind uwtable {
+entry:
+  cmpxchg i8* %a, i8 0, i8 1 acq_rel
+  ret void
+}
+; CHECK: atomic8_cas_acq_rel
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 4)
+
+define void @atomic8_cas_seq_cst(i8* %a) nounwind uwtable {
+entry:
+  cmpxchg i8* %a, i8 0, i8 1 seq_cst
+  ret void
+}
+; CHECK: atomic8_cas_seq_cst
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 5)
 
 define i16 @atomic16_load_unordered(i16* %a) nounwind uwtable {
 entry:
@@ -72,7 +352,7 @@ entry:
   ret i16 %0
 }
 ; CHECK: atomic16_load_unordered
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 1)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 0)
 
 define i16 @atomic16_load_monotonic(i16* %a) nounwind uwtable {
 entry:
@@ -80,7 +360,7 @@ entry:
   ret i16 %0
 }
 ; CHECK: atomic16_load_monotonic
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 1)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 0)
 
 define i16 @atomic16_load_acquire(i16* %a) nounwind uwtable {
 entry:
@@ -88,7 +368,7 @@ entry:
   ret i16 %0
 }
 ; CHECK: atomic16_load_acquire
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 4)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 2)
 
 define i16 @atomic16_load_seq_cst(i16* %a) nounwind uwtable {
 entry:
@@ -96,7 +376,7 @@ entry:
   ret i16 %0
 }
 ; CHECK: atomic16_load_seq_cst
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 32)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 5)
 
 define void @atomic16_store_unordered(i16* %a) nounwind uwtable {
 entry:
@@ -104,7 +384,7 @@ entry:
   ret void
 }
 ; CHECK: atomic16_store_unordered
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 1)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 0)
 
 define void @atomic16_store_monotonic(i16* %a) nounwind uwtable {
 entry:
@@ -112,7 +392,7 @@ entry:
   ret void
 }
 ; CHECK: atomic16_store_monotonic
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 1)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 0)
 
 define void @atomic16_store_release(i16* %a) nounwind uwtable {
 entry:
@@ -120,7 +400,7 @@ entry:
   ret void
 }
 ; CHECK: atomic16_store_release
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 8)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 3)
 
 define void @atomic16_store_seq_cst(i16* %a) nounwind uwtable {
 entry:
@@ -128,7 +408,287 @@ entry:
   ret void
 }
 ; CHECK: atomic16_store_seq_cst
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 32)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 5)
+
+define void @atomic16_xchg_monotonic(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i16* %a, i16 0 monotonic
+  ret void
+}
+; CHECK: atomic16_xchg_monotonic
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 0)
+
+define void @atomic16_add_monotonic(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw add i16* %a, i16 0 monotonic
+  ret void
+}
+; CHECK: atomic16_add_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 0)
+
+define void @atomic16_sub_monotonic(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i16* %a, i16 0 monotonic
+  ret void
+}
+; CHECK: atomic16_sub_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 0)
+
+define void @atomic16_and_monotonic(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw and i16* %a, i16 0 monotonic
+  ret void
+}
+; CHECK: atomic16_and_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 0)
+
+define void @atomic16_or_monotonic(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw or i16* %a, i16 0 monotonic
+  ret void
+}
+; CHECK: atomic16_or_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 0)
+
+define void @atomic16_xor_monotonic(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i16* %a, i16 0 monotonic
+  ret void
+}
+; CHECK: atomic16_xor_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 0)
+
+define void @atomic16_xchg_acquire(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i16* %a, i16 0 acquire
+  ret void
+}
+; CHECK: atomic16_xchg_acquire
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 2)
+
+define void @atomic16_add_acquire(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw add i16* %a, i16 0 acquire
+  ret void
+}
+; CHECK: atomic16_add_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 2)
+
+define void @atomic16_sub_acquire(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i16* %a, i16 0 acquire
+  ret void
+}
+; CHECK: atomic16_sub_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 2)
+
+define void @atomic16_and_acquire(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw and i16* %a, i16 0 acquire
+  ret void
+}
+; CHECK: atomic16_and_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 2)
+
+define void @atomic16_or_acquire(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw or i16* %a, i16 0 acquire
+  ret void
+}
+; CHECK: atomic16_or_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 2)
+
+define void @atomic16_xor_acquire(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i16* %a, i16 0 acquire
+  ret void
+}
+; CHECK: atomic16_xor_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 2)
+
+define void @atomic16_xchg_release(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i16* %a, i16 0 release
+  ret void
+}
+; CHECK: atomic16_xchg_release
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 3)
+
+define void @atomic16_add_release(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw add i16* %a, i16 0 release
+  ret void
+}
+; CHECK: atomic16_add_release
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 3)
+
+define void @atomic16_sub_release(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i16* %a, i16 0 release
+  ret void
+}
+; CHECK: atomic16_sub_release
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 3)
+
+define void @atomic16_and_release(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw and i16* %a, i16 0 release
+  ret void
+}
+; CHECK: atomic16_and_release
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 3)
+
+define void @atomic16_or_release(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw or i16* %a, i16 0 release
+  ret void
+}
+; CHECK: atomic16_or_release
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 3)
+
+define void @atomic16_xor_release(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i16* %a, i16 0 release
+  ret void
+}
+; CHECK: atomic16_xor_release
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 3)
+
+define void @atomic16_xchg_acq_rel(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i16* %a, i16 0 acq_rel
+  ret void
+}
+; CHECK: atomic16_xchg_acq_rel
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 4)
+
+define void @atomic16_add_acq_rel(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw add i16* %a, i16 0 acq_rel
+  ret void
+}
+; CHECK: atomic16_add_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 4)
+
+define void @atomic16_sub_acq_rel(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i16* %a, i16 0 acq_rel
+  ret void
+}
+; CHECK: atomic16_sub_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 4)
+
+define void @atomic16_and_acq_rel(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw and i16* %a, i16 0 acq_rel
+  ret void
+}
+; CHECK: atomic16_and_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 4)
+
+define void @atomic16_or_acq_rel(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw or i16* %a, i16 0 acq_rel
+  ret void
+}
+; CHECK: atomic16_or_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 4)
+
+define void @atomic16_xor_acq_rel(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i16* %a, i16 0 acq_rel
+  ret void
+}
+; CHECK: atomic16_xor_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 4)
+
+define void @atomic16_xchg_seq_cst(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i16* %a, i16 0 seq_cst
+  ret void
+}
+; CHECK: atomic16_xchg_seq_cst
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 5)
+
+define void @atomic16_add_seq_cst(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw add i16* %a, i16 0 seq_cst
+  ret void
+}
+; CHECK: atomic16_add_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 5)
+
+define void @atomic16_sub_seq_cst(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i16* %a, i16 0 seq_cst
+  ret void
+}
+; CHECK: atomic16_sub_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 5)
+
+define void @atomic16_and_seq_cst(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw and i16* %a, i16 0 seq_cst
+  ret void
+}
+; CHECK: atomic16_and_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 5)
+
+define void @atomic16_or_seq_cst(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw or i16* %a, i16 0 seq_cst
+  ret void
+}
+; CHECK: atomic16_or_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 5)
+
+define void @atomic16_xor_seq_cst(i16* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i16* %a, i16 0 seq_cst
+  ret void
+}
+; CHECK: atomic16_xor_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 5)
+
+define void @atomic16_cas_monotonic(i16* %a) nounwind uwtable {
+entry:
+  cmpxchg i16* %a, i16 0, i16 1 monotonic
+  ret void
+}
+; CHECK: atomic16_cas_monotonic
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 0)
+
+define void @atomic16_cas_acquire(i16* %a) nounwind uwtable {
+entry:
+  cmpxchg i16* %a, i16 0, i16 1 acquire
+  ret void
+}
+; CHECK: atomic16_cas_acquire
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 2)
+
+define void @atomic16_cas_release(i16* %a) nounwind uwtable {
+entry:
+  cmpxchg i16* %a, i16 0, i16 1 release
+  ret void
+}
+; CHECK: atomic16_cas_release
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 3)
+
+define void @atomic16_cas_acq_rel(i16* %a) nounwind uwtable {
+entry:
+  cmpxchg i16* %a, i16 0, i16 1 acq_rel
+  ret void
+}
+; CHECK: atomic16_cas_acq_rel
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 4)
+
+define void @atomic16_cas_seq_cst(i16* %a) nounwind uwtable {
+entry:
+  cmpxchg i16* %a, i16 0, i16 1 seq_cst
+  ret void
+}
+; CHECK: atomic16_cas_seq_cst
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 5)
 
 define i32 @atomic32_load_unordered(i32* %a) nounwind uwtable {
 entry:
@@ -136,7 +696,7 @@ entry:
   ret i32 %0
 }
 ; CHECK: atomic32_load_unordered
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 1)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 0)
 
 define i32 @atomic32_load_monotonic(i32* %a) nounwind uwtable {
 entry:
@@ -144,7 +704,7 @@ entry:
   ret i32 %0
 }
 ; CHECK: atomic32_load_monotonic
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 1)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 0)
 
 define i32 @atomic32_load_acquire(i32* %a) nounwind uwtable {
 entry:
@@ -152,7 +712,7 @@ entry:
   ret i32 %0
 }
 ; CHECK: atomic32_load_acquire
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 4)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 2)
 
 define i32 @atomic32_load_seq_cst(i32* %a) nounwind uwtable {
 entry:
@@ -160,7 +720,7 @@ entry:
   ret i32 %0
 }
 ; CHECK: atomic32_load_seq_cst
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 32)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 5)
 
 define void @atomic32_store_unordered(i32* %a) nounwind uwtable {
 entry:
@@ -168,7 +728,7 @@ entry:
   ret void
 }
 ; CHECK: atomic32_store_unordered
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 1)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 0)
 
 define void @atomic32_store_monotonic(i32* %a) nounwind uwtable {
 entry:
@@ -176,7 +736,7 @@ entry:
   ret void
 }
 ; CHECK: atomic32_store_monotonic
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 1)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 0)
 
 define void @atomic32_store_release(i32* %a) nounwind uwtable {
 entry:
@@ -184,7 +744,7 @@ entry:
   ret void
 }
 ; CHECK: atomic32_store_release
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 8)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 3)
 
 define void @atomic32_store_seq_cst(i32* %a) nounwind uwtable {
 entry:
@@ -192,7 +752,287 @@ entry:
   ret void
 }
 ; CHECK: atomic32_store_seq_cst
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 32)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 5)
+
+define void @atomic32_xchg_monotonic(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i32* %a, i32 0 monotonic
+  ret void
+}
+; CHECK: atomic32_xchg_monotonic
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 0)
+
+define void @atomic32_add_monotonic(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw add i32* %a, i32 0 monotonic
+  ret void
+}
+; CHECK: atomic32_add_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 0)
+
+define void @atomic32_sub_monotonic(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i32* %a, i32 0 monotonic
+  ret void
+}
+; CHECK: atomic32_sub_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 0)
+
+define void @atomic32_and_monotonic(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw and i32* %a, i32 0 monotonic
+  ret void
+}
+; CHECK: atomic32_and_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 0)
+
+define void @atomic32_or_monotonic(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw or i32* %a, i32 0 monotonic
+  ret void
+}
+; CHECK: atomic32_or_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 0)
+
+define void @atomic32_xor_monotonic(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i32* %a, i32 0 monotonic
+  ret void
+}
+; CHECK: atomic32_xor_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 0)
+
+define void @atomic32_xchg_acquire(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i32* %a, i32 0 acquire
+  ret void
+}
+; CHECK: atomic32_xchg_acquire
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 2)
+
+define void @atomic32_add_acquire(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw add i32* %a, i32 0 acquire
+  ret void
+}
+; CHECK: atomic32_add_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 2)
+
+define void @atomic32_sub_acquire(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i32* %a, i32 0 acquire
+  ret void
+}
+; CHECK: atomic32_sub_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 2)
+
+define void @atomic32_and_acquire(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw and i32* %a, i32 0 acquire
+  ret void
+}
+; CHECK: atomic32_and_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 2)
+
+define void @atomic32_or_acquire(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw or i32* %a, i32 0 acquire
+  ret void
+}
+; CHECK: atomic32_or_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 2)
+
+define void @atomic32_xor_acquire(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i32* %a, i32 0 acquire
+  ret void
+}
+; CHECK: atomic32_xor_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 2)
+
+define void @atomic32_xchg_release(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i32* %a, i32 0 release
+  ret void
+}
+; CHECK: atomic32_xchg_release
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 3)
+
+define void @atomic32_add_release(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw add i32* %a, i32 0 release
+  ret void
+}
+; CHECK: atomic32_add_release
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 3)
+
+define void @atomic32_sub_release(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i32* %a, i32 0 release
+  ret void
+}
+; CHECK: atomic32_sub_release
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 3)
+
+define void @atomic32_and_release(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw and i32* %a, i32 0 release
+  ret void
+}
+; CHECK: atomic32_and_release
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 3)
+
+define void @atomic32_or_release(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw or i32* %a, i32 0 release
+  ret void
+}
+; CHECK: atomic32_or_release
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 3)
+
+define void @atomic32_xor_release(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i32* %a, i32 0 release
+  ret void
+}
+; CHECK: atomic32_xor_release
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 3)
+
+define void @atomic32_xchg_acq_rel(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i32* %a, i32 0 acq_rel
+  ret void
+}
+; CHECK: atomic32_xchg_acq_rel
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 4)
+
+define void @atomic32_add_acq_rel(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw add i32* %a, i32 0 acq_rel
+  ret void
+}
+; CHECK: atomic32_add_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 4)
+
+define void @atomic32_sub_acq_rel(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i32* %a, i32 0 acq_rel
+  ret void
+}
+; CHECK: atomic32_sub_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 4)
+
+define void @atomic32_and_acq_rel(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw and i32* %a, i32 0 acq_rel
+  ret void
+}
+; CHECK: atomic32_and_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 4)
+
+define void @atomic32_or_acq_rel(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw or i32* %a, i32 0 acq_rel
+  ret void
+}
+; CHECK: atomic32_or_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 4)
+
+define void @atomic32_xor_acq_rel(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i32* %a, i32 0 acq_rel
+  ret void
+}
+; CHECK: atomic32_xor_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 4)
+
+define void @atomic32_xchg_seq_cst(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i32* %a, i32 0 seq_cst
+  ret void
+}
+; CHECK: atomic32_xchg_seq_cst
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 5)
+
+define void @atomic32_add_seq_cst(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw add i32* %a, i32 0 seq_cst
+  ret void
+}
+; CHECK: atomic32_add_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 5)
+
+define void @atomic32_sub_seq_cst(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i32* %a, i32 0 seq_cst
+  ret void
+}
+; CHECK: atomic32_sub_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 5)
+
+define void @atomic32_and_seq_cst(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw and i32* %a, i32 0 seq_cst
+  ret void
+}
+; CHECK: atomic32_and_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 5)
+
+define void @atomic32_or_seq_cst(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw or i32* %a, i32 0 seq_cst
+  ret void
+}
+; CHECK: atomic32_or_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 5)
+
+define void @atomic32_xor_seq_cst(i32* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i32* %a, i32 0 seq_cst
+  ret void
+}
+; CHECK: atomic32_xor_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 5)
+
+define void @atomic32_cas_monotonic(i32* %a) nounwind uwtable {
+entry:
+  cmpxchg i32* %a, i32 0, i32 1 monotonic
+  ret void
+}
+; CHECK: atomic32_cas_monotonic
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 0)
+
+define void @atomic32_cas_acquire(i32* %a) nounwind uwtable {
+entry:
+  cmpxchg i32* %a, i32 0, i32 1 acquire
+  ret void
+}
+; CHECK: atomic32_cas_acquire
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 2)
+
+define void @atomic32_cas_release(i32* %a) nounwind uwtable {
+entry:
+  cmpxchg i32* %a, i32 0, i32 1 release
+  ret void
+}
+; CHECK: atomic32_cas_release
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 3)
+
+define void @atomic32_cas_acq_rel(i32* %a) nounwind uwtable {
+entry:
+  cmpxchg i32* %a, i32 0, i32 1 acq_rel
+  ret void
+}
+; CHECK: atomic32_cas_acq_rel
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 4)
+
+define void @atomic32_cas_seq_cst(i32* %a) nounwind uwtable {
+entry:
+  cmpxchg i32* %a, i32 0, i32 1 seq_cst
+  ret void
+}
+; CHECK: atomic32_cas_seq_cst
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 5)
 
 define i64 @atomic64_load_unordered(i64* %a) nounwind uwtable {
 entry:
@@ -200,7 +1040,7 @@ entry:
   ret i64 %0
 }
 ; CHECK: atomic64_load_unordered
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 1)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 0)
 
 define i64 @atomic64_load_monotonic(i64* %a) nounwind uwtable {
 entry:
@@ -208,7 +1048,7 @@ entry:
   ret i64 %0
 }
 ; CHECK: atomic64_load_monotonic
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 1)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 0)
 
 define i64 @atomic64_load_acquire(i64* %a) nounwind uwtable {
 entry:
@@ -216,7 +1056,7 @@ entry:
   ret i64 %0
 }
 ; CHECK: atomic64_load_acquire
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 4)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 2)
 
 define i64 @atomic64_load_seq_cst(i64* %a) nounwind uwtable {
 entry:
@@ -224,7 +1064,7 @@ entry:
   ret i64 %0
 }
 ; CHECK: atomic64_load_seq_cst
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 32)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 5)
 
 define void @atomic64_store_unordered(i64* %a) nounwind uwtable {
 entry:
@@ -232,7 +1072,7 @@ entry:
   ret void
 }
 ; CHECK: atomic64_store_unordered
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 1)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 0)
 
 define void @atomic64_store_monotonic(i64* %a) nounwind uwtable {
 entry:
@@ -240,7 +1080,7 @@ entry:
   ret void
 }
 ; CHECK: atomic64_store_monotonic
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 1)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 0)
 
 define void @atomic64_store_release(i64* %a) nounwind uwtable {
 entry:
@@ -248,7 +1088,7 @@ entry:
   ret void
 }
 ; CHECK: atomic64_store_release
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 8)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 3)
 
 define void @atomic64_store_seq_cst(i64* %a) nounwind uwtable {
 entry:
@@ -256,7 +1096,287 @@ entry:
   ret void
 }
 ; CHECK: atomic64_store_seq_cst
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 32)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 5)
+
+define void @atomic64_xchg_monotonic(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i64* %a, i64 0 monotonic
+  ret void
+}
+; CHECK: atomic64_xchg_monotonic
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 0)
+
+define void @atomic64_add_monotonic(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw add i64* %a, i64 0 monotonic
+  ret void
+}
+; CHECK: atomic64_add_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 0)
+
+define void @atomic64_sub_monotonic(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i64* %a, i64 0 monotonic
+  ret void
+}
+; CHECK: atomic64_sub_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 0)
+
+define void @atomic64_and_monotonic(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw and i64* %a, i64 0 monotonic
+  ret void
+}
+; CHECK: atomic64_and_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 0)
+
+define void @atomic64_or_monotonic(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw or i64* %a, i64 0 monotonic
+  ret void
+}
+; CHECK: atomic64_or_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 0)
+
+define void @atomic64_xor_monotonic(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i64* %a, i64 0 monotonic
+  ret void
+}
+; CHECK: atomic64_xor_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 0)
+
+define void @atomic64_xchg_acquire(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i64* %a, i64 0 acquire
+  ret void
+}
+; CHECK: atomic64_xchg_acquire
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 2)
+
+define void @atomic64_add_acquire(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw add i64* %a, i64 0 acquire
+  ret void
+}
+; CHECK: atomic64_add_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 2)
+
+define void @atomic64_sub_acquire(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i64* %a, i64 0 acquire
+  ret void
+}
+; CHECK: atomic64_sub_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 2)
+
+define void @atomic64_and_acquire(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw and i64* %a, i64 0 acquire
+  ret void
+}
+; CHECK: atomic64_and_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 2)
+
+define void @atomic64_or_acquire(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw or i64* %a, i64 0 acquire
+  ret void
+}
+; CHECK: atomic64_or_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 2)
+
+define void @atomic64_xor_acquire(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i64* %a, i64 0 acquire
+  ret void
+}
+; CHECK: atomic64_xor_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 2)
+
+define void @atomic64_xchg_release(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i64* %a, i64 0 release
+  ret void
+}
+; CHECK: atomic64_xchg_release
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 3)
+
+define void @atomic64_add_release(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw add i64* %a, i64 0 release
+  ret void
+}
+; CHECK: atomic64_add_release
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 3)
+
+define void @atomic64_sub_release(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i64* %a, i64 0 release
+  ret void
+}
+; CHECK: atomic64_sub_release
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 3)
+
+define void @atomic64_and_release(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw and i64* %a, i64 0 release
+  ret void
+}
+; CHECK: atomic64_and_release
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 3)
+
+define void @atomic64_or_release(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw or i64* %a, i64 0 release
+  ret void
+}
+; CHECK: atomic64_or_release
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 3)
+
+define void @atomic64_xor_release(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i64* %a, i64 0 release
+  ret void
+}
+; CHECK: atomic64_xor_release
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 3)
+
+define void @atomic64_xchg_acq_rel(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i64* %a, i64 0 acq_rel
+  ret void
+}
+; CHECK: atomic64_xchg_acq_rel
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 4)
+
+define void @atomic64_add_acq_rel(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw add i64* %a, i64 0 acq_rel
+  ret void
+}
+; CHECK: atomic64_add_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 4)
+
+define void @atomic64_sub_acq_rel(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i64* %a, i64 0 acq_rel
+  ret void
+}
+; CHECK: atomic64_sub_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 4)
+
+define void @atomic64_and_acq_rel(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw and i64* %a, i64 0 acq_rel
+  ret void
+}
+; CHECK: atomic64_and_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 4)
+
+define void @atomic64_or_acq_rel(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw or i64* %a, i64 0 acq_rel
+  ret void
+}
+; CHECK: atomic64_or_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 4)
+
+define void @atomic64_xor_acq_rel(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i64* %a, i64 0 acq_rel
+  ret void
+}
+; CHECK: atomic64_xor_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 4)
+
+define void @atomic64_xchg_seq_cst(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i64* %a, i64 0 seq_cst
+  ret void
+}
+; CHECK: atomic64_xchg_seq_cst
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 5)
+
+define void @atomic64_add_seq_cst(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw add i64* %a, i64 0 seq_cst
+  ret void
+}
+; CHECK: atomic64_add_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 5)
+
+define void @atomic64_sub_seq_cst(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i64* %a, i64 0 seq_cst
+  ret void
+}
+; CHECK: atomic64_sub_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 5)
+
+define void @atomic64_and_seq_cst(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw and i64* %a, i64 0 seq_cst
+  ret void
+}
+; CHECK: atomic64_and_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 5)
+
+define void @atomic64_or_seq_cst(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw or i64* %a, i64 0 seq_cst
+  ret void
+}
+; CHECK: atomic64_or_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 5)
+
+define void @atomic64_xor_seq_cst(i64* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i64* %a, i64 0 seq_cst
+  ret void
+}
+; CHECK: atomic64_xor_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 5)
+
+define void @atomic64_cas_monotonic(i64* %a) nounwind uwtable {
+entry:
+  cmpxchg i64* %a, i64 0, i64 1 monotonic
+  ret void
+}
+; CHECK: atomic64_cas_monotonic
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 0)
+
+define void @atomic64_cas_acquire(i64* %a) nounwind uwtable {
+entry:
+  cmpxchg i64* %a, i64 0, i64 1 acquire
+  ret void
+}
+; CHECK: atomic64_cas_acquire
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 2)
+
+define void @atomic64_cas_release(i64* %a) nounwind uwtable {
+entry:
+  cmpxchg i64* %a, i64 0, i64 1 release
+  ret void
+}
+; CHECK: atomic64_cas_release
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 3)
+
+define void @atomic64_cas_acq_rel(i64* %a) nounwind uwtable {
+entry:
+  cmpxchg i64* %a, i64 0, i64 1 acq_rel
+  ret void
+}
+; CHECK: atomic64_cas_acq_rel
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 4)
+
+define void @atomic64_cas_seq_cst(i64* %a) nounwind uwtable {
+entry:
+  cmpxchg i64* %a, i64 0, i64 1 seq_cst
+  ret void
+}
+; CHECK: atomic64_cas_seq_cst
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 5)
 
 define i128 @atomic128_load_unordered(i128* %a) nounwind uwtable {
 entry:
@@ -264,7 +1384,7 @@ entry:
   ret i128 %0
 }
 ; CHECK: atomic128_load_unordered
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 1)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 0)
 
 define i128 @atomic128_load_monotonic(i128* %a) nounwind uwtable {
 entry:
@@ -272,7 +1392,7 @@ entry:
   ret i128 %0
 }
 ; CHECK: atomic128_load_monotonic
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 1)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 0)
 
 define i128 @atomic128_load_acquire(i128* %a) nounwind uwtable {
 entry:
@@ -280,7 +1400,7 @@ entry:
   ret i128 %0
 }
 ; CHECK: atomic128_load_acquire
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 4)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 2)
 
 define i128 @atomic128_load_seq_cst(i128* %a) nounwind uwtable {
 entry:
@@ -288,7 +1408,7 @@ entry:
   ret i128 %0
 }
 ; CHECK: atomic128_load_seq_cst
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 32)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 5)
 
 define void @atomic128_store_unordered(i128* %a) nounwind uwtable {
 entry:
@@ -296,7 +1416,7 @@ entry:
   ret void
 }
 ; CHECK: atomic128_store_unordered
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 1)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 0)
 
 define void @atomic128_store_monotonic(i128* %a) nounwind uwtable {
 entry:
@@ -304,7 +1424,7 @@ entry:
   ret void
 }
 ; CHECK: atomic128_store_monotonic
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 1)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 0)
 
 define void @atomic128_store_release(i128* %a) nounwind uwtable {
 entry:
@@ -312,7 +1432,7 @@ entry:
   ret void
 }
 ; CHECK: atomic128_store_release
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 8)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 3)
 
 define void @atomic128_store_seq_cst(i128* %a) nounwind uwtable {
 entry:
@@ -320,4 +1440,348 @@ entry:
   ret void
 }
 ; CHECK: atomic128_store_seq_cst
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 32)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 5)
+
+define void @atomic128_xchg_monotonic(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i128* %a, i128 0 monotonic
+  ret void
+}
+; CHECK: atomic128_xchg_monotonic
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 0)
+
+define void @atomic128_add_monotonic(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw add i128* %a, i128 0 monotonic
+  ret void
+}
+; CHECK: atomic128_add_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 0)
+
+define void @atomic128_sub_monotonic(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i128* %a, i128 0 monotonic
+  ret void
+}
+; CHECK: atomic128_sub_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 0)
+
+define void @atomic128_and_monotonic(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw and i128* %a, i128 0 monotonic
+  ret void
+}
+; CHECK: atomic128_and_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 0)
+
+define void @atomic128_or_monotonic(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw or i128* %a, i128 0 monotonic
+  ret void
+}
+; CHECK: atomic128_or_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 0)
+
+define void @atomic128_xor_monotonic(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i128* %a, i128 0 monotonic
+  ret void
+}
+; CHECK: atomic128_xor_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 0)
+
+define void @atomic128_xchg_acquire(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i128* %a, i128 0 acquire
+  ret void
+}
+; CHECK: atomic128_xchg_acquire
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 2)
+
+define void @atomic128_add_acquire(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw add i128* %a, i128 0 acquire
+  ret void
+}
+; CHECK: atomic128_add_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 2)
+
+define void @atomic128_sub_acquire(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i128* %a, i128 0 acquire
+  ret void
+}
+; CHECK: atomic128_sub_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 2)
+
+define void @atomic128_and_acquire(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw and i128* %a, i128 0 acquire
+  ret void
+}
+; CHECK: atomic128_and_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 2)
+
+define void @atomic128_or_acquire(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw or i128* %a, i128 0 acquire
+  ret void
+}
+; CHECK: atomic128_or_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 2)
+
+define void @atomic128_xor_acquire(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i128* %a, i128 0 acquire
+  ret void
+}
+; CHECK: atomic128_xor_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 2)
+
+define void @atomic128_xchg_release(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i128* %a, i128 0 release
+  ret void
+}
+; CHECK: atomic128_xchg_release
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 3)
+
+define void @atomic128_add_release(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw add i128* %a, i128 0 release
+  ret void
+}
+; CHECK: atomic128_add_release
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 3)
+
+define void @atomic128_sub_release(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i128* %a, i128 0 release
+  ret void
+}
+; CHECK: atomic128_sub_release
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 3)
+
+define void @atomic128_and_release(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw and i128* %a, i128 0 release
+  ret void
+}
+; CHECK: atomic128_and_release
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 3)
+
+define void @atomic128_or_release(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw or i128* %a, i128 0 release
+  ret void
+}
+; CHECK: atomic128_or_release
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 3)
+
+define void @atomic128_xor_release(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i128* %a, i128 0 release
+  ret void
+}
+; CHECK: atomic128_xor_release
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 3)
+
+define void @atomic128_xchg_acq_rel(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i128* %a, i128 0 acq_rel
+  ret void
+}
+; CHECK: atomic128_xchg_acq_rel
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 4)
+
+define void @atomic128_add_acq_rel(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw add i128* %a, i128 0 acq_rel
+  ret void
+}
+; CHECK: atomic128_add_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 4)
+
+define void @atomic128_sub_acq_rel(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i128* %a, i128 0 acq_rel
+  ret void
+}
+; CHECK: atomic128_sub_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 4)
+
+define void @atomic128_and_acq_rel(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw and i128* %a, i128 0 acq_rel
+  ret void
+}
+; CHECK: atomic128_and_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 4)
+
+define void @atomic128_or_acq_rel(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw or i128* %a, i128 0 acq_rel
+  ret void
+}
+; CHECK: atomic128_or_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 4)
+
+define void @atomic128_xor_acq_rel(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i128* %a, i128 0 acq_rel
+  ret void
+}
+; CHECK: atomic128_xor_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 4)
+
+define void @atomic128_xchg_seq_cst(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xchg i128* %a, i128 0 seq_cst
+  ret void
+}
+; CHECK: atomic128_xchg_seq_cst
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 5)
+
+define void @atomic128_add_seq_cst(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw add i128* %a, i128 0 seq_cst
+  ret void
+}
+; CHECK: atomic128_add_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 5)
+
+define void @atomic128_sub_seq_cst(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw sub i128* %a, i128 0 seq_cst
+  ret void
+}
+; CHECK: atomic128_sub_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 5)
+
+define void @atomic128_and_seq_cst(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw and i128* %a, i128 0 seq_cst
+  ret void
+}
+; CHECK: atomic128_and_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 5)
+
+define void @atomic128_or_seq_cst(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw or i128* %a, i128 0 seq_cst
+  ret void
+}
+; CHECK: atomic128_or_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 5)
+
+define void @atomic128_xor_seq_cst(i128* %a) nounwind uwtable {
+entry:
+  atomicrmw xor i128* %a, i128 0 seq_cst
+  ret void
+}
+; CHECK: atomic128_xor_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 5)
+
+define void @atomic128_cas_monotonic(i128* %a) nounwind uwtable {
+entry:
+  cmpxchg i128* %a, i128 0, i128 1 monotonic
+  ret void
+}
+; CHECK: atomic128_cas_monotonic
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 0)
+
+define void @atomic128_cas_acquire(i128* %a) nounwind uwtable {
+entry:
+  cmpxchg i128* %a, i128 0, i128 1 acquire
+  ret void
+}
+; CHECK: atomic128_cas_acquire
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 2)
+
+define void @atomic128_cas_release(i128* %a) nounwind uwtable {
+entry:
+  cmpxchg i128* %a, i128 0, i128 1 release
+  ret void
+}
+; CHECK: atomic128_cas_release
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 3)
+
+define void @atomic128_cas_acq_rel(i128* %a) nounwind uwtable {
+entry:
+  cmpxchg i128* %a, i128 0, i128 1 acq_rel
+  ret void
+}
+; CHECK: atomic128_cas_acq_rel
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 4)
+
+define void @atomic128_cas_seq_cst(i128* %a) nounwind uwtable {
+entry:
+  cmpxchg i128* %a, i128 0, i128 1 seq_cst
+  ret void
+}
+; CHECK: atomic128_cas_seq_cst
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 5)
+
+define void @atomic_signal_fence_acquire() nounwind uwtable {
+entry:
+  fence singlethread acquire
+  ret void
+}
+; CHECK: atomic_signal_fence_acquire
+; CHECK: call void @__tsan_atomic_signal_fence(i32 2)
+
+define void @atomic_thread_fence_acquire() nounwind uwtable {
+entry:
+  fence  acquire
+  ret void
+}
+; CHECK: atomic_thread_fence_acquire
+; CHECK: call void @__tsan_atomic_thread_fence(i32 2)
+
+define void @atomic_signal_fence_release() nounwind uwtable {
+entry:
+  fence singlethread release
+  ret void
+}
+; CHECK: atomic_signal_fence_release
+; CHECK: call void @__tsan_atomic_signal_fence(i32 3)
+
+define void @atomic_thread_fence_release() nounwind uwtable {
+entry:
+  fence  release
+  ret void
+}
+; CHECK: atomic_thread_fence_release
+; CHECK: call void @__tsan_atomic_thread_fence(i32 3)
+
+define void @atomic_signal_fence_acq_rel() nounwind uwtable {
+entry:
+  fence singlethread acq_rel
+  ret void
+}
+; CHECK: atomic_signal_fence_acq_rel
+; CHECK: call void @__tsan_atomic_signal_fence(i32 4)
+
+define void @atomic_thread_fence_acq_rel() nounwind uwtable {
+entry:
+  fence  acq_rel
+  ret void
+}
+; CHECK: atomic_thread_fence_acq_rel
+; CHECK: call void @__tsan_atomic_thread_fence(i32 4)
+
+define void @atomic_signal_fence_seq_cst() nounwind uwtable {
+entry:
+  fence singlethread seq_cst
+  ret void
+}
+; CHECK: atomic_signal_fence_seq_cst
+; CHECK: call void @__tsan_atomic_signal_fence(i32 5)
+
+define void @atomic_thread_fence_seq_cst() nounwind uwtable {
+entry:
+  fence  seq_cst
+  ret void
+}
+; CHECK: atomic_thread_fence_seq_cst
+; CHECK: call void @__tsan_atomic_thread_fence(i32 5)
diff --git a/test/MC/ARM/arm-arithmetic-aliases.s b/test/MC/ARM/arm-arithmetic-aliases.s
index 9895cfc02b252..3ed444858146b 100644
--- a/test/MC/ARM/arm-arithmetic-aliases.s
+++ b/test/MC/ARM/arm-arithmetic-aliases.s
@@ -124,3 +124,7 @@ bicseq r2, r3
 @ CHECK: bicseq r2, r2, #6              @ encoding: [0x06,0x20,0xd2,0x03]
 @ CHECK: bicseq r2, r2, r3              @ encoding: [0x03,0x20,0xd2,0x01]
 @ CHECK: bicseq r2, r2, r3              @ encoding: [0x03,0x20,0xd2,0x01]
+
+add r0, pc, #123
+
+@ CHECK: adr	r0, #123                @ encoding: [0x7b,0x00,0x8f,0xe2]
diff --git a/test/MC/ARM/arm-shift-encoding.s b/test/MC/ARM/arm-shift-encoding.s
new file mode 100644
index 0000000000000..3c57b67f6e3ba
--- /dev/null
+++ b/test/MC/ARM/arm-shift-encoding.s
@@ -0,0 +1,119 @@
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple armv7 -show-encoding < %s | FileCheck %s
+
+	ldr r0, [r0, r0]
+	ldr r0, [r0, r0, lsr #32]
+	ldr r0, [r0, r0, lsr #16]
+	ldr r0, [r0, r0, lsl #0]
+	ldr r0, [r0, r0, lsl #16]
+	ldr r0, [r0, r0, asr #32]
+	ldr r0, [r0, r0, asr #16]
+	ldr r0, [r0, r0, rrx]
+	ldr r0, [r0, r0, ror #16]
+
+@ CHECK: ldr r0, [r0, r0]          @ encoding: [0x00,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsr #32] @ encoding: [0x20,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsr #16] @ encoding: [0x20,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0]          @ encoding: [0x00,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsl #16] @ encoding: [0x00,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, asr #32] @ encoding: [0x40,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, asr #16] @ encoding: [0x40,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, rrx]     @ encoding: [0x60,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, ror #16] @ encoding: [0x60,0x08,0x90,0xe7]
+
+	pld [r0, r0]
+	pld [r0, r0, lsr #32]
+	pld [r0, r0, lsr #16]
+	pld [r0, r0, lsl #0]
+	pld [r0, r0, lsl #16]
+	pld [r0, r0, asr #32]
+	pld [r0, r0, asr #16]
+	pld [r0, r0, rrx]
+	pld [r0, r0, ror #16]
+
+@ CHECK: [r0, r0]          @ encoding: [0x00,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsr #32] @ encoding: [0x20,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsr #16] @ encoding: [0x20,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0]          @ encoding: [0x00,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsl #16] @ encoding: [0x00,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0, asr #32] @ encoding: [0x40,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, asr #16] @ encoding: [0x40,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0, rrx]     @ encoding: [0x60,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, ror #16] @ encoding: [0x60,0xf8,0xd0,0xf7]
+
+	str r0, [r0, r0]
+	str r0, [r0, r0, lsr #32]
+	str r0, [r0, r0, lsr #16]
+	str r0, [r0, r0, lsl #0]
+	str r0, [r0, r0, lsl #16]
+	str r0, [r0, r0, asr #32]
+	str r0, [r0, r0, asr #16]
+	str r0, [r0, r0, rrx]
+	str r0, [r0, r0, ror #16]
+
+@ CHECK: str r0, [r0, r0]          @ encoding: [0x00,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsr #32] @ encoding: [0x20,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsr #16] @ encoding: [0x20,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0]          @ encoding: [0x00,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsl #16] @ encoding: [0x00,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, asr #32] @ encoding: [0x40,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, asr #16] @ encoding: [0x40,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, rrx]     @ encoding: [0x60,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, ror #16] @ encoding: [0x60,0x08,0x80,0xe7]
+
+@ Uses printAddrMode2OffsetOperand(), used by LDRBT_POST_IMM LDRBT_POST_REG
+@ LDRB_POST_IMM LDRB_POST_REG LDRT_POST_IMM LDRT_POST_REG LDR_POST_IMM
+@ LDR_POST_REG STRBT_POST_IMM STRBT_POST_REG STRB_POST_IMM STRB_POST_REG
+@ STRT_POST_IMM STRT_POST_REG STR_POST_IMM STR_POST_REG
+
+	ldr r0, [r1], r2, rrx
+	ldr r3, [r4], r5, ror #0
+	str r6, [r7], r8, lsl #0
+	str r9, [r10], r11
+
+@ CHECK: ldr r0, [r1], r2, rrx    @ encoding: [0x62,0x00,0x91,0xe6]
+@ CHECK: ldr r3, [r4], r5         @ encoding: [0x05,0x30,0x94,0xe6]
+@ CHECK: str r6, [r7], r8         @ encoding: [0x08,0x60,0x87,0xe6]
+@ CHECK: str r9, [r10], r11       @ encoding: [0x0b,0x90,0x8a,0xe6]
+
+@ Uses printSORegImmOperand(), used by ADCrsi ADDrsi ANDrsi BICrsi EORrsi
+@ ORRrsi RSBrsi RSCrsi SBCrsi SUBrsi CMNzrsi CMPrsi MOVsi MVNsi TEQrsi TSTrsi
+
+	adc sp, lr, pc
+	adc r1, r8, r9, lsr #32
+	adc r2, r7, pc, lsr #16
+	adc r3, r6, r10, lsl #0
+	adc r4, r5, lr, lsl #16
+	adc r5, r4, r11, asr #32
+	adc r6, r3, sp, asr #16
+	adc r7, r2, r12, rrx
+	adc r8, r1, r0, ror #16
+
+@ CHECK: adc sp, lr, pc           @ encoding: [0x0f,0xd0,0xae,0xe0]
+@ CHECK: adc r1, r8, r9, lsr #32  @ encoding: [0x29,0x10,0xa8,0xe0]
+@ CHECK: adc r2, r7, pc, lsr #16  @ encoding: [0x2f,0x28,0xa7,0xe0]
+@ CHECK: adc r3, r6, r10          @ encoding: [0x0a,0x30,0xa6,0xe0]
+@ CHECK: adc r4, r5, lr, lsl #16  @ encoding: [0x0e,0x48,0xa5,0xe0]
+@ CHECK: adc r5, r4, r11, asr #32 @ encoding: [0x4b,0x50,0xa4,0xe0]
+@ CHECK: adc r6, r3, sp, asr #16  @ encoding: [0x4d,0x68,0xa3,0xe0]
+@ CHECK: adc r7, r2, r12, rrx     @ encoding: [0x6c,0x70,0xa2,0xe0]
+@ CHECK: adc r8, r1, r0, ror #16  @ encoding: [0x60,0x88,0xa1,0xe0]
+
+	cmp sp, lr
+	cmp r1, r8, lsr #32
+	cmp r2, r7, lsr #16
+	cmp r3, r6, lsl #0
+	cmp r4, r5, lsl #16
+	cmp r5, r4, asr #32
+	cmp r6, r3, asr #16
+	cmp r7, r2, rrx
+	cmp r8, r1, ror #16
+
+@ CHECK: cmp sp, lr           @ encoding: [0x0e,0x00,0x5d,0xe1]
+@ CHECK: cmp r1, r8, lsr #32  @ encoding: [0x28,0x00,0x51,0xe1]
+@ CHECK: cmp r2, r7, lsr #16  @ encoding: [0x27,0x08,0x52,0xe1]
+@ CHECK: cmp r3, r6           @ encoding: [0x06,0x00,0x53,0xe1]
+@ CHECK: cmp r4, r5, lsl #16  @ encoding: [0x05,0x08,0x54,0xe1]
+@ CHECK: cmp r5, r4, asr #32  @ encoding: [0x44,0x00,0x55,0xe1]
+@ CHECK: cmp r6, r3, asr #16  @ encoding: [0x43,0x08,0x56,0xe1]
+@ CHECK: cmp r7, r2, rrx      @ encoding: [0x62,0x00,0x57,0xe1]
+@ CHECK: cmp r8, r1, ror #16  @ encoding: [0x61,0x08,0x58,0xe1]
diff --git a/test/MC/ARM/basic-thumb-instructions.s b/test/MC/ARM/basic-thumb-instructions.s
index 4ee34ce6b4c81..22e21da88e40d 100644
--- a/test/MC/ARM/basic-thumb-instructions.s
+++ b/test/MC/ARM/basic-thumb-instructions.s
@@ -259,8 +259,8 @@ _func:
 
 @ CHECK: ldr	r1, _foo                @ encoding: [A,0x49]
              @   fixup A - offset: 0, value: _foo, kind: fixup_arm_thumb_cp
-@ CHECK: ldr     r3, #604                @ encoding: [0x97,0x4b]
-@ CHECK: ldr     r3, #368                @ encoding: [0x5c,0x4b]
+@ CHECK: ldr     r3, [pc, #604]         @ encoding: [0x97,0x4b]
+@ CHECK: ldr     r3, [pc, #368]         @ encoding: [0x5c,0x4b]
 
 @------------------------------------------------------------------------------
 @ LDR (register)
diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index 499e0550135e5..d65cfd7a67a56 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s
@@ -47,7 +47,47 @@
 @ CHECK-ERRORS: error: immediate shift value out of range
 @ CHECK-ERRORS:         adc r4, r5, r6, ror #32
 
+        @ Out of range shift immediate values for load/store.
+        str r1, [r2, r3, lsl #invalid]
+        ldr r4, [r5], r6, lsl #-1
+        pld r4, [r5, r6, lsl #32]
+        str r4, [r5], r6, lsr #-1
+        ldr r4, [r5, r6, lsr #33]
+        pld r4, [r5, r6, asr #-1]
+        str r4, [r5, r6, asr #33]
+        ldr r4, [r5, r6, ror #-1]
+        pld r4, [r5, r6, ror #32]
+        pld r4, [r5, r6, rrx #0]
 
+@ CHECK-ERRORS: error: shift amount must be an immediate
+@ CHECK-ERRORS:         str r1, [r2, r3, lsl #invalid]
+@ CHECK-ERRORS:                              ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS:         ldr r4, [r5], r6, lsl #-1
+@ CHECK-ERRORS:                              ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS:         pld r4, [r5, r6, lsl #32]
+@ CHECK-ERRORS:                              ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS:         str r4, [r5], r6, lsr #-1
+@ CHECK-ERRORS:                              ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS:         ldr r4, [r5, r6, lsr #33]
+@ CHECK-ERRORS:                              ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS:         pld r4, [r5, r6, asr #-1]
+@ CHECK-ERRORS:                              ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS:         str r4, [r5, r6, asr #33]
+@ CHECK-ERRORS:                              ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS:         ldr r4, [r5, r6, ror #-1]
+@ CHECK-ERRORS:                              ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS:         pld r4, [r5, r6, ror #32]
+@ CHECK-ERRORS: error: ']' expected
+@ CHECK-ERRORS:         pld r4, [r5, r6, rrx #0]
+        
         @ Out of range 16-bit immediate on BKPT
         bkpt #65536
 
@@ -321,3 +361,13 @@
 @ CHECK-ERRORS: error: invalid operand for instruction
 @ CHECK-ERRORS:         cps f,#1
 @ CHECK-ERRORS:               ^
+
+        @ Bad operands for msr
+        msr #0, #0
+        msr foo, #0
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS:         msr #0, #0
+@ CHECK-ERRORS:             ^
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS:         msr foo, #0
+@ CHECK-ERRORS:             ^
diff --git a/test/MC/ARM/elf-jump24-fixup.s b/test/MC/ARM/elf-jump24-fixup.s
new file mode 100644
index 0000000000000..75a4b869dc608
--- /dev/null
+++ b/test/MC/ARM/elf-jump24-fixup.s
@@ -0,0 +1,9 @@
+@ RUN: llvm-mc %s -triple=thumbv7-linux-gnueabi -filetype=obj -o - < %s | llvm-objdump -r - | FileCheck %s
+	.syntax unified
+	.text
+	.code	16
+	.thumb_func
+foo:
+	b.w	bar
+
+@ CHECK: {{[0-9]+}} R_ARM_THM_JUMP24 bar
diff --git a/test/MC/ARM/thumb-shift-encoding.s b/test/MC/ARM/thumb-shift-encoding.s
new file mode 100644
index 0000000000000..54284132b6534
--- /dev/null
+++ b/test/MC/ARM/thumb-shift-encoding.s
@@ -0,0 +1,45 @@
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple thumbv7 -show-encoding < %s | FileCheck %s
+
+@ Uses printT2SOOperand(), used by t2ADCrs t2ADDrs t2ANDrs t2BICrs t2EORrs
+@ t2ORNrs t2ORRrs t2RSBrs t2SBCrs t2SUBrs t2CMNzrs t2CMPrs t2MOVSsi t2MOVsi
+@ t2MVNs t2TEQrs t2TSTrs
+
+	sbc.w r12, lr, r0
+	sbc.w r1, r8, r9, lsr #32
+	sbc.w r2, r7, pc, lsr #16
+	sbc.w r3, r6, r10, lsl #0
+	sbc.w r4, r5, lr, lsl #16
+	sbc.w r5, r4, r11, asr #32
+	sbc.w r6, r3, sp, asr #16
+	sbc.w r7, r2, r12, rrx
+	sbc.w r8, r1, r0, ror #16
+
+@ CHECK: sbc.w r12, lr, r0          @ encoding: [0x6e,0xeb,0x00,0x0c]
+@ CHECK: sbc.w r1, r8, r9, lsr #32  @ encoding: [0x68,0xeb,0x19,0x01]
+@ CHECK: sbc.w r2, r7, pc, lsr #16  @ encoding: [0x67,0xeb,0x1f,0x42]
+@ CHECK: sbc.w r3, r6, r10          @ encoding: [0x66,0xeb,0x0a,0x03]
+@ CHECK: sbc.w r4, r5, lr, lsl #16  @ encoding: [0x65,0xeb,0x0e,0x44]
+@ CHECK: sbc.w r5, r4, r11, asr #32 @ encoding: [0x64,0xeb,0x2b,0x05]
+@ CHECK: sbc.w r6, r3, sp, asr #16  @ encoding: [0x63,0xeb,0x2d,0x46]
+@ CHECK: sbc.w r7, r2, r12, rrx     @ encoding: [0x62,0xeb,0x3c,0x07]
+@ CHECK: sbc.w r8, r1, r0, ror #16  @ encoding: [0x61,0xeb,0x30,0x48]
+
+	and.w r12, lr, r0
+	and.w r1, r8, r9, lsr #32
+	and.w r2, r7, pc, lsr #16
+	and.w r3, r6, r10, lsl #0
+	and.w r4, r5, lr, lsl #16
+	and.w r5, r4, r11, asr #32
+	and.w r6, r3, sp, asr #16
+	and.w r7, r2, r12, rrx
+	and.w r8, r1, r0, ror #16
+
+@ CHECK: and.w r12, lr, r0          @ encoding: [0x0e,0xea,0x00,0x0c]
+@ CHECK: and.w r1, r8, r9, lsr #32  @ encoding: [0x08,0xea,0x19,0x01]
+@ CHECK: and.w r2, r7, pc, lsr #16  @ encoding: [0x07,0xea,0x1f,0x42]
+@ CHECK: and.w r3, r6, r10          @ encoding: [0x06,0xea,0x0a,0x03]
+@ CHECK: and.w r4, r5, lr, lsl #16  @ encoding: [0x05,0xea,0x0e,0x44]
+@ CHECK: and.w r5, r4, r11, asr #32 @ encoding: [0x04,0xea,0x2b,0x05]
+@ CHECK: and.w r6, r3, sp, asr #16  @ encoding: [0x03,0xea,0x2d,0x46]
+@ CHECK: and.w r7, r2, r12, rrx     @ encoding: [0x02,0xea,0x3c,0x07]
+@ CHECK: and.w r8, r1, r0, ror #16  @ encoding: [0x01,0xea,0x30,0x48]
diff --git a/test/MC/ARM/thumb2-b.w-encodingT4.s b/test/MC/ARM/thumb2-b.w-encodingT4.s
new file mode 100644
index 0000000000000..be77b06267a23
--- /dev/null
+++ b/test/MC/ARM/thumb2-b.w-encodingT4.s
@@ -0,0 +1,12 @@
+@ RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s
+  .syntax unified
+  .globl _func
+.thumb_func _foo
+.space 0x37c6
+_foo:
+@------------------------------------------------------------------------------
+@ B (thumb2 b.w encoding T4) rdar://12585795
+@------------------------------------------------------------------------------
+        b.w   0x3680c
+
+@ CHECK: b.w	#223244                    @ encoding: [0x6d,0xf0,0x0c,0xb0]
diff --git a/test/MC/AsmParser/bad-macro.s b/test/MC/AsmParser/bad-macro.s
new file mode 100644
index 0000000000000..313607b7782cd
--- /dev/null
+++ b/test/MC/AsmParser/bad-macro.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2>&1 | FileCheck %s
+
+.macro 23
+
+// CHECK: expected identifier in '.macro' directive
+
+.macro abc 33
+
+// CHECK: expected identifier in '.macro' directive
diff --git a/test/MC/AsmParser/directive_lcomm.s b/test/MC/AsmParser/directive_lcomm.s
index 0a0add513fe98..37a350c82e81f 100644
--- a/test/MC/AsmParser/directive_lcomm.s
+++ b/test/MC/AsmParser/directive_lcomm.s
@@ -1,9 +1,14 @@
 # RUN: llvm-mc -triple i386-apple-darwin10 %s | FileCheck %s
+# RUN: llvm-mc -triple i386-pc-mingw32 %s | FileCheck %s
+# RUN: not llvm-mc -triple i386-linux-gnu %s 2>&1 | FileCheck %s -check-prefix=ERROR
 
 # CHECK: TEST0:
-# CHECK: .zerofill __DATA,__bss,a,7,4
-# CHECK: .zerofill __DATA,__bss,b,8
-# CHECK: .zerofill __DATA,__bss,c,0
+# CHECK: .lcomm a,7,4
+# CHECK: .lcomm b,8
+# CHECK: .lcomm c,0
+
+# ELF doesn't like alignment on .lcomm.
+# ERROR: alignment not supported on this target
 TEST0:  
         .lcomm a, 8-1, 4
         .lcomm b,8
diff --git a/test/MC/AsmParser/labels.s b/test/MC/AsmParser/labels.s
index 56091755d9668..6a9870b655f2f 100644
--- a/test/MC/AsmParser/labels.s
+++ b/test/MC/AsmParser/labels.s
@@ -41,7 +41,7 @@ foo:
 // CHECK: .comm "a 6",1
         .comm "a 6", 1
 
-// CHECK: .zerofill __DATA,__bss,"a 7",1,0
+// CHECK: .lcomm "a 7",1
         .lcomm "a 7", 1
 
 // FIXME: We don't bother to support .lsym.
diff --git a/test/MC/AsmParser/macro-args.s b/test/MC/AsmParser/macro-args.s
index 6d084213e40b6..3269369be0209 100644
--- a/test/MC/AsmParser/macro-args.s
+++ b/test/MC/AsmParser/macro-args.s
@@ -4,10 +4,18 @@
     movl   \var@GOTOFF(%ebx),\re2g
 .endm
 
+.macro GET_DEFAULT var, re2g=%ebx, re3g=%ecx
+movl 2(\re2g, \re3g, 2), \var
+.endm
+
+GET         is_sse, %eax
+// CHECK: movl  is_sse@GOTOFF(%ebx), %eax
 
-GET    is_sse, %eax
+GET_DEFAULT %ebx, , %edx
+// CHECK: movl  2(%ebx,%edx,2), %ebx
 
-// CHECK: movl	is_sse@GOTOFF(%ebx), %eax
+GET_DEFAULT %ebx, %edx
+// CHECK: movl  2(%edx,%ecx,2), %ebx
 
 .macro bar
     .long $n
diff --git a/test/MC/AsmParser/macro-rept-err1.s b/test/MC/AsmParser/macro-rept-err1.s
index db92856a1d6d2..cfa66878d9793 100644
--- a/test/MC/AsmParser/macro-rept-err1.s
+++ b/test/MC/AsmParser/macro-rept-err1.s
@@ -3,4 +3,4 @@
 
 .endr
 
-// CHECK: unexpected '.endr' directive, no current .rept
+// CHECK: unmatched '.endr' directive
diff --git a/test/MC/AsmParser/macros-darwin.s b/test/MC/AsmParser/macros-darwin.s
new file mode 100644
index 0000000000000..31b9edb378180
--- /dev/null
+++ b/test/MC/AsmParser/macros-darwin.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -triple i386-apple-darwin10 %s 2> %t.err | FileCheck %s
+
+.macro test1
+.globl "$0 $1 $2 $$3 $n"
+.endmacro
+
+// CHECK: .globl "1 23  $3 2"
+test1 1, 2 3
+
diff --git a/test/MC/AsmParser/macros.s b/test/MC/AsmParser/macros.s
index 2957592992389..b1cb851fcd6b9 100644
--- a/test/MC/AsmParser/macros.s
+++ b/test/MC/AsmParser/macros.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err | FileCheck %s
+// RUN: not llvm-mc -triple i386-unknown-unknown %s 2> %t.err | FileCheck %s
 // RUN: FileCheck --check-prefix=CHECK-ERRORS %s < %t.err
 
 .macro .test0
@@ -28,12 +28,66 @@ test2 10
 .globl "$0 $1 $2 $$3 $n"
 .endmacro
 
-// CHECK: .globl	"1 23  $3 2"
-test3 1,2 3
+// CHECK: .globl	"1 (23)  $3 2"
+test3 1, (2 3)
+
+// CHECK: .globl "1 2  $3 2"
+test3 1 2
 
 .macro test4
 .globl "$0 -- $1"
 .endmacro
 
-// CHECK: .globl	"ab)(,) -- (cd)"
-test4 a b)(,),(cd)
+// CHECK: .globl  "(ab)(,)) -- (cd)"
+test4 (a b)(,)),(cd)
+
+// CHECK: .globl  "(ab)(,)) -- (cd)"
+test4 (a b)(,)),(cd)
+
+.macro test5 _a
+.globl "\_a"
+.endm
+
+// CHECK: .globl zed1
+test5 zed1
+
+.macro test6 $a
+.globl "\$a"
+.endm
+
+// CHECK: .globl zed2
+test6 zed2
+
+.macro test7 .a
+.globl "\.a"
+.endm
+
+// CHECK: .globl zed3
+test7 zed3
+
+.macro test8 _a, _b, _c
+.globl "\_a,\_b,\_c"
+.endmacro
+
+.macro test9 _a _b _c
+.globl "\_a \_b \_c"
+.endmacro
+
+// CHECK: .globl  "a,b,c"
+test8 a, b, c
+// CHECK: .globl  "%1,%2,%3"
+test8 %1 %2 %3 #a comment
+// CHECK: .globl "x-y,z,1"
+test8 x - y z 1
+// CHECK: .globl  "1 2 3"
+test9 1, 2,3
+
+test8 1,2 3
+// CHECK-ERRORS: error: macro argument '_c' is missing
+// CHECK-ERRORS-NEXT: test8 1,2 3
+// CHECK-ERRORS-NEXT:           ^
+
+test8 1 2, 3
+// CHECK-ERRORS: error: expected ' ' for macro argument separator
+// CHECK-ERRORS-NEXT:test8 1 2, 3
+// CHECK-ERRORS-NEXT:         ^
diff --git a/test/MC/COFF/comm.ll b/test/MC/COFF/comm.ll
new file mode 100644
index 0000000000000..74da557fb5cce
--- /dev/null
+++ b/test/MC/COFF/comm.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck %s
+
+@a = internal global i8 0, align 1
+@b = internal global double 0.000000e+00, align 8
+@c = common global i8 0, align 1
+@d = common global double 0.000000e+00, align 8
+
+; .lcomm uses byte alignment
+; CHECK: .lcomm	_a,1
+; CHECK: .lcomm	_b,8,8
+; .comm uses log2 alignment
+; CHECK: .comm	_c,1,0
+; CHECK: .comm	_d,8,3
diff --git a/test/MC/COFF/global_ctors.ll b/test/MC/COFF/global_ctors_dtors.ll
index 4d6b1c7d99138..2a25219a778cc 100644
--- a/test/MC/COFF/global_ctors.ll
+++ b/test/MC/COFF/global_ctors_dtors.ll
@@ -1,14 +1,16 @@
 ; Test that global ctors are emitted into the proper COFF section for the
 ; target. Mingw uses .ctors, whereas MSVC uses .CRT$XC*.
-; RUN: llc < %s -mtriple i686-pc-win32 | FileCheck %s --check-prefix WIN32 
-; RUN: llc < %s -mtriple x86_64-pc-win32 | FileCheck %s --check-prefix WIN32 
-; RUN: llc < %s -mtriple i686-pc-mingw32 | FileCheck %s --check-prefix MINGW32 
-; RUN: llc < %s -mtriple x86_64-pc-mingw32 | FileCheck %s --check-prefix MINGW32 
+; RUN: llc < %s -mtriple i686-pc-win32 | FileCheck %s --check-prefix WIN32
+; RUN: llc < %s -mtriple x86_64-pc-win32 | FileCheck %s --check-prefix WIN32
+; RUN: llc < %s -mtriple i686-pc-mingw32 | FileCheck %s --check-prefix MINGW32
+; RUN: llc < %s -mtriple x86_64-pc-mingw32 | FileCheck %s --check-prefix MINGW32
 
 @.str = private unnamed_addr constant [13 x i8] c"constructing\00", align 1
-@.str2 = private unnamed_addr constant [5 x i8] c"main\00", align 1
+@.str2 = private unnamed_addr constant [12 x i8] c"destructing\00", align 1
+@.str3 = private unnamed_addr constant [5 x i8] c"main\00", align 1
 
 @llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @a_global_ctor }]
+@llvm.global_dtors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @a_global_dtor }]
 
 declare i32 @puts(i8*)
 
@@ -17,12 +19,21 @@ define void @a_global_ctor() nounwind {
   ret void
 }
 
+define void @a_global_dtor() nounwind {
+  %1 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str2, i32 0, i32 0))
+  ret void
+}
+
 define i32 @main() nounwind {
-  %1 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @.str2, i32 0, i32 0))
+  %1 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @.str3, i32 0, i32 0))
   ret i32 0
 }
 
 ; WIN32: .section .CRT$XCU,"r"
 ; WIN32: a_global_ctor
+; WIN32: .section .CRT$XTX,"r"
+; WIN32: a_global_dtor
 ; MINGW32: .section .ctors,"w"
 ; MINGW32: a_global_ctor
+; MINGW32: .section .dtors,"w"
+; MINGW32: a_global_dtor
diff --git a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
index 5ba7d618bfd7c..00b85264686d2 100644
--- a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
@@ -1,5 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknwon -mcpu=cortex-a8 2>&1 | grep "invalid instruction encoding"
-# XFAIL: *
+# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknwon -mcpu=cortex-a8 2>&1 | FileCheck %s
 
 # Opcode=737 Name=VLD1DUPq8_UPD Format=ARM_FORMAT_NLdSt(30)
 #  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
@@ -9,3 +8,4 @@
 # 
 # 'a' == 1 and data_size == 8 is invalid
 0x3d 0x3c 0xa0 0xf4
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt
new file mode 100644
index 0000000000000..9bb0995ecef8a
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0x10 0x08
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt
new file mode 100644
index 0000000000000..84c98bfbcaf48
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0xc0 0x0f
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt
new file mode 100644
index 0000000000000..9024b09531cfd
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0x30 0x0b
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt
new file mode 100644
index 0000000000000..9462812f26d1d
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0x80 0xf9 0x10 0x08
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt
new file mode 100644
index 0000000000000..f6e71bcfd65be
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0x80 0xf9 0x30 0x0b
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/marked-up-thumb.txt b/test/MC/Disassembler/ARM/marked-up-thumb.txt
new file mode 100644
index 0000000000000..65be28618bac4
--- /dev/null
+++ b/test/MC/Disassembler/ARM/marked-up-thumb.txt
@@ -0,0 +1,7 @@
+# RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -mdis < %s | FileCheck %s
+# CHECK: ldr  <reg:r4>, <mem:[pc, <imm:#32>]>
+0x08 0x4c
+# CHECK: push	{<reg:r1>, <reg:r2>, <reg:r7>}
+0x86 0xb4
+# CHECK: sub	<reg:sp>, <imm:#132>
+0xa1 0xb0
diff --git a/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt
new file mode 100644
index 0000000000000..e53739e73975d
--- /dev/null
+++ b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt
@@ -0,0 +1,77 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s | FileCheck %s
+
+0xa0 0xf9 0x00 0x00
+0xa0 0xf9 0x20 0x00
+0xa0 0xf9 0x40 0x00
+0xa0 0xf9 0x60 0x00
+0xa0 0xf9 0x80 0x00
+0xa0 0xf9 0xa0 0x00
+0xa0 0xf9 0xc0 0x00
+0xa0 0xf9 0xe0 0x00
+
+# CHECK: vld1.8  {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x00]
+# CHECK: vld1.8  {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x00]
+# CHECK: vld1.8  {d0[2]}, [r0], r0 @ encoding: [0xa0,0xf9,0x40,0x00]
+# CHECK: vld1.8  {d0[3]}, [r0], r0 @ encoding: [0xa0,0xf9,0x60,0x00]
+# CHECK: vld1.8  {d0[4]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x00]
+# CHECK: vld1.8  {d0[5]}, [r0], r0 @ encoding: [0xa0,0xf9,0xa0,0x00]
+# CHECK: vld1.8  {d0[6]}, [r0], r0 @ encoding: [0xa0,0xf9,0xc0,0x00]
+# CHECK: vld1.8  {d0[7]}, [r0], r0 @ encoding: [0xa0,0xf9,0xe0,0x00]
+
+0xa0 0xf9 0x00 0x04
+0xa0 0xf9 0x10 0x04
+0xa0 0xf9 0x40 0x04
+0xa0 0xf9 0x50 0x04
+0xa0 0xf9 0x80 0x04
+0xa0 0xf9 0x90 0x04
+0xa0 0xf9 0xc0 0x04
+0xa0 0xf9 0xd0 0x04
+
+# CHECK: vld1.16 {d0[0]}, [r0], r0      @ encoding: [0xa0,0xf9,0x00,0x04]
+# CHECK: vld1.16 {d0[0]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x10,0x04]
+# CHECK: vld1.16 {d0[1]}, [r0], r0      @ encoding: [0xa0,0xf9,0x40,0x04]
+# CHECK: vld1.16 {d0[1]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x50,0x04]
+# CHECK: vld1.16 {d0[2]}, [r0], r0      @ encoding: [0xa0,0xf9,0x80,0x04]
+# CHECK: vld1.16 {d0[2]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x90,0x04]
+# CHECK: vld1.16 {d0[3]}, [r0], r0      @ encoding: [0xa0,0xf9,0xc0,0x04]
+# CHECK: vld1.16 {d0[3]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0xd0,0x04]
+
+0xa0 0xf9 0x00 0x08
+0xa0 0xf9 0x30 0x08
+0xa0 0xf9 0x80 0x08
+0xa0 0xf9 0xb0 0x08
+
+# CHECK: vld1.32 {d0[0]}, [r0], r0      @ encoding: [0xa0,0xf9,0x00,0x08]
+# CHECK: vld1.32 {d0[0]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0x30,0x08]
+# CHECK: vld1.32 {d0[1]}, [r0], r0      @ encoding: [0xa0,0xf9,0x80,0x08]
+# CHECK: vld1.32 {d0[1]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0xb0,0x08]
+
+0xa0 0xf9 0x1f 0x04
+0xa0 0xf9 0x8f 0x00
+
+# CHECK: vld1.16 {d0[0]}, [r0, :16] @ encoding: [0xa0,0xf9,0x1f,0x04]
+# CHECK: vld1.8  {d0[4]}, [r0]      @ encoding: [0xa0,0xf9,0x8f,0x00]
+
+0xa0 0xf9 0x1d 0x04
+0xa0 0xf9 0x8d 0x00
+
+# CHECK: vld1.16 {d0[0]}, [r0, :16]! @ encoding: [0xa0,0xf9,0x1d,0x04]
+# CHECK: vld1.8  {d0[4]}, [r0]!      @ encoding: [0xa0,0xf9,0x8d,0x00]
+
+0xa5 0xf9 0x10 0x04
+0xa5 0xf9 0x1a 0x04
+0xae 0xf9 0x1a 0x04
+0xa5 0xf9 0x1a 0x94
+
+# CHECK: vld1.16 {d0[0]}, [r5, :16], r0  @ encoding: [0xa5,0xf9,0x10,0x04]
+# CHECK: vld1.16 {d0[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x04]
+# CHECK: vld1.16 {d0[0]}, [lr, :16], r10 @ encoding: [0xae,0xf9,0x1a,0x04]
+# CHECK: vld1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x94]
+
+0xa0 0xf9 0x20 0x0b
+0xa0 0xf9 0x20 0x07
+0xa0 0xf9 0x20 0x03
+
+# CHECK: vld4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0, :128], r0 @ encoding: [0xa0,0xf9,0x20,0x0b]
+# CHECK: vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r0       @ encoding: [0xa0,0xf9,0x20,0x07]
+# CHECK: vld4.8  {d0[1], d1[1], d2[1], d3[1]}, [r0], r0       @ encoding: [0xa0,0xf9,0x20,0x03]
diff --git a/test/MC/Disassembler/ARM/neont-VST-reencoding.txt b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt
new file mode 100644
index 0000000000000..eb3722c08531e
--- /dev/null
+++ b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt
@@ -0,0 +1,77 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s | FileCheck %s
+
+0x80 0xf9 0x00 0x00
+0x81 0xf9 0x21 0x10
+0x81 0xf9 0x42 0x10
+0x81 0xf9 0x61 0x20
+0x82 0xf9 0x82 0x20
+0x82 0xf9 0xa1 0x10
+0x82 0xf9 0xc2 0x20
+0x83 0xf9 0xe3 0x30
+
+# CHECK: vst1.8  {d0[0]}, [r0], r0 @ encoding: [0x80,0xf9,0x00,0x00]
+# CHECK: vst1.8  {d1[1]}, [r1], r1 @ encoding: [0x81,0xf9,0x21,0x10]
+# CHECK: vst1.8  {d1[2]}, [r1], r2 @ encoding: [0x81,0xf9,0x42,0x10]
+# CHECK: vst1.8  {d2[3]}, [r1], r1 @ encoding: [0x81,0xf9,0x61,0x20]
+# CHECK: vst1.8  {d2[4]}, [r2], r2 @ encoding: [0x82,0xf9,0x82,0x20]
+# CHECK: vst1.8  {d1[5]}, [r2], r1 @ encoding: [0x82,0xf9,0xa1,0x10]
+# CHECK: vst1.8  {d2[6]}, [r2], r2 @ encoding: [0x82,0xf9,0xc2,0x20]
+# CHECK: vst1.8  {d3[7]}, [r3], r3 @ encoding: [0x83,0xf9,0xe3,0x30]
+
+0x80 0xf9 0x00 0x04
+0xc3 0xf9 0x13 0x04
+0xc4 0xf9 0x43 0x04
+0xc5 0xf9 0x55 0x04
+0xc6 0xf9 0x85 0x04
+0xc7 0xf9 0x95 0x74
+0xc8 0xf9 0xc7 0x84
+0xc9 0xf9 0xd9 0x94
+
+# CHECK: vst1.16 {d0[0]},  [r0], r0      @ encoding: [0x80,0xf9,0x00,0x04]
+# CHECK: vst1.16 {d16[0]}, [r3, :16], r3 @ encoding: [0xc3,0xf9,0x13,0x04]
+# CHECK: vst1.16 {d16[1]}, [r4], r3      @ encoding: [0xc4,0xf9,0x43,0x04]
+# CHECK: vst1.16 {d16[1]}, [r5, :16], r5 @ encoding: [0xc5,0xf9,0x55,0x04]
+# CHECK: vst1.16 {d16[2]}, [r6], r5      @ encoding: [0xc6,0xf9,0x85,0x04]
+# CHECK: vst1.16 {d23[2]}, [r7, :16], r5 @ encoding: [0xc7,0xf9,0x95,0x74]
+# CHECK: vst1.16 {d24[3]}, [r8], r7      @ encoding: [0xc8,0xf9,0xc7,0x84]
+# CHECK: vst1.16 {d25[3]}, [r9, :16], r9 @ encoding: [0xc9,0xf9,0xd9,0x94]
+
+0x8a 0xf9 0x01 0xa8
+0xcb 0xf9 0x32 0x18
+0x8c 0xf9 0x83 0xb8
+0xcd 0xf9 0xb4 0x28
+
+# CHECK: vst1.32 {d10[0]}, [r10], r1      @ encoding: [0x8a,0xf9,0x01,0xa8]
+# CHECK: vst1.32 {d17[0]}, [r11, :32], r2 @ encoding: [0xcb,0xf9,0x32,0x18]
+# CHECK: vst1.32 {d11[1]}, [r12], r3      @ encoding: [0x8c,0xf9,0x83,0xb8]
+# CHECK: vst1.32 {d18[1]}, [sp, :32], r4  @ encoding: [0xcd,0xf9,0xb4,0x28]
+
+0x81 0xf9 0x1f 0x44
+0x82 0xf9 0x8f 0x30
+
+# CHECK: vst1.16 {d4[0]}, [r1, :16] @ encoding: [0x81,0xf9,0x1f,0x44]
+# CHECK: vst1.8  {d3[4]}, [r2]      @ encoding: [0x82,0xf9,0x8f,0x30]
+
+0x83 0xf9 0x1d 0x24
+0x84 0xf9 0x8d 0x10
+
+# CHECK: vst1.16 {d2[0]}, [r3, :16]! @ encoding: [0x83,0xf9,0x1d,0x24]
+# CHECK: vst1.8  {d1[4]}, [r4]!      @ encoding: [0x84,0xf9,0x8d,0x10]
+
+0x85 0xf9 0x10 0x04
+0x85 0xf9 0x1a 0x74
+0x8e 0xf9 0x1a 0x84
+0x85 0xf9 0x1a 0x94
+
+# CHECK: vst1.16 {d0[0]}, [r5, :16], r0  @ encoding: [0x85,0xf9,0x10,0x04]
+# CHECK: vst1.16 {d7[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x74]
+# CHECK: vst1.16 {d8[0]}, [lr, :16], r10 @ encoding: [0x8e,0xf9,0x1a,0x84]
+# CHECK: vst1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x94]
+
+0x81 0xf9 0x24 0x0b
+0x82 0xf9 0x25 0x07
+0x83 0xf9 0x26 0x03
+
+# CHECK: vst4.32 {d0[0], d1[0], d2[0], d3[0]}, [r1, :128], r4 @ encoding: [0x81,0xf9,0x24,0x0b]
+# CHECK: vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r2], r5       @ encoding: [0x82,0xf9,0x25,0x07]
+# CHECK: vst4.8  {d0[1], d1[1], d2[1], d3[1]}, [r3], r6       @ encoding: [0x83,0xf9,0x26,0x03]
diff --git a/test/MC/Disassembler/ARM/thumb-printf.txt b/test/MC/Disassembler/ARM/thumb-printf.txt
index 8158a73edcb5f..ca820444adc76 100644
--- a/test/MC/Disassembler/ARM/thumb-printf.txt
+++ b/test/MC/Disassembler/ARM/thumb-printf.txt
@@ -7,17 +7,17 @@
 # CHECK-NEXT:	add	r3, sp, #20
 # CHECK-NEXT:	ldr	r5, [r3], #4
 # CHECK-NEXT:	str	r3, [sp]
-# CHECK-NEXT:	ldr	r3, #52
+# CHECK-NEXT:	ldr	r3, [pc, #52]
 # CHECK-NEXT:	add	r3, pc
 # CHECK-NEXT:	ldr	r0, [r3]
 # CHECK-NEXT:	ldr	r4, [r0]
-# CHECK-NEXT:	ldr	r0, #48
+# CHECK-NEXT:	ldr	r0, [pc, #48]
 # CHECK-NEXT:	add	r0, pc
 # CHECK-NEXT:	ldr	r0, [r0]
 # CHECK-NEXT:	ldr	r0, [r0]
 # CHECK-NEXT:	blx	#191548
 # CHECK-NEXT:	cbnz	r0, #6
-# CHECK-NEXT:	ldr	r1, #40
+# CHECK-NEXT:	ldr	r1, [pc, #40]
 # CHECK-NEXT:	add	r1, pc
 # CHECK-NEXT:	ldr	r1, [r1]
 # CHECK-NEXT:	b	#0
diff --git a/test/MC/Disassembler/ARM/thumb-tests.txt b/test/MC/Disassembler/ARM/thumb-tests.txt
index c08585a371976..757ce6e3977bd 100644
--- a/test/MC/Disassembler/ARM/thumb-tests.txt
+++ b/test/MC/Disassembler/ARM/thumb-tests.txt
@@ -30,7 +30,7 @@
 # CHECK:	ldm	r0!, {r1}
 0x02 0xc8
 
-# CHECK:	ldr	r5, #432
+# CHECK:	ldr	r5, [pc, #432]
 0x6c 0x4d
 
 # CHECK:	str	r0, [r3]
diff --git a/test/MC/Disassembler/ARM/thumb1.txt b/test/MC/Disassembler/ARM/thumb1.txt
index 5b7026231096c..de9596aab732c 100644
--- a/test/MC/Disassembler/ARM/thumb1.txt
+++ b/test/MC/Disassembler/ARM/thumb1.txt
@@ -160,6 +160,7 @@
 # CHECK: ldr r1, [sp]
 # CHECK: ldr r2, [sp, #24]
 # CHECK: ldr r3, [sp, #1020]
+# CHECK: ldr r1, [pc, #12]
 
 
 0x29 0x68
@@ -168,6 +169,7 @@
 0x00 0x99
 0x06 0x9a
 0xff 0x9b
+0x03 0x49
 
 #------------------------------------------------------------------------------
 # LDR (register)
diff --git a/test/MC/Disassembler/ARM/thumb2.txt b/test/MC/Disassembler/ARM/thumb2.txt
index 42ebe58207b3c..45dace3b09c5e 100644
--- a/test/MC/Disassembler/ARM/thumb2.txt
+++ b/test/MC/Disassembler/ARM/thumb2.txt
@@ -169,6 +169,9 @@
 
 0x13 0xf5 0xce 0xa9
 
+# CHECK: b.w   #208962
+
+0x33 0xf0 0x21 0xb8 # rdar://12585795
 
 #------------------------------------------------------------------------------
 # BFC
diff --git a/test/MC/Disassembler/Mips/mips64.txt b/test/MC/Disassembler/Mips/mips64.txt
index 095ed181ba819..0a88c40839fa7 100644
--- a/test/MC/Disassembler/Mips/mips64.txt
+++ b/test/MC/Disassembler/Mips/mips64.txt
@@ -3,7 +3,7 @@
 # CHECK: daddiu $11, $26, 31949
 0x67 0x4b 0x7c 0xcd
 
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
 0x00 0x2b 0xd0 0x2d
 
 # CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@
 # CHECK: dsllv $gp, $27, $24
 0x03 0x1b 0xe0 0x14
 
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
 0x00 0x01 0x0f 0xbb
 
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
 0x03 0xc1 0x08 0x17
 
 # CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@
 # CHECK: dsubu $gp, $27, $24
 0x03 0x78 0xe0 0x2f
 
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
 0x8c 0x3b 0xc4 0xcd
 
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
 0x3c 0x01 0x00 0x01
 
 # CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@
 # CHECK: lui $ra, 1
 0x3c 0x1f 0x00 0x01
 
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
 0xac 0x3a 0xc4 0xc9
 
 # CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64_le.txt b/test/MC/Disassembler/Mips/mips64_le.txt
index c4e5591da4f95..fe8faffa83353 100644
--- a/test/MC/Disassembler/Mips/mips64_le.txt
+++ b/test/MC/Disassembler/Mips/mips64_le.txt
@@ -3,7 +3,7 @@
 # CHECK: daddiu $11, $26, 31949
 0xcd 0x7c 0x4b 0x67
 
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
 0x2d 0xd0 0x2b 0x00
 
 # CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@
 # CHECK: dsllv $gp, $27, $24
 0x14 0xe0 0x1b 0x03
 
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
 0xbb 0x0f 0x01 0x00
 
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
 0x17 0x08 0xc1 0x03
 
 # CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@
 # CHECK: dsubu $gp, $27, $24
 0x2f 0xe0 0x78 0x03
 
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
 0xcd 0xc4 0x3b 0x8c
 
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
 0x01 0x00 0x01 0x3c
 
 # CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@
 # CHECK: lui $ra, 1
 0x01 0x00 0x1f 0x3c
 
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
 0xc9 0xc4 0x3a 0xac
 
 # CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64r2.txt b/test/MC/Disassembler/Mips/mips64r2.txt
index 41808c724e247..2dfde0d231c60 100644
--- a/test/MC/Disassembler/Mips/mips64r2.txt
+++ b/test/MC/Disassembler/Mips/mips64r2.txt
@@ -3,7 +3,7 @@
 # CHECK: daddiu $11, $26, 31949
 0x67 0x4b 0x7c 0xcd
 
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
 0x00 0x2b 0xd0 0x2d
 
 # CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@
 # CHECK: dsllv $gp, $27, $24
 0x03 0x1b 0xe0 0x14
 
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
 0x00 0x01 0x0f 0xbb
 
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
 0x03 0xc1 0x08 0x17
 
 # CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@
 # CHECK: dsubu $gp, $27, $24
 0x03 0x78 0xe0 0x2f
 
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
 0x8c 0x3b 0xc4 0xcd
 
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
 0x3c 0x01 0x00 0x01
 
 # CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@
 # CHECK: lui $ra, 1
 0x3c 0x1f 0x00 0x01
 
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
 0xac 0x3a 0xc4 0xc9
 
 # CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64r2_le.txt b/test/MC/Disassembler/Mips/mips64r2_le.txt
index 4987f80af9d84..620d9ebe8da32 100644
--- a/test/MC/Disassembler/Mips/mips64r2_le.txt
+++ b/test/MC/Disassembler/Mips/mips64r2_le.txt
@@ -3,7 +3,7 @@
 # CHECK: daddiu $11, $26, 31949
 0xcd 0x7c 0x4b 0x67
 
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
 0x2d 0xd0 0x2b 0x00
 
 # CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@
 # CHECK: dsllv $gp, $27, $24
 0x14 0xe0 0x1b 0x03
 
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
 0xbb 0x0f 0x01 0x00
 
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
 0x17 0x08 0xc1 0x03
 
 # CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@
 # CHECK: dsubu $gp, $27, $24
 0x2f 0xe0 0x78 0x03
 
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
 0xcd 0xc4 0x3b 0x8c
 
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
 0x01 0x00 0x01 0x3c
 
 # CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@
 # CHECK: lui $ra, 1
 0x01 0x00 0x1f 0x3c
 
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
 0xc9 0xc4 0x3a 0xac
 
 # CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/X86/marked-up.txt b/test/MC/Disassembler/X86/marked-up.txt
new file mode 100644
index 0000000000000..f0e51252f8d8b
--- /dev/null
+++ b/test/MC/Disassembler/X86/marked-up.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --mdis %s -triple=x86_64-apple-darwin9 2>&1 | FileCheck %s
+
+# CHECK: movq	<mem:<reg:%gs>:8>, <reg:%rcx>
+0x65 0x48 0x8b 0x0c 0x25 0x08 0x00 0x00 0x00
+# CHECK: xorps	<reg:%xmm1>, <reg:%xmm2>
+0x0f 0x57 0xd1
diff --git a/test/MC/ELF/cfi-reg.s b/test/MC/ELF/cfi-reg.s
new file mode 100644
index 0000000000000..fd68d6d5ad07e
--- /dev/null
+++ b/test/MC/ELF/cfi-reg.s
@@ -0,0 +1,18 @@
+// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -o - | FileCheck %s
+// PR13754
+
+f:
+	.cfi_startproc
+        nop
+	.cfi_offset 6, -16
+        nop
+	.cfi_offset %rsi, -16
+        nop
+	.cfi_offset rbx, -16
+        nop
+	.cfi_endproc
+
+// CHECK: f:
+// CHECK: .cfi_offset %rbp, -16
+// CHECK: .cfi_offset %rsi, -16
+// CHECK: .cfi_offset %rbx, -16
diff --git a/test/MC/ELF/lcomm.s b/test/MC/ELF/lcomm.s
new file mode 100644
index 0000000000000..ae8d0baa3323d
--- /dev/null
+++ b/test/MC/ELF/lcomm.s
@@ -0,0 +1,21 @@
+// RUN: llvm-mc -triple i386-pc-linux-gnu %s -filetype=obj -o - | elf-dump | FileCheck %s
+
+.lcomm A, 5
+.lcomm B, 32 << 20
+
+// CHECK: (('st_name', 0x00000001) # 'A'
+// CHECK:  ('st_value', 0x00000000)
+// CHECK:  ('st_size', 0x00000005)
+// CHECK:  ('st_bind', 0x0)
+// CHECK:  ('st_type', 0x1)
+// CHECK:  ('st_other', 0x00)
+// CHECK:  ('st_shndx', 0x0003)
+// CHECK: ),
+// CHECK: (('st_name', 0x00000003) # 'B'
+// CHECK:  ('st_value', 0x00000005)
+// CHECK:  ('st_size', 0x02000000)
+// CHECK:  ('st_bind', 0x0)
+// CHECK:  ('st_type', 0x1)
+// CHECK:  ('st_other', 0x00)
+// CHECK:  ('st_shndx', 0x0003)
+// CHECK: ),
diff --git a/test/MC/MachO/ARM/long-call-branch-island-relocation.s b/test/MC/MachO/ARM/long-call-branch-island-relocation.s
new file mode 100644
index 0000000000000..8ee7da54b5416
--- /dev/null
+++ b/test/MC/MachO/ARM/long-call-branch-island-relocation.s
@@ -0,0 +1,43 @@
+@ RUN: llvm-mc -n -triple armv7-apple-darwin10 %s -filetype=obj -o %t.o
+@ RUN: macho-dump --dump-section-data < %t.o | FileCheck %s
+
+@ rdar://12359919
+
+	.syntax unified
+	.text
+
+	.globl	_bar
+	.align	2
+	.code	16
+	.thumb_func	_bar
+_bar:
+	push	{r7, lr}
+	mov	r7, sp
+	bl	_foo
+	pop	{r7, pc}
+
+
+_junk:
+@ Make the _foo symbol sufficiently far away to force the 'bl' relocation
+@ above to be out of range. On Darwin, the assembler deals with this by
+@ generating an external relocation so the linker can create a branch
+@ island.
+
+  .space 20000000
+
+  .section	__TEXT,initcode,regular,pure_instructions
+
+	.globl	_foo
+	.align	2
+	.code	16
+_foo:
+	push	{r7, lr}
+	mov	r7, sp
+	pop	{r7, pc}
+
+
+@ CHECK:  ('_relocations', [
+@ CHECK:    # Relocation 0
+@ CHECK:    (('word-0', 0x4),
+@ CHECK:     ('word-1', 0x6d000002)),
+@ CHECK:  ])
diff --git a/test/MC/MachO/absolute.s b/test/MC/MachO/absolute.s
new file mode 100644
index 0000000000000..784e32a7e41d5
--- /dev/null
+++ b/test/MC/MachO/absolute.s
@@ -0,0 +1,158 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - | macho-dump | FileCheck %s
+
+_bar:
+  nop
+_foo:
+  nop
+
+  .set foo_set1, (_foo + 0xffff0000)
+  .set foo_set2, (_foo - _bar + 0xffff0000)
+
+foo_equals = (_foo + 0xffff0000)
+foo_equals2 = (_foo - _bar + 0xffff0000)
+
+  .globl foo_set1_global;
+  .set foo_set1_global, (_foo + 0xffff0000)
+
+  .globl foo_set2_global;
+  .set foo_set2_global, (_foo - _bar + 0xffff0000)
+
+// CHECK: ('cputype', 16777223)
+// CHECK: ('cpusubtype', 3)
+// CHECK: ('filetype', 1)
+// CHECK: ('num_load_commands', 3)
+// CHECK: ('load_commands_size', 256)
+// CHECK: ('flag', 0)
+// CHECK: ('reserved', 0)
+// CHECK: ('load_commands', [
+// CHECK:   # Load Command 0
+// CHECK:  (('command', 25)
+// CHECK:   ('size', 152)
+// CHECK:   ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK:   ('vm_addr', 0)
+// CHECK:   ('vm_size', 2)
+// CHECK:   ('file_offset', 288)
+// CHECK:   ('file_size', 2)
+// CHECK:   ('maxprot', 7)
+// CHECK:   ('initprot', 7)
+// CHECK:   ('num_sections', 1)
+// CHECK:   ('flags', 0)
+// CHECK:   ('sections', [
+// CHECK:     # Section 0
+// CHECK:    (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK:     ('address', 0)
+// CHECK:     ('size', 2)
+// CHECK:     ('offset', 288)
+// CHECK:     ('alignment', 0)
+// CHECK:     ('reloc_offset', 0)
+// CHECK:     ('num_reloc', 0)
+// CHECK:     ('flags', 0x80000400)
+// CHECK:     ('reserved1', 0)
+// CHECK:     ('reserved2', 0)
+// CHECK:     ('reserved3', 0)
+// CHECK:    ),
+// CHECK:   ('_relocations', [
+// CHECK:   ])
+// CHECK:   ])
+// CHECK:  ),
+// CHECK:   # Load Command 1
+// CHECK:  (('command', 2)
+// CHECK:   ('size', 24)
+// CHECK:   ('symoff', 292)
+// CHECK:   ('nsyms', 8)
+// CHECK:   ('stroff', 420)
+// CHECK:   ('strsize', 84)
+// CHECK:   ('_string_data', '\x00foo_set1_global\x00foo_set2_global\x00_bar\x00_foo\x00foo_set1\x00foo_set2\x00foo_equals\x00foo_equals2\x00')
+// CHECK:   ('_symbols', [
+// CHECK:     # Symbol 0
+// CHECK:    (('n_strx', 33)
+// CHECK:     ('n_type', 0xe)
+// CHECK:     ('n_sect', 1)
+// CHECK:     ('n_desc', 0)
+// CHECK:     ('n_value', 0)
+// CHECK:     ('_string', '_bar')
+// CHECK:    ),
+// CHECK:     # Symbol 1
+// CHECK:    (('n_strx', 38)
+// CHECK:     ('n_type', 0xe)
+// CHECK:     ('n_sect', 1)
+// CHECK:     ('n_desc', 0)
+// CHECK:     ('n_value', 1)
+// CHECK:     ('_string', '_foo')
+// CHECK:    ),
+// CHECK:     # Symbol 2
+// CHECK:    (('n_strx', 43)
+// CHECK:     ('n_type', 0xe)
+// CHECK:     ('n_sect', 1)
+// CHECK:     ('n_desc', 32)
+// CHECK:     ('n_value', 4294901761)
+// CHECK:     ('_string', 'foo_set1')
+// CHECK:    ),
+// CHECK:     # Symbol 3
+// CHECK:    (('n_strx', 52)
+// CHECK:     ('n_type', 0x2)
+// CHECK:     ('n_sect', 0)
+// CHECK:     ('n_desc', 32)
+// CHECK:     ('n_value', 4294901761)
+// CHECK:     ('_string', 'foo_set2')
+// CHECK:    ),
+// CHECK:     # Symbol 4
+// CHECK:    (('n_strx', 61)
+// CHECK:     ('n_type', 0xe)
+// CHECK:     ('n_sect', 1)
+// CHECK:     ('n_desc', 0)
+// CHECK:     ('n_value', 4294901761)
+// CHECK:     ('_string', 'foo_equals')
+// CHECK:    ),
+// CHECK:     # Symbol 5
+// CHECK:    (('n_strx', 72)
+// CHECK:     ('n_type', 0x2)
+// CHECK:     ('n_sect', 0)
+// CHECK:     ('n_desc', 0)
+// CHECK:     ('n_value', 4294901761)
+// CHECK:     ('_string', 'foo_equals2')
+// CHECK:    ),
+// CHECK:     # Symbol 6
+// CHECK:    (('n_strx', 1)
+// CHECK:     ('n_type', 0xf)
+// CHECK:     ('n_sect', 1)
+// CHECK:     ('n_desc', 32)
+// CHECK:     ('n_value', 4294901761)
+// CHECK:     ('_string', 'foo_set1_global')
+// CHECK:    ),
+// CHECK:     # Symbol 7
+// CHECK:    (('n_strx', 17)
+// CHECK:     ('n_type', 0x3)
+// CHECK:     ('n_sect', 0)
+// CHECK:     ('n_desc', 32)
+// CHECK:     ('n_value', 4294901761)
+// CHECK:     ('_string', 'foo_set2_global')
+// CHECK:    ),
+// CHECK:   ])
+// CHECK:  ),
+// CHECK:   # Load Command 2
+// CHECK:  (('command', 11)
+// CHECK:   ('size', 80)
+// CHECK:   ('ilocalsym', 0)
+// CHECK:   ('nlocalsym', 6)
+// CHECK:   ('iextdefsym', 6)
+// CHECK:   ('nextdefsym', 2)
+// CHECK:   ('iundefsym', 8)
+// CHECK:   ('nundefsym', 0)
+// CHECK:   ('tocoff', 0)
+// CHECK:   ('ntoc', 0)
+// CHECK:   ('modtaboff', 0)
+// CHECK:   ('nmodtab', 0)
+// CHECK:   ('extrefsymoff', 0)
+// CHECK:   ('nextrefsyms', 0)
+// CHECK:   ('indirectsymoff', 0)
+// CHECK:   ('nindirectsyms', 0)
+// CHECK:   ('extreloff', 0)
+// CHECK:   ('nextrel', 0)
+// CHECK:   ('locreloff', 0)
+// CHECK:   ('nlocrel', 0)
+// CHECK:   ('_indirect_symbols', [
+// CHECK:   ])
+// CHECK:  ),
+// CHECK: ])
diff --git a/test/MC/MachO/gen-dwarf-cpp.s b/test/MC/MachO/gen-dwarf-cpp.s
new file mode 100644
index 0000000000000..cb749f48eef66
--- /dev/null
+++ b/test/MC/MachO/gen-dwarf-cpp.s
@@ -0,0 +1,22 @@
+// RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t
+// RUN: llvm-dwarfdump %t | FileCheck %s
+
+# 100 "t.s" 1
+.globl _bar
+_bar:
+	movl	$0, %eax
+L1:	leave
+	ret
+
+// rdar://9275556
+
+// We check that the source name "t.s" is picked up
+// CHECK:                 Dir  Mod Time   File Len   File Name
+// CHECK:                 ---- ---------- ---------- ---------------------------
+// CHECK: file_names[  1]    1 0x00000000 0x00000000 gen-dwarf-cpp.s
+// CHECK: file_names[  2]    0 0x00000000 0x00000000 t.s
+
+// We check that the source line number 100 is picked up before the "movl"
+// CHECK: Address            Line   Column File   ISA Flags
+// CHECK: ------------------ ------ ------ ------ --- -------------
+// CHECK: 0x0000000000000000    102      0      2   0  is_stmt
diff --git a/test/MC/MachO/gen-dwarf-macro-cpp.s b/test/MC/MachO/gen-dwarf-macro-cpp.s
new file mode 100644
index 0000000000000..05a449b4027c2
--- /dev/null
+++ b/test/MC/MachO/gen-dwarf-macro-cpp.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t
+// RUN: llvm-dwarfdump %t | FileCheck %s
+
+# 1 "foo.S" 2
+.macro switcher
+        ljmp *0x38(%ecx)
+.endmacro
+        switcher NaClSwitchNoSSE, 0
+
+// PR14264 was a crash caused by the .macro directive not being handled correctly
+// rdar://12637628
+
+// We check that the source name "foo.S" is picked up
+// CHECK:                 Dir  Mod Time   File Len   File Name
+// CHECK:                 ---- ---------- ---------- ---------------------------
+// CHECK: file_names[  1]    1 0x00000000 0x00000000 gen-dwarf-macro-cpp.s
+// CHECK: file_names[  2]    0 0x00000000 0x00000000 foo.S
diff --git a/test/MC/MachO/i386-large-relocations.s b/test/MC/MachO/i386-large-relocations.s
new file mode 100644
index 0000000000000..e5a1cfb2c5efe
--- /dev/null
+++ b/test/MC/MachO/i386-large-relocations.s
@@ -0,0 +1,36 @@
+// RUN: llvm-mc -triple i386-apple-darwin10 %s -filetype=obj -o - | macho-dump | FileCheck %s
+
+.space 0x1ed280
+       .section        __DATA,__const
+       .align  4
+.space 0x5181020
+_foo:
+       .long   _bar
+       .long   0
+       .long   _bar+8
+       .long   _bar+24
+       .long   0
+       .long   _bar+16
+
+.zerofill __DATA,__bss,__dummy,0x5d780
+.zerofill __DATA,__bss,_bar,48,4
+
+// Normally scattered relocations are used for sym+offset expressions. When
+// the value exceeds 24-bits, however, it's outside what MachO can encode,
+// so the assembler falls back to non-scattered relocations.
+// rdar://12358909
+
+// CHECK: ('_relocations', [
+// CHECK:   # Relocation 0
+// CHECK:   (('word-0', 0x5181034),
+// CHECK:    ('word-1', 0x4000003)),
+// CHECK:   # Relocation 1
+// CHECK:   (('word-0', 0x518102c),
+// CHECK:    ('word-1', 0x4000003)),
+// CHECK:   # Relocation 2
+// CHECK:   (('word-0', 0x5181028),
+// CHECK:    ('word-1', 0x4000003)),
+// CHECK:   # Relocation 3
+// CHECK:   (('word-0', 0x5181020),
+// CHECK:    ('word-1', 0x4000003)),
+// CHECK: ])
diff --git a/test/MC/MachO/lit.local.cfg b/test/MC/MachO/lit.local.cfg
index 6c49f08b7496d..41a8434f99939 100644
--- a/test/MC/MachO/lit.local.cfg
+++ b/test/MC/MachO/lit.local.cfg
@@ -1,4 +1,4 @@
-config.suffixes = ['.s']
+config.suffixes = ['.s', '.ll']
 
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
diff --git a/test/MC/MachO/x86-data-in-code.ll b/test/MC/MachO/x86-data-in-code.ll
new file mode 100644
index 0000000000000..2410974c5ca37
--- /dev/null
+++ b/test/MC/MachO/x86-data-in-code.ll
@@ -0,0 +1,108 @@
+; RUN: llc -O0 -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | macho-dump | FileCheck %s
+
+; There should not be a data-in-code load command (type 0x29) for x86_64
+; jump tables, even though they are in the text section.
+; CHECK: 'num_load_commands'
+; CHECK-NOT: (('command', 41)
+
+define void @foo(i32* %ptr) nounwind ssp {
+  %tmp = load i32* %ptr, align 4
+  switch i32 %tmp, label %default [
+    i32 11, label %bb0
+    i32 10, label %bb1
+    i32 8, label %bb2
+    i32 4, label %bb3
+    i32 2, label %bb4
+    i32 6, label %bb5
+    i32 9, label %bb6
+    i32 15, label %bb7
+    i32 1, label %bb8
+    i32 3, label %bb9
+    i32 5, label %bb10
+    i32 30, label %bb11
+    i32 31, label %bb12
+    i32 13, label %bb13
+    i32 14, label %bb14
+    i32 20, label %bb15
+    i32 19, label %bb16
+    i32 17, label %bb17
+    i32 18, label %bb18
+    i32 21, label %bb19
+    i32 22, label %bb20
+    i32 16, label %bb21
+    i32 24, label %bb22
+    i32 25, label %bb23
+    i32 26, label %bb24
+    i32 27, label %bb25
+    i32 28, label %bb26
+    i32 23, label %bb27
+    i32 12, label %bb28
+  ]
+
+default:
+  br label %exit
+bb0:
+  br label %exit
+bb1:
+  br label %exit
+bb2:
+  br label %exit
+bb3:
+  br label %exit
+bb4:
+  br label %exit
+bb5:
+  br label %exit
+bb6:
+  br label %exit
+bb7:
+  br label %exit
+bb8:
+  br label %exit
+bb9:
+  br label %exit
+bb10:
+  br label %exit
+bb11:
+  br label %exit
+bb12:
+  br label %exit
+bb13:
+  br label %exit
+bb14:
+  br label %exit
+bb15:
+  br label %exit
+bb16:
+  br label %exit
+bb17:
+  br label %exit
+bb18:
+  br label %exit
+bb19:
+  br label %exit
+bb20:
+  br label %exit
+bb21:
+  br label %exit
+bb22:
+  br label %exit
+bb23:
+  br label %exit
+bb24:
+  br label %exit
+bb25:
+  br label %exit
+bb26:
+  br label %exit
+bb27:
+  br label %exit
+bb28:
+  br label %exit
+
+
+exit:
+
+  ret void
+}
+
diff --git a/test/MC/Markup/basic-markup.mc b/test/MC/Markup/basic-markup.mc
new file mode 100644
index 0000000000000..2fa5ebb28fa4c
--- /dev/null
+++ b/test/MC/Markup/basic-markup.mc
@@ -0,0 +1,16 @@
+// RUN: llvm-mcmarkup %s | FileCheck %s
+
+	push	{<reg:r1>, <reg:r2>, <reg:r7>}
+	sub	<reg:sp>, <imm:#132>
+	ldr	<reg:r0>, <mem:[<reg:r0>, <imm:#4>]>
+
+
+// CHECK: reg
+// CHECK: reg
+// CHECK: reg
+// CHECK: reg
+// CHECK: imm
+// CHECK: reg
+// CHECK: mem
+// CHECK: reg
+// CHECK: imm
diff --git a/test/MC/Markup/lit.local.cfg b/test/MC/Markup/lit.local.cfg
new file mode 100644
index 0000000000000..ab28eedae2125
--- /dev/null
+++ b/test/MC/Markup/lit.local.cfg
@@ -0,0 +1,2 @@
+config.suffixes = ['.mc']
+
diff --git a/test/MC/Mips/do_switch.ll b/test/MC/Mips/do_switch.ll
new file mode 100644
index 0000000000000..7eda1b41d18c7
--- /dev/null
+++ b/test/MC/Mips/do_switch.ll
@@ -0,0 +1,39 @@
+; This test case causes an internal EK_GPRel64BlockAddress to be
+; produced. This was not handled for direct object emission and caused an
+; assertion. This is a variation on test case test/CodeGen/Mips/do_switch.ll
+
+; RUN: llc < %s -filetype=obj -march=mips -relocation-model=static
+
+; RUN: llc < %s -filetype=obj -march=mips -relocation-model=pic
+
+; RUN: llc < %s -filetype=obj -march=mips64 -relocation-model=pic -mcpu=mips64 -mattr=n64 
+
+define i32 @main() nounwind readnone {
+entry:
+  %x = alloca i32, align 4                        ; <i32*> [#uses=2]
+  store volatile i32 2, i32* %x, align 4
+  %0 = load volatile i32* %x, align 4             ; <i32> [#uses=1]
+
+  switch i32 %0, label %bb4 [
+    i32 0, label %bb5
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+  ]
+
+bb1:                                              ; preds = %entry
+  ret i32 2
+
+bb2:                                              ; preds = %entry
+  ret i32 0
+
+bb3:                                              ; preds = %entry
+  ret i32 3
+
+bb4:                                              ; preds = %entry
+  ret i32 4
+
+bb5:                                              ; preds = %entry
+  ret i32 1
+}
+
diff --git a/test/MC/Mips/elf-N64.ll b/test/MC/Mips/elf-N64.ll
index 23ec53a2e26da..ae6de78d65520 100644
--- a/test/MC/Mips/elf-N64.ll
+++ b/test/MC/Mips/elf-N64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 %s -o - | elf-dump --dump-section-data  | FileCheck %s
+; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 -disable-mips-delay-filler %s -o - | elf-dump --dump-section-data  | FileCheck %s
 
 ; Check for N64 relocation production.
 ;
diff --git a/test/MC/Mips/higher_highest.ll b/test/MC/Mips/higher_highest.ll
index 81a89e3040e3d..0c665220335bc 100644
--- a/test/MC/Mips/higher_highest.ll
+++ b/test/MC/Mips/higher_highest.ll
@@ -1,5 +1,8 @@
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64  -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s
-
+; DISABLE: llc -march=mips64el -mcpu=mips64 -mattr=n64  -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s
+; RUN: false
+; XFAIL: *
+; Disabled because currently we don't have a way to generate these relocations.
+;
 ; Check that the R_MIPS_HIGHER and R_MIPS_HIGHEST relocations were created.
 
 ; CHECK:     ('r_type', 0x1d)
diff --git a/test/MC/Mips/mips-alu-instructions.s b/test/MC/Mips/mips-alu-instructions.s
new file mode 100644
index 0000000000000..2997782cd01b4
--- /dev/null
+++ b/test/MC/Mips/mips-alu-instructions.s
@@ -0,0 +1,100 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for arithmetic and logical instructions.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Logical instructions
+#------------------------------------------------------------------------------
+# CHECK:  and    $9, $6, $7      # encoding: [0x24,0x48,0xc7,0x00]
+# CHECK:  andi   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x30]
+# CHECK:  andi   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x30]
+# CHECK:  clo    $6, $7          # encoding: [0x21,0x30,0xe6,0x70]
+# CHECK:  clz    $6, $7          # encoding: [0x20,0x30,0xe6,0x70]
+# CHECK:  ins    $19, $9, 6, 7   # encoding: [0x84,0x61,0x33,0x7d]
+# CHECK:  nor    $9, $6, $7      # encoding: [0x27,0x48,0xc7,0x00]
+# CHECK:  or     $3, $3, $5      # encoding: [0x25,0x18,0x65,0x00]
+# CHECK:  ori    $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x34]
+# CHECK:  rotr   $9, $6, 7       # encoding: [0xc2,0x49,0x26,0x00]
+# CHECK:  rotrv  $9, $6, $7      # encoding: [0x46,0x48,0xe6,0x00]
+# CHECK:  sll    $4, $3, 7       # encoding: [0xc0,0x21,0x03,0x00]
+# CHECK:  sllv   $2, $3, $5      # encoding: [0x04,0x10,0xa3,0x00]
+# CHECK:  slt    $3, $3, $5      # encoding: [0x2a,0x18,0x65,0x00]
+# CHECK:  slti   $3, $3, 103     # encoding: [0x67,0x00,0x63,0x28]
+# CHECK:  slti   $3, $3, 103     # encoding: [0x67,0x00,0x63,0x28]
+# CHECK:  sltiu  $3, $3, 103     # encoding: [0x67,0x00,0x63,0x2c]
+# CHECK:  sltu   $3, $3, $5      # encoding: [0x2b,0x18,0x65,0x00]
+# CHECK:  sra    $4, $3, 7       # encoding: [0xc3,0x21,0x03,0x00]
+# CHECK:  srav   $2, $3, $5      # encoding: [0x07,0x10,0xa3,0x00]
+# CHECK:  srl    $4, $3, 7       # encoding: [0xc2,0x21,0x03,0x00]
+# CHECK:  srlv   $2, $3, $5      # encoding: [0x06,0x10,0xa3,0x00]
+# CHECK:  xor    $3, $3, $5      # encoding: [0x26,0x18,0x65,0x00]
+# CHECK:  xori    $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x38]
+# CHECK:  xori   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x38]
+# CHECK:  wsbh   $6, $7          # encoding: [0xa0,0x30,0x07,0x7c]
+# CHECK:  nor    $7, $8, $zero   # encoding: [0x27,0x38,0x00,0x01]
+     and    $9,  $6, $7
+     and    $9,  $6, 17767
+     andi   $9,  $6, 17767
+     clo    $6,  $7
+     clz    $6,  $7
+     ins    $19, $9, 6,7
+     nor    $9,  $6, $7
+     or     $3,  $3, $5
+     ori    $9,  $6, 17767
+     rotr   $9,  $6, 7
+     rotrv  $9,  $6, $7
+     sll    $4,  $3, 7
+     sllv   $2,  $3, $5
+     slt    $3,  $3, $5
+     slt    $3,  $3, 103
+     slti   $3,  $3, 103
+     sltiu  $3,  $3, 103
+     sltu   $3,  $3, $5
+     sra    $4,  $3, 7
+     srav   $2,  $3, $5
+     srl    $4,  $3, 7
+     srlv   $2,  $3, $5
+     xor    $3,  $3, $5
+     xor    $9,  $6, 17767
+     xori   $9,  $6, 17767
+     wsbh   $6,  $7
+     not    $7  ,$8
+
+#------------------------------------------------------------------------------
+# Arithmetic instructions
+#------------------------------------------------------------------------------
+
+# CHECK:  add    $9, $6, $7      # encoding: [0x20,0x48,0xc7,0x00]
+# CHECK:  addi   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x20]
+# CHECK:  addiu  $9, $6, -15001  # encoding: [0x67,0xc5,0xc9,0x24]
+# CHECK:  addi   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x20]
+# CHECK:  addiu  $9, $6, -15001  # encoding: [0x67,0xc5,0xc9,0x24]
+# CHECK:  addu   $9, $6, $7      # encoding: [0x21,0x48,0xc7,0x00]
+# CHECK:  madd   $6, $7          # encoding: [0x00,0x00,0xc7,0x70]
+# CHECK:  maddu  $6, $7          # encoding: [0x01,0x00,0xc7,0x70]
+# CHECK:  msub   $6, $7          # encoding: [0x04,0x00,0xc7,0x70]
+# CHECK:  msubu  $6, $7          # encoding: [0x05,0x00,0xc7,0x70]
+# CHECK:  mult   $3, $5          # encoding: [0x18,0x00,0x65,0x00]
+# CHECK:  multu  $3, $5          # encoding: [0x19,0x00,0x65,0x00]
+# CHECK:  sub    $9, $6, $7      # encoding: [0x22,0x48,0xc7,0x00]
+# CHECK:  subu   $4, $3, $5      # encoding: [0x23,0x20,0x65,0x00]
+# CHECK:  sub     $6, $zero, $7  # encoding: [0x22,0x30,0x07,0x00]
+# CHECK:  subu    $6, $zero, $7  # encoding: [0x23,0x30,0x07,0x00]
+# CHECK:  add     $7, $8, $zero  # encoding: [0x20,0x38,0x00,0x01]
+    add    $9,$6,$7
+    add    $9,$6,17767
+    addu   $9,$6,-15001
+    addi   $9,$6,17767
+    addiu  $9,$6,-15001
+    addu   $9,$6,$7
+    madd   $6,$7
+    maddu  $6,$7
+    msub   $6,$7
+    msubu  $6,$7
+    mult   $3,$5
+    multu  $3,$5
+    sub    $9,$6,$7
+    subu   $4,$3,$5
+    neg    $6,$7
+    negu   $6,$7
+    move   $7,$8
diff --git a/test/MC/Mips/mips-coprocessor-encodings.s b/test/MC/Mips/mips-coprocessor-encodings.s
new file mode 100644
index 0000000000000..bad9163ba9fa4
--- /dev/null
+++ b/test/MC/Mips/mips-coprocessor-encodings.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding | FileCheck --check-prefix=MIPS64 %s
+
+# MIPS64:	dmtc0	$12, $16, 2             # encoding: [0x40,0xac,0x80,0x02]
+# MIPS64:	dmtc0	$12, $16, 0             # encoding: [0x40,0xac,0x80,0x00]
+# MIPS64:	mtc0	$12, $16, 2             # encoding: [0x40,0x8c,0x80,0x02]
+# MIPS64:	mtc0	$12, $16, 0             # encoding: [0x40,0x8c,0x80,0x00]
+# MIPS64:	dmfc0	$12, $16, 2             # encoding: [0x40,0x2c,0x80,0x02]
+# MIPS64:	dmfc0	$12, $16, 0             # encoding: [0x40,0x2c,0x80,0x00]
+# MIPS64:	mfc0	$12, $16, 2             # encoding: [0x40,0x0c,0x80,0x02]
+# MIPS64:	mfc0	$12, $16, 0             # encoding: [0x40,0x0c,0x80,0x00]
+
+	dmtc0	$12, $16, 2
+	dmtc0	$12, $16
+	mtc0	$12, $16, 2
+	mtc0	$12, $16
+	dmfc0	$12, $16, 2
+	dmfc0	$12, $16
+	mfc0	$12, $16, 2
+	mfc0	$12, $16
+
+# MIPS64:	dmtc2	$12, $16, 2             # encoding: [0x48,0xac,0x80,0x02]
+# MIPS64:	dmtc2	$12, $16, 0             # encoding: [0x48,0xac,0x80,0x00]
+# MIPS64:	mtc2	$12, $16, 2             # encoding: [0x48,0x8c,0x80,0x02]
+# MIPS64:	mtc2	$12, $16, 0             # encoding: [0x48,0x8c,0x80,0x00]
+# MIPS64:	dmfc2	$12, $16, 2             # encoding: [0x48,0x2c,0x80,0x02]
+# MIPS64:	dmfc2	$12, $16, 0             # encoding: [0x48,0x2c,0x80,0x00]
+# MIPS64:	mfc2	$12, $16, 2             # encoding: [0x48,0x0c,0x80,0x02]
+# MIPS64:	mfc2	$12, $16, 0             # encoding: [0x48,0x0c,0x80,0x00]
+
+	dmtc2	$12, $16, 2
+	dmtc2	$12, $16
+	mtc2	$12, $16, 2
+	mtc2	$12, $16
+	dmfc2	$12, $16, 2
+	dmfc2	$12, $16
+	mfc2	$12, $16, 2
+	mfc2	$12, $16
diff --git a/test/MC/Mips/mips-expansions.s b/test/MC/Mips/mips-expansions.s
new file mode 100644
index 0000000000000..cfc15e883a951
--- /dev/null
+++ b/test/MC/Mips/mips-expansions.s
@@ -0,0 +1,27 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for macro instructions
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Load immediate instructions
+#------------------------------------------------------------------------------
+# CHECK: ori     $5, $zero, 123      # encoding: [0x7b,0x00,0x05,0x34]
+# CHECK: addiu   $6, $zero, -2345    # encoding: [0xd7,0xf6,0x06,0x24]
+# CHECK: lui     $7, 1               # encoding: [0x01,0x00,0x07,0x3c]
+# CHECK: ori     $7, $7, 2           # encoding: [0x02,0x00,0xe7,0x34]
+# CHECK: addiu   $4, $zero, 20       # encoding: [0x14,0x00,0x04,0x24]
+# CHECK: lui     $7, 1               # encoding: [0x01,0x00,0x07,0x3c]
+# CHECK: ori     $7, $7, 2           # encoding: [0x02,0x00,0xe7,0x34]
+# CHECK: addiu   $4, $5, 20          # encoding: [0x14,0x00,0xa4,0x24]
+# CHECK: lui     $7, 1               # encoding: [0x01,0x00,0x07,0x3c]
+# CHECK: ori     $7, $7, 2           # encoding: [0x02,0x00,0xe7,0x34]
+# CHECK: addu    $7, $7, $8          # encoding: [0x21,0x38,0xe8,0x00]
+
+    li $5,123
+    li $6,-2345
+    li $7,65538
+
+    la $a0, 20
+    la $7,65538
+    la $a0, 20($a1)
+    la $7,65538($8)
diff --git a/test/MC/Mips/mips-fpu-instructions.s b/test/MC/Mips/mips-fpu-instructions.s
new file mode 100644
index 0000000000000..a126c6f7188cd
--- /dev/null
+++ b/test/MC/Mips/mips-fpu-instructions.s
@@ -0,0 +1,178 @@
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for FPU instructions.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# FP arithmetic instructions
+#------------------------------------------------------------------------------
+
+# CHECK:  abs.d      $f12, $f14         # encoding: [0x05,0x73,0x20,0x46]
+# CHECK:  abs.s      $f6, $f7           # encoding: [0x85,0x39,0x00,0x46]
+# CHECK:  add.d      $f8, $f12, $f14    # encoding: [0x00,0x62,0x2e,0x46]
+# CHECK:  add.s      $f9, $f6, $f7      # encoding: [0x40,0x32,0x07,0x46]
+# CHECK:  floor.w.d  $f12, $f14         # encoding: [0x0f,0x73,0x20,0x46]
+# CHECK:  floor.w.s  $f6, $f7           # encoding: [0x8f,0x39,0x00,0x46]
+# CHECK:  ceil.w.d   $f12, $f14         # encoding: [0x0e,0x73,0x20,0x46]
+# CHECK:  ceil.w.s   $f6, $f7           # encoding: [0x8e,0x39,0x00,0x46]
+# CHECK:  mul.d      $f8, $f12, $f14    # encoding: [0x02,0x62,0x2e,0x46]
+# CHECK:  mul.s      $f9, $f6, $f7      # encoding: [0x42,0x32,0x07,0x46]
+# CHECK:  neg.d      $f12, $f14         # encoding: [0x07,0x73,0x20,0x46]
+# CHECK:  neg.s      $f6, $f7           # encoding: [0x87,0x39,0x00,0x46]
+# CHECK:  round.w.d  $f12, $f14         # encoding: [0x0c,0x73,0x20,0x46]
+# CHECK:  round.w.s  $f6, $f7           # encoding: [0x8c,0x39,0x00,0x46]
+# CHECK:  sqrt.d     $f12, $f14         # encoding: [0x04,0x73,0x20,0x46]
+# CHECK:  sqrt.s     $f6, $f7           # encoding: [0x84,0x39,0x00,0x46]
+# CHECK:  sub.d      $f8, $f12, $f14    # encoding: [0x01,0x62,0x2e,0x46]
+# CHECK:  sub.s      $f9, $f6, $f7      # encoding: [0x41,0x32,0x07,0x46]
+# CHECK:  trunc.w.d  $f12, $f14         # encoding: [0x0d,0x73,0x20,0x46]
+# CHECK:  trunc.w.s  $f6, $f7           # encoding: [0x8d,0x39,0x00,0x46]
+
+    abs.d      $f12,$f14
+    abs.s      $f6,$f7
+    add.d      $f8,$f12,$f14
+    add.s      $f9,$f6,$f7
+    floor.w.d  $f12,$f14
+    floor.w.s  $f6,$f7
+    ceil.w.d   $f12,$f14
+    ceil.w.s   $f6,$f7
+    mul.d      $f8,$f12,$f14
+    mul.s      $f9,$f6, $f7
+    neg.d      $f12,$f14
+    neg.s      $f6,$f7
+    round.w.d  $f12,$f14
+    round.w.s  $f6,$f7
+    sqrt.d     $f12,$f14
+    sqrt.s     $f6,$f7
+    sub.d      $f8,$f12,$f14
+    sub.s      $f9,$f6,$f7
+    trunc.w.d  $f12,$f14
+    trunc.w.s  $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP compare instructions
+#------------------------------------------------------------------------------
+
+# CHECK:  c.eq.d    $f12, $f14        # encoding: [0x32,0x60,0x2e,0x46]
+# CHECK:  c.eq.s    $f6, $f7          # encoding: [0x32,0x30,0x07,0x46]
+# CHECK:  c.f.d     $f12, $f14        # encoding: [0x30,0x60,0x2e,0x46]
+# CHECK:  c.f.s     $f6, $f7          # encoding: [0x30,0x30,0x07,0x46]
+# CHECK:  c.le.d    $f12, $f14        # encoding: [0x3e,0x60,0x2e,0x46]
+# CHECK:  c.le.s    $f6, $f7          # encoding: [0x3e,0x30,0x07,0x46]
+# CHECK:  c.lt.d    $f12, $f14        # encoding: [0x3c,0x60,0x2e,0x46]
+# CHECK:  c.lt.s    $f6, $f7          # encoding: [0x3c,0x30,0x07,0x46]
+# CHECK:  c.nge.d   $f12, $f14        # encoding: [0x3d,0x60,0x2e,0x46]
+# CHECK:  c.nge.s   $f6, $f7          # encoding: [0x3d,0x30,0x07,0x46]
+# CHECK:  c.ngl.d   $f12, $f14        # encoding: [0x3b,0x60,0x2e,0x46]
+# CHECK:  c.ngl.s   $f6, $f7          # encoding: [0x3b,0x30,0x07,0x46]
+# CHECK:  c.ngle.d  $f12, $f14        # encoding: [0x39,0x60,0x2e,0x46]
+# CHECK:  c.ngle.s  $f6, $f7          # encoding: [0x39,0x30,0x07,0x46]
+# CHECK:  c.ngt.d   $f12, $f14        # encoding: [0x3f,0x60,0x2e,0x46]
+# CHECK:  c.ngt.s   $f6, $f7          # encoding: [0x3f,0x30,0x07,0x46]
+# CHECK:  c.ole.d   $f12, $f14        # encoding: [0x36,0x60,0x2e,0x46]
+# CHECK:  c.ole.s   $f6, $f7          # encoding: [0x36,0x30,0x07,0x46]
+# CHECK:  c.olt.d   $f12, $f14        # encoding: [0x34,0x60,0x2e,0x46]
+# CHECK:  c.olt.s   $f6, $f7          # encoding: [0x34,0x30,0x07,0x46]
+# CHECK:  c.seq.d   $f12, $f14        # encoding: [0x3a,0x60,0x2e,0x46]
+# CHECK:  c.seq.s   $f6, $f7          # encoding: [0x3a,0x30,0x07,0x46]
+# CHECK:  c.sf.d    $f12, $f14        # encoding: [0x38,0x60,0x2e,0x46]
+# CHECK:  c.sf.s    $f6, $f7          # encoding: [0x38,0x30,0x07,0x46]
+# CHECK:  c.ueq.d   $f12, $f14        # encoding: [0x33,0x60,0x2e,0x46]
+# CHECK:  c.ueq.s   $f28, $f18        # encoding: [0x33,0xe0,0x12,0x46]
+# CHECK:  c.ule.d   $f12, $f14        # encoding: [0x37,0x60,0x2e,0x46]
+# CHECK:  c.ule.s   $f6, $f7          # encoding: [0x37,0x30,0x07,0x46]
+# CHECK:  c.ult.d   $f12, $f14        # encoding: [0x35,0x60,0x2e,0x46]
+# CHECK:  c.ult.s   $f6, $f7          # encoding: [0x35,0x30,0x07,0x46]
+# CHECK:  c.un.d    $f12, $f14        # encoding: [0x31,0x60,0x2e,0x46]
+# CHECK:  c.un.s    $f6, $f7          # encoding: [0x31,0x30,0x07,0x46]
+
+     c.eq.d    $f12,$f14
+     c.eq.s    $f6,$f7
+     c.f.d     $f12,$f14
+     c.f.s     $f6,$f7
+     c.le.d    $f12,$f14
+     c.le.s    $f6,$f7
+     c.lt.d    $f12,$f14
+     c.lt.s    $f6,$f7
+     c.nge.d   $f12,$f14
+     c.nge.s   $f6,$f7
+     c.ngl.d   $f12,$f14
+     c.ngl.s   $f6,$f7
+     c.ngle.d  $f12,$f14
+     c.ngle.s  $f6,$f7
+     c.ngt.d   $f12,$f14
+     c.ngt.s   $f6,$f7
+     c.ole.d   $f12,$f14
+     c.ole.s   $f6,$f7
+     c.olt.d   $f12,$f14
+     c.olt.s   $f6,$f7
+     c.seq.d   $f12,$f14
+     c.seq.s   $f6,$f7
+     c.sf.d    $f12,$f14
+     c.sf.s    $f6,$f7
+     c.ueq.d   $f12,$f14
+     c.ueq.s   $f28,$f18
+     c.ule.d   $f12,$f14
+     c.ule.s   $f6,$f7
+     c.ult.d   $f12,$f14
+     c.ult.s   $f6,$f7
+     c.un.d    $f12,$f14
+     c.un.s    $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP convert instructions
+#------------------------------------------------------------------------------
+# CHECK:  cvt.d.s   $f6, $f7          # encoding: [0xa1,0x39,0x00,0x46]
+# CHECK:  cvt.d.w   $f12, $f14        # encoding: [0x21,0x73,0x80,0x46]
+# CHECK:  cvt.s.d   $f12, $f14        # encoding: [0x20,0x73,0x20,0x46]
+# CHECK:  cvt.s.w   $f6, $f7          # encoding: [0xa0,0x39,0x80,0x46]
+# CHECK:  cvt.w.d   $f12, $f14        # encoding: [0x24,0x73,0x20,0x46]
+# CHECK:  cvt.w.s   $f6, $f7          # encoding: [0xa4,0x39,0x00,0x46]
+
+  cvt.d.s   $f6,$f7
+  cvt.d.w   $f12,$f14
+  cvt.s.d   $f12,$f14
+  cvt.s.w   $f6,$f7
+  cvt.w.d   $f12,$f14
+  cvt.w.s   $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP move instructions
+#------------------------------------------------------------------------------
+
+# CHECK:  cfc1    $6, $fcc0            # encoding: [0x00,0x00,0x46,0x44]
+# CHECK:  mfc1    $6, $f7              # encoding: [0x00,0x38,0x06,0x44]
+# CHECK:  mfhi    $5                   # encoding: [0x10,0x28,0x00,0x00]
+# CHECK:  mflo    $5                   # encoding: [0x12,0x28,0x00,0x00]
+# CHECK:  mov.d   $f6, $f8             # encoding: [0x86,0x41,0x20,0x46]
+# CHECK:  mov.s   $f6, $f7             # encoding: [0x86,0x39,0x00,0x46]
+# CHECK:  mtc1    $6, $f7              # encoding: [0x00,0x38,0x86,0x44]
+# CHECK:  mthi    $7                   # encoding: [0x11,0x00,0xe0,0x00]
+# CHECK:  mtlo    $7                   # encoding: [0x13,0x00,0xe0,0x00]
+# CHECK:  swc1    $f9, 9158($7)        # encoding: [0xc6,0x23,0xe9,0xe4]
+# CHECK:  mfc0    $6, $7, 0               # encoding: [0x00,0x38,0x06,0x40]
+# CHECK:  mtc0    $9, $8, 0               # encoding: [0x00,0x40,0x89,0x40]
+# CHECK:  mfc2    $5, $7, 0               # encoding: [0x00,0x38,0x05,0x48]
+# CHECK:  mtc2    $9, $4, 0               # encoding: [0x00,0x20,0x89,0x48]
+# CHECK:  mfc0    $6, $7, 2               # encoding: [0x02,0x38,0x06,0x40]
+# CHECK:  mtc0    $9, $8, 3               # encoding: [0x03,0x40,0x89,0x40]
+# CHECK:  mfc2    $5, $7, 4               # encoding: [0x04,0x38,0x05,0x48]
+# CHECK:  mtc2    $9, $4, 5               # encoding: [0x05,0x20,0x89,0x48]
+
+   cfc1    $a2,$0
+   mfc1    $a2,$f7
+   mfhi    $a1
+   mflo    $a1
+   mov.d   $f6,$f8
+   mov.s   $f6,$f7
+   mtc1    $a2,$f7
+   mthi    $a3
+   mtlo    $a3
+   swc1    $f9,9158($a3)
+   mfc0    $6, $7
+   mtc0    $9, $8
+   mfc2    $5, $7
+   mtc2    $9, $4
+   mfc0    $6, $7, 2
+   mtc0    $9, $8, 3
+   mfc2    $5, $7, 4
+   mtc2    $9, $4, 5
diff --git a/test/MC/Mips/mips-jump-instructions.s b/test/MC/Mips/mips-jump-instructions.s
new file mode 100644
index 0000000000000..998be418d2049
--- /dev/null
+++ b/test/MC/Mips/mips-jump-instructions.s
@@ -0,0 +1,72 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for jumps and branches.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Branch instructions
+#------------------------------------------------------------------------------
+# CHECK:   b 1332                 # encoding: [0x34,0x05,0x00,0x10]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   bc1f 1332              # encoding: [0x34,0x05,0x00,0x45]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   bc1t 1332              # encoding: [0x34,0x05,0x01,0x45]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   beq $9, $6, 1332       # encoding: [0x34,0x05,0x26,0x11]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   bgez $6, 1332          # encoding: [0x34,0x05,0xc1,0x04]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   bgezal $6, 1332        # encoding: [0x34,0x05,0xd1,0x04]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   bgtz $6, 1332          # encoding: [0x34,0x05,0xc0,0x1c]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   blez $6, 1332          # encoding: [0x34,0x05,0xc0,0x18]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   bne $9, $6, 1332       # encoding: [0x34,0x05,0x26,0x15]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   bal     1332           # encoding: [0x34,0x05,0x00,0x04]
+# CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
+         b 1332
+         nop
+         bc1f 1332
+         nop
+         bc1t 1332
+         nop
+         beq $9,$6,1332
+         nop
+         bgez $6,1332
+         nop
+         bgezal $6,1332
+         nop
+         bgtz $6,1332
+         nop
+         blez $6,1332
+         nop
+         bne $9,$6,1332
+         nop
+         bal 1332
+         nop
+
+end_of_code:
+#------------------------------------------------------------------------------
+# Jump instructions
+#------------------------------------------------------------------------------
+# CHECK:   j 1328               # encoding: [0x30,0x05,0x00,0x08]
+# CHECK:   nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   jal 1328             # encoding: [0x30,0x05,0x00,0x0c]
+# CHECK:   nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   jalr $6              # encoding: [0x09,0xf8,0xc0,0x00]
+# CHECK:   nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   jr $7                # encoding: [0x08,0x00,0xe0,0x00]
+# CHECK:   nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:   jr $7                # encoding: [0x08,0x00,0xe0,0x00]
+
+
+   j 1328
+   nop
+   jal 1328
+   nop
+   jalr $6
+   nop
+   jr $7
+   nop
+   j $7
diff --git a/test/MC/Mips/mips-memory-instructions.s b/test/MC/Mips/mips-memory-instructions.s
new file mode 100644
index 0000000000000..b5f1267ef386c
--- /dev/null
+++ b/test/MC/Mips/mips-memory-instructions.s
@@ -0,0 +1,45 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for loads and stores.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Memory store instructions
+#------------------------------------------------------------------------------
+# CHECK:  sb      $4, 16($5)      # encoding: [0x10,0x00,0xa4,0xa0]
+# CHECK:  sc      $4, 16($5)      # encoding: [0x10,0x00,0xa4,0xe0]
+# CHECK:  sh      $4, 16($5)      # encoding: [0x10,0x00,0xa4,0xa4]
+# CHECK:  sw      $4, 16($5)      # encoding: [0x10,0x00,0xa4,0xac]
+# CHECK:  sw      $7,  0($5)      # encoding: [0x00,0x00,0xa7,0xac]
+# CHECK:  swc1    $f2, 16($5)     # encoding: [0x10,0x00,0xa2,0xe4]
+# CHECK:  swl     $4, 16($5)      # encoding: [0x10,0x00,0xa4,0xa8]
+     sb   $4, 16($5)
+     sc   $4, 16($5)
+     sh   $4, 16($5)
+     sw   $4, 16($5)
+     sw   $7,   ($5)
+     swc1 $f2, 16($5)
+     swl  $4, 16($5)
+
+#------------------------------------------------------------------------------
+# Memory load instructions
+#------------------------------------------------------------------------------
+
+# CHECK:  lb  $4, 4($5)       # encoding: [0x04,0x00,0xa4,0x80]
+# CHECK:  lw  $4, 4($5)       # encoding: [0x04,0x00,0xa4,0x8c]
+# CHECK:  lbu $4, 4($5)       # encoding: [0x04,0x00,0xa4,0x90]
+# CHECK:  lh  $4, 4($5)       # encoding: [0x04,0x00,0xa4,0x84]
+# CHECK:  lhu $4, 4($5)       # encoding: [0x04,0x00,0xa4,0x94]
+# CHECK:  ll  $4, 4($5)       # encoding: [0x04,0x00,0xa4,0xc0]
+# CHECK:  lw  $4, 4($5)       # encoding: [0x04,0x00,0xa4,0x8c]
+# CHECK:  lw  $7, 0($7)       # encoding: [0x00,0x00,0xe7,0x8c]
+# CHECK:  lw  $2, 16($sp)     # encoding: [0x10,0x00,0xa2,0x8f]
+
+      lb      $4, 4($5)
+      lw      $4, 4($5)
+      lbu     $4, 4($5)
+      lh      $4, 4($5)
+      lhu     $4, 4($5)
+      ll      $4, 4($5)
+      lw      $4, 4($5)
+      lw      $7,    ($7)
+      lw      $2, 16($sp)
diff --git a/test/MC/Mips/mips-register-names.s b/test/MC/Mips/mips-register-names.s
new file mode 100644
index 0000000000000..26187ce58875e
--- /dev/null
+++ b/test/MC/Mips/mips-register-names.s
@@ -0,0 +1,71 @@
+# RUN: llvm-mc %s -triple=mips-unknown-freebsd -show-encoding | FileCheck %s
+
+# Check that the register names are mapped to their correct numbers for o32
+# Second byte of addiu with $zero at rs contains the number of the target
+# register (rt).
+
+# CHECK: encoding: [0x24,0x00,0x00,0x00]
+# CHECK: encoding: [0x24,0x01,0x00,0x00]
+# CHECK: encoding: [0x24,0x02,0x00,0x00]
+# CHECK: encoding: [0x24,0x03,0x00,0x00]
+# CHECK: encoding: [0x24,0x04,0x00,0x00]
+# CHECK: encoding: [0x24,0x05,0x00,0x00]
+# CHECK: encoding: [0x24,0x06,0x00,0x00]
+# CHECK: encoding: [0x24,0x07,0x00,0x00]
+# CHECK: encoding: [0x24,0x08,0x00,0x00]
+# CHECK: encoding: [0x24,0x09,0x00,0x00]
+# CHECK: encoding: [0x24,0x0a,0x00,0x00]
+# CHECK: encoding: [0x24,0x0b,0x00,0x00]
+# CHECK: encoding: [0x24,0x0c,0x00,0x00]
+# CHECK: encoding: [0x24,0x0d,0x00,0x00]
+# CHECK: encoding: [0x24,0x0e,0x00,0x00]
+# CHECK: encoding: [0x24,0x0f,0x00,0x00]
+# CHECK: encoding: [0x24,0x10,0x00,0x00]
+# CHECK: encoding: [0x24,0x11,0x00,0x00]
+# CHECK: encoding: [0x24,0x12,0x00,0x00]
+# CHECK: encoding: [0x24,0x13,0x00,0x00]
+# CHECK: encoding: [0x24,0x14,0x00,0x00]
+# CHECK: encoding: [0x24,0x15,0x00,0x00]
+# CHECK: encoding: [0x24,0x16,0x00,0x00]
+# CHECK: encoding: [0x24,0x17,0x00,0x00]
+# CHECK: encoding: [0x24,0x18,0x00,0x00]
+# CHECK: encoding: [0x24,0x19,0x00,0x00]
+# CHECK: encoding: [0x24,0x1a,0x00,0x00]
+# CHECK: encoding: [0x24,0x1b,0x00,0x00]
+# CHECK: encoding: [0x24,0x1c,0x00,0x00]
+# CHECK: encoding: [0x24,0x1d,0x00,0x00]
+# CHECK: encoding: [0x24,0x1e,0x00,0x00]
+# CHECK: encoding: [0x24,0x1f,0x00,0x00]
+addiu	$zero, $zero, 0
+addiu	$at, $zero, 0
+addiu	$v0, $zero, 0
+addiu	$v1, $zero, 0
+addiu	$a0, $zero, 0
+addiu	$a1, $zero, 0
+addiu	$a2, $zero, 0
+addiu	$a3, $zero, 0
+addiu	$t0, $zero, 0
+addiu	$t1, $zero, 0
+addiu	$t2, $zero, 0
+addiu	$t3, $zero, 0
+addiu	$t4, $zero, 0
+addiu	$t5, $zero, 0
+addiu	$t6, $zero, 0
+addiu	$t7, $zero, 0
+addiu	$s0, $zero, 0
+addiu	$s1, $zero, 0
+addiu	$s2, $zero, 0
+addiu	$s3, $zero, 0
+addiu	$s4, $zero, 0
+addiu	$s5, $zero, 0
+addiu	$s6, $zero, 0
+addiu	$s7, $zero, 0
+addiu	$t8, $zero, 0
+addiu	$t9, $zero, 0
+addiu	$k0, $zero, 0
+addiu	$k1, $zero, 0
+addiu	$gp, $zero, 0
+addiu	$sp, $zero, 0
+addiu	$fp, $zero, 0
+addiu	$sp, $zero, 0
+addiu	$ra, $zero, 0
diff --git a/test/MC/Mips/mips-relocations.s b/test/MC/Mips/mips-relocations.s
new file mode 100644
index 0000000000000..ff71c7559cd01
--- /dev/null
+++ b/test/MC/Mips/mips-relocations.s
@@ -0,0 +1,41 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for relocations.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK:  lui   $2, %hi(_gp_disp)     # encoding: [A,A,0x02,0x3c]
+# CHECK:                              #   fixup A - offset: 0, value: _gp_disp@ABS_HI, kind: fixup_Mips_HI16
+# CHECK:  addiu $2, $2, %lo(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK:                              #   fixup A - offset: 0, value: _gp_disp@ABS_LO, kind: fixup_Mips_LO16
+# CHECK:  lw    $25, %call16(strchr)($gp)   # encoding: [A,A,0x99,0x8f]
+# CHECK:                                    #   fixup A - offset: 0, value: strchr@GOT_CALL, kind: fixup_Mips_CALL16
+# CHECK:  lw      $3, %got(loop_1)($2)    # encoding: [A,A,0x43,0x8c]
+# CHECK:                                  #   fixup A - offset: 0, value: loop_1@GOT, kind: fixup_Mips_GOT_Local
+# CHECK:  lui     $2, %dtprel_hi(_gp_disp) # encoding: [A,A,0x02,0x3c]
+# CHECK:                                        #   fixup A - offset: 0, value: _gp_disp@DTPREL_HI, kind: fixup_Mips_DTPREL_HI
+# CHECK:  addiu   $2, $2, %dtprel_hi(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK:                                  #   fixup A - offset: 0, value: _gp_disp@DTPREL_HI, kind: fixup_Mips_DTPREL_HI
+# CHECK:  lw      $3, %got(loop_1)($2)      # encoding: [A,A,0x43,0x8c]
+# CHECK:                                    #   fixup A - offset: 0, value: loop_1@GOT, kind: fixup_Mips_GOT_Local
+# CHECK:  lw      $4, %got_disp(loop_2)($3) # encoding: [A,A,0x64,0x8c]
+# CHECK:                                    #   fixup A - offset: 0, value: loop_2@GOT_DISP, kind: fixup_Mips_GOT_DISP
+# CHECK:  lw      $5, %got_page(loop_3)($4) # encoding: [A,A,0x85,0x8c]
+# CHECK:                                    #   fixup A - offset: 0, value: loop_3@GOT_PAGE, kind: fixup_Mips_GOT_PAGE
+# CHECK:  lw      $6, %got_ofst(loop_4)($5) # encoding: [A,A,0xa6,0x8c]
+# CHECK:                                    #   fixup A - offset: 0, value: loop_4@GOT_OFST, kind: fixup_Mips_GOT_OFST
+# CHECK:  lui     $2, %tprel_hi(_gp_disp)   # encoding: [A,A,0x02,0x3c]
+# CHECK:                                    #   fixup A - offset: 0, value: _gp_disp@TPREL_HI, kind: fixup_Mips_TPREL_HI
+# CHECK:  addiu   $2, $2, %tprel_lo(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK:                                      #   fixup A - offset: 0, value: _gp_disp@TPREL_LO, kind: fixup_Mips_TPREL_LO
+
+    lui	$2, %hi(_gp_disp)
+	  addiu	$2, $2, %lo(_gp_disp)
+    lw	$25, %call16(strchr)($gp)
+    lw      $3, %got(loop_1)($2)
+    lui	$2, %dtprel_hi(_gp_disp)
+	  addiu	$2, $2, %dtprel_hi(_gp_disp)
+    lw	$3, %got(loop_1)($2)
+    lw	$4, %got_disp(loop_2)($3)
+    lw	$5, %got_page(loop_3)($4)
+    lw	$6, %got_ofst(loop_4)($5)
+    lui	$2, %tprel_hi(_gp_disp)
+	  addiu	$2, $2, %tprel_lo(_gp_disp)
diff --git a/test/MC/Mips/mips64-register-names.s b/test/MC/Mips/mips64-register-names.s
new file mode 100644
index 0000000000000..16783ee1a68ca
--- /dev/null
+++ b/test/MC/Mips/mips64-register-names.s
@@ -0,0 +1,70 @@
+# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding | FileCheck %s
+
+# Check that the register names are mapped to their correct numbers for n64
+# Second byte of daddiu with $zero at rs contains the number of the target
+# register (rt).
+
+# CHECK: encoding: [0x64,0x00,0x00,0x00]
+# CHECK: encoding: [0x64,0x01,0x00,0x00]
+# CHECK: encoding: [0x64,0x02,0x00,0x00]
+# CHECK: encoding: [0x64,0x03,0x00,0x00]
+# CHECK: encoding: [0x64,0x04,0x00,0x00]
+# CHECK: encoding: [0x64,0x05,0x00,0x00]
+# CHECK: encoding: [0x64,0x06,0x00,0x00]
+# CHECK: encoding: [0x64,0x07,0x00,0x00]
+# CHECK: encoding: [0x64,0x08,0x00,0x00]
+# CHECK: encoding: [0x64,0x09,0x00,0x00]
+# CHECK: encoding: [0x64,0x0a,0x00,0x00]
+# CHECK: encoding: [0x64,0x0b,0x00,0x00]
+# CHECK: encoding: [0x64,0x0c,0x00,0x00]
+# CHECK: encoding: [0x64,0x0d,0x00,0x00]
+# CHECK: encoding: [0x64,0x0e,0x00,0x00]
+# CHECK: encoding: [0x64,0x0f,0x00,0x00]
+# CHECK: encoding: [0x64,0x10,0x00,0x00]
+# CHECK: encoding: [0x64,0x11,0x00,0x00]
+# CHECK: encoding: [0x64,0x12,0x00,0x00]
+# CHECK: encoding: [0x64,0x13,0x00,0x00]
+# CHECK: encoding: [0x64,0x14,0x00,0x00]
+# CHECK: encoding: [0x64,0x15,0x00,0x00]
+# CHECK: encoding: [0x64,0x16,0x00,0x00]
+# CHECK: encoding: [0x64,0x17,0x00,0x00]
+# CHECK: encoding: [0x64,0x18,0x00,0x00]
+# CHECK: encoding: [0x64,0x19,0x00,0x00]
+# CHECK: encoding: [0x64,0x1a,0x00,0x00]
+# CHECK: encoding: [0x64,0x1b,0x00,0x00]
+# CHECK: encoding: [0x64,0x1c,0x00,0x00]
+# CHECK: encoding: [0x64,0x1d,0x00,0x00]
+# CHECK: encoding: [0x64,0x1e,0x00,0x00]
+# CHECK: encoding: [0x64,0x1f,0x00,0x00]
+daddiu	$zero, $zero, 0
+daddiu	$at, $zero, 0
+daddiu	$v0, $zero, 0
+daddiu	$v1, $zero, 0
+daddiu	$a0, $zero, 0
+daddiu	$a1, $zero, 0
+daddiu	$a2, $zero, 0
+daddiu	$a3, $zero, 0
+daddiu	$a4, $zero, 0
+daddiu	$a5, $zero, 0
+daddiu	$a6, $zero, 0
+daddiu	$a7, $zero, 0
+daddiu	$t4, $zero, 0
+daddiu	$t5, $zero, 0
+daddiu	$t6, $zero, 0
+daddiu	$t7, $zero, 0
+daddiu	$s0, $zero, 0
+daddiu	$s1, $zero, 0
+daddiu	$s2, $zero, 0
+daddiu	$s3, $zero, 0
+daddiu	$s4, $zero, 0
+daddiu	$s5, $zero, 0
+daddiu	$s6, $zero, 0
+daddiu	$s7, $zero, 0
+daddiu	$t8, $zero, 0
+daddiu	$t9, $zero, 0
+daddiu	$kt0, $zero, 0
+daddiu	$kt1, $zero, 0
+daddiu	$gp, $zero, 0
+daddiu	$sp, $zero, 0
+daddiu	$s8, $zero, 0
+daddiu	$ra, $zero, 0
diff --git a/test/MC/Mips/mips64extins.ll b/test/MC/Mips/mips64extins.ll
new file mode 100644
index 0000000000000..ebe8f86513fd9
--- /dev/null
+++ b/test/MC/Mips/mips64extins.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -mattr=n64 %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 - \
+; RUN: | FileCheck %s
+
+define i64 @dext(i64 %i) nounwind readnone {
+entry:
+; CHECK: dext ${{[0-9]+}}, ${{[0-9]+}}, 5, 10
+  %shr = lshr i64 %i, 5
+  %and = and i64 %shr, 1023
+  ret i64 %and
+}
+
+define i64 @dextu(i64 %i) nounwind readnone {
+entry:
+; CHECK: dextu ${{[0-9]+}}, ${{[0-9]+}}, 2, 6
+  %shr = lshr i64 %i, 34
+  %and = and i64 %shr, 63
+  ret i64 %and
+}
+
+define i64 @dextm(i64 %i) nounwind readnone {
+entry:
+; CHECK: dextm ${{[0-9]+}}, ${{[0-9]+}}, 5, 2
+  %shr = lshr i64 %i, 5
+  %and = and i64 %shr, 17179869183
+  ret i64 %and
+}
+
+define i64 @dins(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dins ${{[0-9]+}}, ${{[0-9]+}}, 8, 10
+  %shl2 = shl i64 %j, 8
+  %and = and i64 %shl2, 261888
+  %and3 = and i64 %i, -261889
+  %or = or i64 %and3, %and
+  ret i64 %or
+}
+
+define i64 @dinsm(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dinsm ${{[0-9]+}}, ${{[0-9]+}}, 10, 1
+  %shl4 = shl i64 %j, 10
+  %and = and i64 %shl4, 8796093021184
+  %and5 = and i64 %i, -8796093021185
+  %or = or i64 %and5, %and
+  ret i64 %or
+}
+
+define i64 @dinsu(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dinsu ${{[0-9]+}}, ${{[0-9]+}}, 8, 13
+  %shl4 = shl i64 %j, 40
+  %and = and i64 %shl4, 9006099743113216
+  %and5 = and i64 %i, -9006099743113217
+  %or = or i64 %and5, %and
+  ret i64 %or
+}
diff --git a/test/MC/Mips/mips64shift.ll b/test/MC/Mips/mips64shift.ll
index 7817b96fa594c..99cac7b591fac 100644
--- a/test/MC/Mips/mips64shift.ll
+++ b/test/MC/Mips/mips64shift.ll
@@ -1,5 +1,8 @@
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -disable-mips-delay-filler %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el - | FileCheck %s 
 
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el - | FileCheck %s 
 
 define i64 @f3(i64 %a0) nounwind readnone {
 entry:
diff --git a/test/MC/Mips/mips_directives.s b/test/MC/Mips/mips_directives.s
new file mode 100644
index 0000000000000..e2f75a827d0a1
--- /dev/null
+++ b/test/MC/Mips/mips_directives.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc -triple mips-unknown-unknown %s
+# This test produces no output, so there is no FileCheck call.
+$BB0_2:
+  .ent directives_test
+	.frame	$sp,0,$ra
+	.mask 	0x00000000,0
+	.fmask	0x00000000,0
+	.set	noreorder
+	.set	nomacro
+	.set	noat
+$JTI0_0:
+	.gpword	($BB0_2)
+	.set  at=$12
+	.set macro
+	.set reorder
+	.end directives_test
diff --git a/test/MC/Mips/multi-64bit-func.ll b/test/MC/Mips/multi-64bit-func.ll
index 6e0d784e07f64..83577aa1628bd 100644
--- a/test/MC/Mips/multi-64bit-func.ll
+++ b/test/MC/Mips/multi-64bit-func.ll
@@ -1,8 +1,8 @@
 ; There is no real check here. If the test doesn't 
 ; assert it passes.
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s 
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -disable-mips-delay-filler < %s 
 ; Run it again without extra nop in delay slot
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -enable-mips-delay-filler < %s 
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s 
 
 define i32 @bosco1(i32 %x) nounwind readnone {
 entry:
diff --git a/test/MC/Mips/sext_64_32.ll b/test/MC/Mips/sext_64_32.ll
index e5c57b8c41d8c..9e0cfa01fdfca 100644
--- a/test/MC/Mips/sext_64_32.ll
+++ b/test/MC/Mips/sext_64_32.ll
@@ -2,7 +2,7 @@
 
 ; Sign extend from 32 to 64 was creating nonsense opcodes
 
-; CHECK: sll ${{[0-9]+}}, ${{[0-9]+}}, 0
+; CHECK: sll ${{[a-z0-9]+}}, ${{[a-z0-9]+}}, 0
 
 define i64 @foo(i32 %ival) nounwind readnone {
 entry:
@@ -10,7 +10,7 @@ entry:
   ret i64 %conv
 }
 
-; CHECK: dsll32 ${{[0-9]+}}, ${{[0-9]+}}, 0
+; CHECK: dsll32 ${{[a-z0-9]+}}, ${{[a-z0-9]+}}, 0
 
 define i64 @foo_2(i32 %ival_2) nounwind readnone {
 entry:
diff --git a/test/MC/PowerPC/lit.local.cfg b/test/MC/PowerPC/lit.local.cfg
new file mode 100644
index 0000000000000..88488cdd048e3
--- /dev/null
+++ b/test/MC/PowerPC/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll', '.c', '.cpp', '.s']
+
+targets = set(config.root.targets_to_build.split())
+if not 'PowerPC' in targets:
+    config.unsupported = True
diff --git a/test/MC/PowerPC/ppc64-initial-cfa.ll b/test/MC/PowerPC/ppc64-initial-cfa.ll
new file mode 100644
index 0000000000000..3936cf2e81e57
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-initial-cfa.ll
@@ -0,0 +1,41 @@
+;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj %s -o - | \
+;; RUN: elf-dump --dump-section-data | FileCheck %s
+
+;; FIXME: this file should be in .s form, change when asm parser is available.
+
+define void @f() {
+entry:
+  ret void
+}
+
+;; CHECK:      ('sh_name', 0x{{.*}}) # '.eh_frame'
+;; CHECK-NEXT: ('sh_type', 0x00000001)
+;; CHECK-NEXT: ('sh_flags', 0x0000000000000002)
+;; CHECK-NEXT: ('sh_addr', 0x{{.*}})
+;; CHECK-NEXT: ('sh_offset', 0x{{.*}})
+;; CHECK-NEXT: ('sh_size', 0x0000000000000030)
+;; CHECK-NEXT: ('sh_link', 0x00000000)
+;; CHECK-NEXT: ('sh_info', 0x00000000)
+;; CHECK-NEXT: ('sh_addralign', 0x0000000000000008)
+;; CHECK-NEXT: ('sh_entsize', 0x0000000000000000)
+;; CHECK-NEXT: ('_section_data', '00000010 00000000 017a5200 01784101 000c0100 00000018 00000018 00000000 00000000 00000000 00000010 00000000')
+
+;; CHECK:      ('sh_name', 0x{{.*}}) # '.rela.eh_frame'
+;; CHECK-NEXT: ('sh_type', 0x00000004)
+;; CHECK-NEXT: ('sh_flags', 0x0000000000000000)
+;; CHECK-NEXT: ('sh_addr', 0x{{.*}})
+;; CHECK-NEXT: ('sh_offset', 0x{{.*}})
+;; CHECK-NEXT: ('sh_size', 0x0000000000000018)
+;; CHECK-NEXT: ('sh_link', 0x{{.*}})
+;; CHECK-NEXT: ('sh_info', 0x{{.*}})
+;; CHECK-NEXT: ('sh_addralign', 0x0000000000000008)
+;; CHECK-NEXT: ('sh_entsize', 0x0000000000000018)
+;; CHECK-NEXT: ('_relocations', [
+;; CHECK-NEXT:  # Relocation 0
+;; CHECK-NEXT:  (('r_offset', 0x000000000000001c)
+;; CHECK-NEXT:   ('r_sym', 0x{{.*}})
+;; CHECK-NEXT:   ('r_type', 0x00000026)
+;; CHECK-NEXT:   ('r_addend', 0x0000000000000000)
+;; CHECK-NEXT:  ),
+;; CHECK-NEXT: ])
+
diff --git a/test/MC/PowerPC/ppc64-relocs-01.ll b/test/MC/PowerPC/ppc64-relocs-01.ll
new file mode 100644
index 0000000000000..5996af84f4488
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-relocs-01.ll
@@ -0,0 +1,66 @@
+;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -O3  \
+;; RUN:  -filetype=obj %s -o - | \
+;; RUN: elf-dump --dump-section-data | FileCheck %s
+
+;; FIXME: this file needs to be in .s form; change when the asm parser is done.
+
+@number64 = global i64 10, align 8
+
+define i64 @access_int64(i64 %a) nounwind readonly {
+entry:
+  %0 = load i64* @number64, align 8
+  %cmp = icmp eq i64 %0, %a
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+declare double @sin(double) nounwind
+
+define double @test_branch24 (double %x) nounwind readonly {
+entry:
+  %add = call double @sin(double %x) nounwind
+  ret double %add
+}
+
+;; The relocations in .rela.text are the 'number64' load using a
+;; R_PPC64_TOC16_DS against the .toc and the 'sin' external function
+;; address using a R_PPC64_REL24
+;; CHECK:       '.rela.text'
+;; CHECK:       Relocation 0
+;; CHECK-NEXT:  'r_offset',
+;; CHECK-NEXT:  'r_sym', 0x00000006
+;; CHECK-NEXT:  'r_type', 0x0000003f
+;; CHECK:       Relocation 1
+;; CHECK-NEXT:  'r_offset',
+;; CHECK-NEXT:  'r_sym', 0x0000000a
+;; CHECK-NEXT:  'r_type', 0x0000000a
+
+;; The .opd entry for the 'access_int64' function creates 2 relocations:
+;; 1. A R_PPC64_ADDR64 against the .text segment plus addend (the function
+;;    address itself);
+;; 2. And a R_PPC64_TOC against no symbol (the linker will replace it with the
+;;    module's TOC base).
+;; CHECK:       '.rela.opd'
+;; CHECK:       Relocation 0
+;; CHECK-NEXT:  'r_offset',
+;; CHECK-NEXT:  'r_sym', 0x00000002
+;; CHECK-NEXT:  'r_type', 0x00000026
+;; CHECK:       Relocation 1
+;; CHECK-NEXT:  'r_offset',
+;; CHECK-NEXT:  'r_sym', 0x00000000
+;; CHECK-NEXT:  'r_type', 0x00000033
+
+;; Finally the TOC creates the relocation for the 'number64'.
+;; CHECK:       '.rela.toc'
+;; CHECK:       Relocation 0
+;; CHECK-NEXT:  'r_offset',
+;; CHECK-NEXT:  'r_sym', 0x00000008
+;; CHECK-NEXT:  'r_type', 0x00000026
+
+;; Check if the relocation references are for correct symbols.
+;; CHECK:       Symbol 7
+;; CHECK-NEXT:  'access_int64'
+;; CHECK:       Symbol 8
+;; CHECK-NEXT:  'number64'
+;; CHECK:       Symbol 10
+;; CHECK-NEXT:  'sin'
diff --git a/test/MC/PowerPC/ppc64-tls-relocs-01.ll b/test/MC/PowerPC/ppc64-tls-relocs-01.ll
new file mode 100644
index 0000000000000..5e37311075229
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-tls-relocs-01.ll
@@ -0,0 +1,28 @@
+;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj %s -o - | \
+;; RUN: elf-dump --dump-section-data | FileCheck %s
+
+;; FIXME: this file should be in .s form, change when asm parser is available.
+
+@t = thread_local global i32 0, align 4
+
+define i32* @f() nounwind {
+entry:
+  ret i32* @t
+}
+
+;; Check for a pair of R_PPC64_TPREL16_HA / R_PPC64_TPREL16_LO relocs
+;; against the thread-local symbol 't'.
+;; CHECK:       '.rela.text'
+;; CHECK:       Relocation 0
+;; CHECK-NEXT:  'r_offset',
+;; CHECK-NEXT:  'r_sym', 0x00000008
+;; CHECK-NEXT:  'r_type', 0x00000048
+;; CHECK:       Relocation 1
+;; CHECK-NEXT:  'r_offset',
+;; CHECK-NEXT:  'r_sym', 0x00000008
+;; CHECK-NEXT:  'r_type', 0x00000046
+
+;; Check that we got the correct symbol.
+;; CHECK:       Symbol 8
+;; CHECK-NEXT:  't'
+
diff --git a/test/MC/X86/intel-syntax-2.s b/test/MC/X86/intel-syntax-2.s
index ca4afc3173980..d6dbe152cd580 100644
--- a/test/MC/X86/intel-syntax-2.s
+++ b/test/MC/X86/intel-syntax-2.s
@@ -1,7 +1,9 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown  %s | FileCheck %s
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=att %s | FileCheck %s
 
 	.intel_syntax
 _test:
 // CHECK:	movl	$257, -4(%rsp)
 	mov	DWORD PTR [RSP - 4], 257
-
+    .att_syntax
+// CHECK:	movl	$257, -4(%rsp)
+    movl $257, -4(%rsp)
diff --git a/test/MC/X86/x86-32-ms-inline-asm.s b/test/MC/X86/x86-32-ms-inline-asm.s
new file mode 100644
index 0000000000000..73d5878b41bc0
--- /dev/null
+++ b/test/MC/X86/x86-32-ms-inline-asm.s
@@ -0,0 +1,60 @@
+// RUN: llvm-mc -x86-asm-syntax=intel -triple i386-unknown-unknown --show-encoding %s | FileCheck %s
+
+mov eax, [ebx].0
+mov [ebx].4, ecx
+
+// CHECK: movl (%ebx), %eax
+// CHECK: encoding: [0x8b,0x03]
+// CHECK: movl %ecx, 4(%ebx)
+// CHECK: encoding: [0x89,0x4b,0x04]
+        
+_t21:                                   ## @t21
+// CHECK: t21
+	mov eax, [4*eax + 4]
+// CHECK: movl 4(,%eax,4), %eax
+// CHECK: # encoding: [0x8b,0x04,0x85,0x04,0x00,0x00,0x00]
+    mov eax, [4*eax][4]
+// CHECK: movl 4(,%eax,4), %eax
+// CHECK: # encoding: [0x8b,0x04,0x85,0x04,0x00,0x00,0x00]
+        
+	mov eax, [esi + eax]
+// CHECK: movl (%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x04,0x06]
+	mov eax, [esi][eax]
+// CHECK: movl (%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x04,0x06]
+        
+	mov eax, [esi + 4*eax]
+// CHECK: movl (%esi,%eax,4), %eax
+// CHECK: # encoding: [0x8b,0x04,0x86]
+	mov eax, [esi][4*eax]
+// CHECK: movl (%esi,%eax,4), %eax
+// CHECK: # encoding: [0x8b,0x04,0x86]
+
+    mov eax, [esi + eax + 4]
+// CHECK: movl 4(%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x44,0x06,0x04]
+	mov eax, [esi][eax + 4]
+// CHECK: movl 4(%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x44,0x06,0x04]
+	mov eax, [esi + eax][4]
+// CHECK: movl 4(%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x44,0x06,0x04]
+	mov eax, [esi][eax][4]
+// CHECK: movl 4(%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x44,0x06,0x04]
+
+	mov eax, [esi + 2*eax + 4]
+// CHECK: movl 4(%esi,%eax,2), %eax
+// CHECK: # encoding: [0x8b,0x44,0x46,0x04]
+	mov eax, [esi][2*eax + 4]
+// CHECK: movl 4(%esi,%eax,2), %eax
+// CHECK: # encoding: [0x8b,0x44,0x46,0x04]
+	mov eax, [esi + 2*eax][4]
+// CHECK: movl 4(%esi,%eax,2), %eax
+// CHECK: # encoding: [0x8b,0x44,0x46,0x04]
+	mov eax, [esi][2*eax][4]
+// CHECK: movl 4(%esi,%eax,2), %eax
+// CHECK: # encoding: [0x8b,0x44,0x46,0x04]
+
+	ret
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 6a2d5bba6b704..03cb62e7cba3e 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -1164,6 +1164,10 @@ xsetbv // CHECK: xsetbv # encoding: [0x0f,0x01,0xd1]
 // CHECK: encoding: [0x66,0x48,0x0f,0x6e,0xc7]
 	movd %rdi,%xmm0
 
+// CHECK: movd  %xmm0, %rax
+// CHECK: encoding: [0x66,0x48,0x0f,0x7e,0xc0]
+        movd  %xmm0, %rax
+
 // CHECK: movntil %eax, (%rdi)
 // CHECK: encoding: [0x0f,0xc3,0x07]
 // CHECK: movntil
diff --git a/test/MC/X86/x86_64-rtm-encoding.s b/test/MC/X86/x86_64-rtm-encoding.s
new file mode 100644
index 0000000000000..44d6bacb7f32f
--- /dev/null
+++ b/test/MC/X86/x86_64-rtm-encoding.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: xbegin .L0
+// CHECK: encoding: [0xc7,0xf8,A,A,A,A]
+	xbegin .L0
+
+// CHECK: xend
+// CHECK: encoding: [0x0f,0x01,0xd5]
+	xend
+
+// CHECK: xabort
+// CHECK: encoding: [0xc6,0xf8,0x0d]
+	xabort $13
diff --git a/test/MC/X86/x86_nop.s b/test/MC/X86/x86_nop.s
new file mode 100644
index 0000000000000..396e3022ebec0
--- /dev/null
+++ b/test/MC/X86/x86_nop.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=generic %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i386 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i486 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i586 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=pentium %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=pentium-mmx %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=geode %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i686 %s | llvm-objdump -d - | not FileCheck %s
+
+# CHECK-NOT: nop{{[lw]}}
+inc %eax
+.align 8
+inc %eax
diff --git a/test/Makefile b/test/Makefile
index 9ddfabfb9a8e7..810fdded465a6 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -29,11 +29,6 @@ else
 LIT_ARGS := -s -v
 endif
 
-# -jN causes crash on Cygwin's python.
-ifneq (,$(filter $(HOST_OS),Cygwin))
-  LIT_ARGS += -j1
-endif
-
 ifdef TESTSUITE
 LIT_TESTSUITE := $(TESTSUITE)
 CLEANED_TESTSUITE := $(patsubst %/,%,$(TESTSUITE))
@@ -122,6 +117,16 @@ else
 ENABLE_ASSERTIONS=1
 endif
 
+# Derive whether or not LTO is enabled by checking the extra options.
+LTO_IS_ENABLED := 0
+ifneq ($(findstring -flto,$(CompileCommonOpts)),)
+LTO_IS_ENABLED := 1
+else
+ifneq ($(findstring -O4,$(CompileCommonOpts)),)
+LTO_IS_ENABLED := 1
+endif
+endif
+
 lit.site.cfg: FORCE
 	@echo "Making LLVM 'lit.site.cfg' file..."
 	@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g > lit.tmp
@@ -131,9 +136,10 @@ lit.site.cfg: FORCE
 	@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp
 	@$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp
 	@$(ECHOPATH) s=@PYTHON_EXECUTABLE@=python=g >> lit.tmp
-	@$(ECHOPATH) s,@OCAMLOPT@,$(OCAMLOPT) -cc \\\\\"$(CXX_FOR_OCAMLOPT)\\\\\" -I $(LibDir)/ocaml,g >> lit.tmp
+	@$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -I $(LibDir)/ocaml=g >> lit.tmp
 	@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
 	@$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp
+	@$(ECHOPATH) s=@LTO_IS_ENABLED@=$(LTO_IS_ENABLED)=g >> lit.tmp
 	@$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp
 	@$(ECHOPATH) s=@LLVM_BINDINGS@=$(BINDINGS_TO_BUILD)=g >> lit.tmp
 	@$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> lit.tmp
diff --git a/test/Object/Inputs/dext-test.elf-mips64r2 b/test/Object/Inputs/dext-test.elf-mips64r2
new file mode 100644
index 0000000000000..59dbaef69a2d6
--- /dev/null
+++ b/test/Object/Inputs/dext-test.elf-mips64r2
diff --git a/test/Object/Inputs/relocations.elf-x86-64 b/test/Object/Inputs/relocations.elf-x86-64
new file mode 100644
index 0000000000000..6e340c7525430
--- /dev/null
+++ b/test/Object/Inputs/relocations.elf-x86-64
diff --git a/test/Object/Mips/feature.test b/test/Object/Mips/feature.test
new file mode 100644
index 0000000000000..e8da609746035
--- /dev/null
+++ b/test/Object/Mips/feature.test
@@ -0,0 +1,11 @@
+RUN: llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 %p/../Inputs/dext-test.elf-mips64r2 \
+RUN: | FileCheck %s
+
+CHECK: Disassembly of section .text:
+CHECK: .text:
+CHECK:        0:	08 00 e0 03                                  	jr	$ra
+CHECK:        4:	43 49 82 7c                                  	dext $2, $4, 5, 10
+CHECK:        8:	08 00 e0 03                                  	jr	$ra
+CHECK:        c:	83 28 82 7c                                  	dext $2, $4, 2, 6
+CHECK:       10:	08 00 e0 03                                  	jr	$ra
+CHECK:       14:	43 09 82 7c                                  	dext $2, $4, 5, 2
diff --git a/test/Object/Mips/lit.local.cfg b/test/Object/Mips/lit.local.cfg
new file mode 100644
index 0000000000000..149931749822b
--- /dev/null
+++ b/test/Object/Mips/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.test']
+
+targets = set(config.root.targets_to_build.split())
+if not 'Mips' in targets:
+    config.unsupported = True
diff --git a/test/Object/nm-shared-object.test b/test/Object/nm-shared-object.test
index b361df5355534..a57b9401ad42d 100644
--- a/test/Object/nm-shared-object.test
+++ b/test/Object/nm-shared-object.test
@@ -1,15 +1,23 @@
 RUN: llvm-nm -D %p/Inputs/shared-object-test.elf-i386 \
-RUN:         | FileCheck %s -check-prefix ELF
+RUN:         | FileCheck %s -check-prefix ELF-32
 RUN: llvm-nm -D %p/Inputs/shared-object-test.elf-x86-64 \
-RUN:         | FileCheck %s -check-prefix ELF
+RUN:         | FileCheck %s -check-prefix ELF-64
 
 ; Note: tls_sym should be 'D' (not '?'), but TLS is not
 ; yet recognized by ObjectFile.
 
-ELF: {{[0-9a-f]+}} A __bss_start
-ELF: {{[0-9a-f]+}} A _edata
-ELF: {{[0-9a-f]+}} A _end
-ELF: {{[0-9a-f]+}} B common_sym
-ELF: {{[0-9a-f]+}} D defined_sym
-ELF: {{[0-9a-f]+}} T global_func
-ELF:               ? tls_sym
+ELF-32: 0012c8 A __bss_start
+ELF-32: 0012c8 A _edata
+ELF-32: 0012cc A _end
+ELF-32: 0012c8 B common_sym
+ELF-32: 0012c4 D defined_sym
+ELF-32: 0001f0 T global_func
+ELF-32:        ? tls_sym
+
+ELF-64: 200454 A __bss_start
+ELF-64: 200454 A _edata
+ELF-64: 200458 A _end
+ELF-64: 200454 B common_sym
+ELF-64: 200450 D defined_sym
+ELF-64: 0002f0 T global_func
+ELF-64:        ? tls_sym
diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test
index a394a23a7e05a..6d35a2651d7a8 100644
--- a/test/Object/objdump-relocations.test
+++ b/test/Object/objdump-relocations.test
@@ -9,6 +9,9 @@ RUN:              | FileCheck %s -check-prefix ELF-x86-64
 RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-hexagon \
 RUN:              | FileCheck %s -check-prefix ELF-hexagon
 
+RUN: llvm-objdump -r %p/Inputs/relocations.elf-x86-64 \
+RUN:              | FileCheck %s -check-prefix ELF-complex-x86-64
+
 COFF-i386: .text
 COFF-i386: IMAGE_REL_I386_DIR32 L_.str
 COFF-i386: IMAGE_REL_I386_REL32 _puts
@@ -36,3 +39,13 @@ ELF-hexagon: R_HEX_HI16 puts
 ELF-hexagon: R_HEX_LO16 puts
 ELF-hexagon: R_HEX_B15_PCREL testf
 ELF-hexagon: R_HEX_B22_PCREL puts
+
+ELF-complex-x86-64: .text
+ELF-complex-x86-64-NEXT: R_X86_64_8 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_16 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_32S .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_64 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_PC32 .data-4-P
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data+0
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data+4
diff --git a/test/Object/objdump-symbol-table.test b/test/Object/objdump-symbol-table.test
index 989ec04a8ddc3..c94b077735508 100644
--- a/test/Object/objdump-symbol-table.test
+++ b/test/Object/objdump-symbol-table.test
@@ -4,6 +4,8 @@ RUN: llvm-objdump -t %p/Inputs/trivial-object-test.elf-i386 \
 RUN:              | FileCheck %s -check-prefix ELF-i386
 RUN: llvm-objdump -t %p/Inputs/trivial-object-test.macho-i386 \
 RUN:              | FileCheck %s -check-prefix macho-i386
+RUN: llvm-objdump -t %p/Inputs/shared-object-test.elf-i386 \
+RUN:              | FileCheck %s -check-prefix ELF-shared
 
 COFF-i386: file format
 COFF-i386: SYMBOL TABLE:
@@ -31,3 +33,9 @@ macho-i386: SYMBOL TABLE:
 macho-i386: 00000000 g     F __TEXT,__text  00000024 _main
 macho-i386: 00000000         *UND*  00000000 _SomeOtherFunction
 macho-i386: 00000000         *UND*  00000000 _puts
+
+ELF-shared: shared-object-test.elf-i386:     file format
+ELF-shared: SYMBOL TABLE:
+ELF-shared: 00000200 l     F .text 00000003 local_func
+ELF-shared: 000012c4 g       .data 00000004 defined_sym
+ELF-shared: 000001f0 g     F .text 00000003 global_func
diff --git a/test/Other/FileCheck-space.txt b/test/Other/FileCheck-space.txt
new file mode 100644
index 0000000000000..6bbe5bc05ba75
--- /dev/null
+++ b/test/Other/FileCheck-space.txt
@@ -0,0 +1,9 @@
+RUN: printf "a\nb" | FileCheck %s -check-prefix=TEST1
+RUN: echo oo | FileCheck %s -check-prefix=TEST2
+
+Check that CHECK-NEXT without a space after the colon works.
+TEST1:a
+TEST1-NEXT:b
+
+Check that CHECK-NOT without a space after the colon works.
+TEST2-NOT:foo
diff --git a/test/Other/Inputs/llvm-cov.gcda b/test/Other/Inputs/llvm-cov.gcda
new file mode 100644
index 0000000000000..9ae2286ea2f4b
--- /dev/null
+++ b/test/Other/Inputs/llvm-cov.gcda
diff --git a/test/Other/Inputs/llvm-cov.gcno b/test/Other/Inputs/llvm-cov.gcno
new file mode 100644
index 0000000000000..25e202386a897
--- /dev/null
+++ b/test/Other/Inputs/llvm-cov.gcno
diff --git a/test/Other/ResponseFile.ll b/test/Other/ResponseFile.ll
new file mode 100644
index 0000000000000..b8b3d0a902332
--- /dev/null
+++ b/test/Other/ResponseFile.ll
@@ -0,0 +1,9 @@
+; RUN: echo %s > %t.list
+; RUN: llvm-as @%t.list -o %t.bc
+; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
+
+; CHECK: T foobar
+
+define void @foobar() {
+  ret void
+}
diff --git a/test/Other/extract-alias.ll b/test/Other/extract-alias.ll
new file mode 100644
index 0000000000000..d5bab4b3f36b3
--- /dev/null
+++ b/test/Other/extract-alias.ll
@@ -0,0 +1,49 @@
+; RUN: llvm-extract -func foo -S < %s | FileCheck %s
+; RUN: llvm-extract -delete -func foo -S < %s | FileCheck --check-prefix=DELETE %s
+; RUN: llvm-extract -alias zeda0 -S < %s | FileCheck --check-prefix=ALIAS %s
+; RUN: llvm-extract -ralias .*bar -S < %s | FileCheck --check-prefix=ALIASRE %s
+
+; Both aliases should be converted to declarations
+; CHECK:      @zeda0 = external global i32
+; CHECK:      define i32* @foo() {
+; CHECK-NEXT:  call void @a0bar()
+; CHECK-NEXT:  ret i32* @zeda0
+; CHECK-NEXT: }
+; CHECK:      declare void @a0bar()
+
+; DELETE:      @zed = global i32 0
+; DELETE:      @zeda0 = alias i32* @zed
+; DELETE-NEXT: @a0foo = alias i32* ()* @foo
+; DELETE-NEXT: @a0a0bar = alias void ()* @a0bar
+; DELETE-NEXT: @a0bar = alias void ()* @bar
+; DELETE:      declare i32* @foo()
+; DELETE:      define void @bar() {
+; DELETE-NEXT:  %c = call i32* @foo()
+; DELETE-NEXT:  ret void
+; DELETE-NEXT: }
+
+; ALIAS: @zed = external global i32
+; ALIAS: @zeda0 = alias i32* @zed
+
+; ALIASRE: @a0a0bar = alias void ()* @a0bar
+; ALIASRE: @a0bar = alias void ()* @bar
+; ALIASRE: declare void @bar()
+
+@zed = global i32 0
+@zeda0 = alias i32* @zed
+
+@a0foo = alias i32* ()* @foo
+
+define i32* @foo() {
+  call void @a0bar()
+  ret i32* @zeda0
+}
+
+@a0a0bar = alias void ()* @a0bar
+
+@a0bar = alias void ()* @bar
+
+define void @bar() {
+  %c = call i32* @foo()
+  ret void
+}
diff --git a/test/Other/extract-weak-odr.ll b/test/Other/extract-weak-odr.ll
new file mode 100644
index 0000000000000..6618f58436451
--- /dev/null
+++ b/test/Other/extract-weak-odr.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-extract -func foo -S < %s | FileCheck %s
+; RUN: llvm-extract -delete -func foo -S < %s | FileCheck --check-prefix=DELETE %s
+
+; Test that we don't convert weak_odr to external definitions.
+
+; CHECK:      @bar = external global i32
+; CHECK:      define weak_odr i32* @foo() {
+; CHECK-NEXT:  ret i32* @bar
+; CHECK-NEXT: }
+
+; DELETE: @bar = weak_odr global i32 42
+; DELETE: declare i32* @foo()
+
+@bar = weak_odr global i32 42
+
+define weak_odr i32*  @foo() {
+  ret i32* @bar
+}
+
+define void @g() {
+  %c = call i32* @foo()
+  ret void
+}
diff --git a/test/Other/extract.ll b/test/Other/extract.ll
index 57573ed76f9a9..8b0c835d57466 100644
--- a/test/Other/extract.ll
+++ b/test/Other/extract.ll
@@ -7,18 +7,19 @@
 ; llvm-extract uses lazy bitcode loading, so make sure it correctly reads
 ; from bitcode files in addition to assembly files.
 
-; CHECK: define void @foo() {
+; CHECK: define hidden void @foo() {
 ; CHECK:   ret void
 ; CHECK: }
 
-; The linkonce_odr linkage for foo() should be changed to external linkage.
-; DELETE: declare void @foo()
+; The private linkage for foo() should be changed to external linkage and
+; hidden visibility added.
+; DELETE: declare hidden void @foo()
 ; DELETE: define void @bar() {
 ; DELETE:   call void @foo()
 ; DELETE:   ret void
 ; DELETE: }
 
-define linkonce_odr void @foo() {
+define private void @foo() {
   ret void
 }
 define void @bar() {
diff --git a/test/Other/link-opts.ll b/test/Other/link-opts.ll
new file mode 100644
index 0000000000000..8e58ac8a56837
--- /dev/null
+++ b/test/Other/link-opts.ll
@@ -0,0 +1,13 @@
+;RUN: opt -S -std-link-opts < %s | FileCheck %s
+; Simple test to check that -std-link-opts keeps only the main function.
+
+; CHECK-NOT: define
+; CHECK: define void @main
+; CHECK-NOT: define
+define void @main() {
+  ret void
+}
+
+define void @foo() {
+  ret void
+}
diff --git a/test/Other/lint.ll b/test/Other/lint.ll
index c84f56f8f694d..78bbbe9e6fa6e 100644
--- a/test/Other/lint.ll
+++ b/test/Other/lint.ll
@@ -9,8 +9,11 @@ declare void @has_noaliases(i32* noalias %p, i32* %q)
 declare void @one_arg(i32)
 
 @CG = constant i32 7
+@E = external global i8
 
 define i32 @foo() noreturn {
+  %buf = alloca i8
+  %buf2 = alloca {i8, i8}, align 2
 ; CHECK: Caller and callee calling convention differ
   call void @bar()
 ; CHECK: Null pointer dereference
@@ -26,8 +29,10 @@ define i32 @foo() noreturn {
 ; CHECK: Address one pointer dereference
   store i32 0, i32* inttoptr (i64 1 to i32*)
 ; CHECK: Memory reference address is misaligned
-  %x = inttoptr i32 1 to i32*
-  load i32* %x, align 4
+  store i8 0, i8* %buf, align 2
+; CHECK: Memory reference address is misaligned
+  %gep = getelementptr {i8, i8}* %buf2, i32 0, i32 1
+  store i8 0, i8* %gep, align 2
 ; CHECK: Division by zero
   %sd = sdiv i32 2, 0
 ; CHECK: Division by zero
@@ -75,6 +80,18 @@ define i32 @foo() noreturn {
 ; CHECK: Write to read-only memory
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (i32* @CG to i8*), i8* bitcast (i32* @CG to i8*), i64 1, i32 1, i1 0)
 
+; CHECK: Undefined behavior: Buffer overflow
+  %wider = bitcast i8* %buf to i16*
+  store i16 0, i16* %wider
+; CHECK: Undefined behavior: Buffer overflow
+  %inner = getelementptr {i8, i8}* %buf2, i32 0, i32 1
+  %wider2 = bitcast i8* %inner to i16*
+  store i16 0, i16* %wider2
+; CHECK: Undefined behavior: Buffer overflow
+  %before = getelementptr i8* %buf, i32 -1
+  %wider3 = bitcast i8* %before to i16*
+  store i16 0, i16* %wider3
+
   br label %next
 
 next:
@@ -84,6 +101,10 @@ next:
   ret i32 0
 
 foo:
+; CHECK-NOT: Undefined behavior: Buffer overflow
+; CHECK-NOT: Memory reference address is misaligned
+  %e = bitcast i8* @E to i64*
+  store i64 0, i64* %e
   %z = add i32 0, 0
 ; CHECK: unreachable immediately preceded by instruction without side effects
   unreachable
diff --git a/test/Other/lit.local.cfg b/test/Other/lit.local.cfg
index 19eebc0ac7ac3..269307724232d 100644
--- a/test/Other/lit.local.cfg
+++ b/test/Other/lit.local.cfg
@@ -1 +1 @@
-config.suffixes = ['.ll', '.c', '.cpp']
+config.suffixes = ['.ll', '.c', '.cpp', '.txt']
diff --git a/test/Other/llvm-cov.test b/test/Other/llvm-cov.test
new file mode 100644
index 0000000000000..c0aa203e2c17a
--- /dev/null
+++ b/test/Other/llvm-cov.test
@@ -0,0 +1,3 @@
+PR11760
+RUN: llvm-cov -gcda=%S/Inputs/llvm-cov.gcda -gcno=%S/Inputs/llvm-cov.gcno
+
diff --git a/test/Other/llvm-nm-without-aliases.ll b/test/Other/llvm-nm-without-aliases.ll
new file mode 100644
index 0000000000000..9d9408c13b6df
--- /dev/null
+++ b/test/Other/llvm-nm-without-aliases.ll
@@ -0,0 +1,25 @@
+; RUN: llvm-as < %s > %t
+; RUN: llvm-nm -without-aliases < %t | FileCheck %s
+; RUN: llvm-nm < %t | FileCheck --check-prefix=WITH %s
+
+; CHECK-NOT: T a0bar
+; CHECK-NOT: T a0foo
+; CHECK: T bar
+; CHECK: T foo
+
+; WITH: T a0bar
+; WITH: T a0foo
+; WITH: T bar
+; WITH: T foo
+
+@a0foo = alias void ()* @foo
+
+define void @foo() {
+  ret void
+}
+
+@a0bar = alias void ()* @bar
+
+define void @bar() {
+  ret void
+}
diff --git a/test/Other/spir_cc.ll b/test/Other/spir_cc.ll
new file mode 100644
index 0000000000000..ffc02945de4d2
--- /dev/null
+++ b/test/Other/spir_cc.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | llvm-dis > %t1.ll
+; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
+; RUN: diff %t1.ll %t2.ll
+
+define spir_func void @foo() {
+        ret void
+}
+
+define spir_kernel void @bar() {
+        call spir_func void @foo( )
+        call spir_kernel void @bar( )
+        ret void
+}
diff --git a/test/TableGen/if.td b/test/TableGen/if.td
index 18de368af9f13..1d8d62329ae3d 100644
--- a/test/TableGen/if.td
+++ b/test/TableGen/if.td
@@ -3,15 +3,59 @@
 
 // Support for an `!if' operator as part of a `let' statement.
 // CHECK:      class C
-// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, ?, ?, ?, !if({ C:x{2} }, 0, 1), !if({ C:x{2} }, 1, 1), !if({ C:x{2} }, 0, 0), !if({ C:x{1} }, C:y{3}, 0), !if({ C:x{1} }, C:y{2}, 1), !if({ C:x{0} }, C:y{3}, C:z), !if({ C:x{0} }, C:y{2}, C:y{2}), !if({ C:x{0} }, C:y{1}, C:y{1}), !if({ C:x{0} }, C:y{0}, C:y{0}) };
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, !if({ C:y{3} }, 1, !if({ C:y{2} }, { C:x{0} }, !if({ C:y{1} }, { C:x{1} }, !if({ C:y{0} }, { C:x{2} }, ?)))){0}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){1}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){0}, !if({ C:x{2} }, 2, 6){2}, !if({ C:x{2} }, 2, 6){1}, !if({ C:x{2} }, 2, 6){0}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){1}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){0}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){3}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){2}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){1}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){0} };
 class C<bits<3> x, bits<4> y, bit z> {
   bits<16> n;
 
+  let n{11}  = !if(y{3}, 1,
+               !if(y{2}, x{0},
+               !if(y{1}, x{1},
+               !if(y{0}, x{2}, ?))));
+  let n{10-9}= !if(x{2}, y{3-2},
+               !if(x{1}, y{2-1},
+               !if(x{0}, y{1-0}, ?)));
   let n{8-6} = !if(x{2}, 0b010, 0b110);
   let n{5-4} = !if(x{1}, y{3-2}, {0, 1});
   let n{3-0} = !if(x{0}, y{3-0}, {z, y{2}, y{1}, y{0}});
 }
 
+def C1 : C<{1, 0, 1}, {0, 1, 0, 1}, 0>;
+def C2 : C<{0, 1, 0}, {1, 0, 1, 0}, 1>;
+def C3 : C<{0, 0, 0}, {1, 0, 1, 0}, 0>;
+def C4 : C<{0, 0, 0}, {0, 0, 0, 0}, 0>;
+
+// CHECK: def C1
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 };
+// CHECK: def C2
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0 };
+// CHECK: def C3
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, ?, ?, 1, 1, 0, 0, 1, 0, 0, 1, 0 };
+// CHECK: def C4
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, ?, ?, ?, 1, 1, 0, 0, 1, 0, 0, 0, 0 };
+
+class S<int s> {
+  bits<2> val = !if(!eq(s, 8),  {0, 0},
+                !if(!eq(s, 16), 0b01,
+                !if(!eq(s, 32), 2,
+                !if(!eq(s, 64), {1, 1}, ?))));
+}
+
+def D8  : S<8>;
+def D16 : S<16>;
+def D32 : S<32>;
+def D64 : S<64>;
+def D128: S<128>;
+// CHECK: def D128
+// CHECK-NEXT: bits<2> val = { ?, ? };
+// CHECK: def D16
+// CHECK-NEXT: bits<2> val = { 0, 1 };
+// CHECK: def D32
+// CHECK-NEXT: bits<2> val = { 1, 0 };
+// CHECK: def D64
+// CHECK-NEXT: bits<2> val = { 1, 1 };
+// CHECK: def D8
+// CHECK-NEXT: bits<2> val = { 0, 0 };
+
 // CHECK:      def One
 // CHECK-NEXT: list<int> first = [1, 2, 3];
 // CHECK-NEXT: list<int> rest = [1, 2, 3];
diff --git a/test/TableGen/list-element-bitref.td b/test/TableGen/list-element-bitref.td
new file mode 100644
index 0000000000000..5f3e3dabf4d4a
--- /dev/null
+++ b/test/TableGen/list-element-bitref.td
@@ -0,0 +1,15 @@
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+class C<list<bits<8>> L> {
+  bits<2> V0 = L[0]{1-0};
+  bits<2> V1 = L[1]{3-2};
+  string V2 = !if(L[0]{0}, "Odd", "Even");
+}
+
+def c0 : C<[0b0101, 0b1010]>;
+
+// CHECK: def c0
+// CHECK-NEXT: bits<2> V0 = { 0, 1 };
+// CHECK-NEXT: bits<2> V1 = { 1, 0 };
+// CHECK-NEXT: string V2 = "Odd";
diff --git a/test/TableGen/pr8330.td b/test/TableGen/pr8330.td
new file mode 100644
index 0000000000000..7779b635e33cc
--- /dev/null
+++ b/test/TableGen/pr8330.td
@@ -0,0 +1,29 @@
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+class Or4<bits<8> Val> {
+  bits<8> V = {Val{7}, Val{6}, Val{5}, Val{4}, Val{3}, 1, Val{1}, Val{0} };
+}
+
+class Whatev<bits<8> x>;
+
+class Whatever<bits<8> x> {
+  bits<8> W = {x{0}, x{1}, x{2}, x{3}, x{4}, x{5}, x{6}, x{7} };
+}
+
+multiclass X<bits<8> BaseOpc> {
+ def bar : Whatev<Or4<BaseOpc>.V >;
+}
+
+multiclass Y<bits<8> BaseOpc> {
+ def foo : Whatever<Or4<BaseOpc>.V >;
+}
+
+defm a : X<4>;
+
+// CHECK: def abar
+
+defm b : Y<8>;
+
+// CHECK: def bfoo
+// CHECK-NEXT: bits<8> W = { 0, 0, 1, 1, 0, 0, 0, 0 };
diff --git a/test/Transforms/BBVectorize/X86/cmp-types.ll b/test/Transforms/BBVectorize/X86/cmp-types.ll
new file mode 100644
index 0000000000000..a4fcbb6048f58
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/cmp-types.ll
@@ -0,0 +1,16 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%"struct.btSoftBody" = type { float, float, float*, i8 }
+
+define void @test1(%"struct.btSoftBody"* %n1, %"struct.btSoftBody"* %n2) uwtable align 2 {
+entry:
+  %tobool15 = icmp ne %"struct.btSoftBody"* %n1, null
+  %cond16 = zext i1 %tobool15 to i32
+  %tobool21 = icmp ne %"struct.btSoftBody"* %n2, null
+  %cond22 = zext i1 %tobool21 to i32
+  ret void
+; CHECK: @test1
+}
+
diff --git a/test/Transforms/BBVectorize/X86/loop1.ll b/test/Transforms/BBVectorize/X86/loop1.ll
new file mode 100644
index 0000000000000..493f23b098539
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/loop1.ll
@@ -0,0 +1,53 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
+; The second check covers the use of alias analysis (with loop unrolling).
+
+define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
+entry:
+  br label %for.body
+; CHECK: @test1
+; CHECK-UNRL: @test1
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+  %1 = load double* %arrayidx2, align 8
+  %mul = fmul double %0, %0
+  %mul3 = fmul double %0, %1
+  %add = fadd double %mul, %mul3
+  %add4 = fadd double %1, %1
+  %add5 = fadd double %add4, %0
+  %mul6 = fmul double %0, %add5
+  %add7 = fadd double %add, %mul6
+  %mul8 = fmul double %1, %1
+  %add9 = fadd double %0, %0
+  %add10 = fadd double %add9, %0
+  %mul11 = fmul double %mul8, %add10
+  %add12 = fadd double %add7, %mul11
+  %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+  store double %add12, double* %arrayidx14, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10
+  br i1 %exitcond, label %for.end, label %for.body
+; CHECK-NOT: <2 x double>
+; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
+; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
+; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
+; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
+; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
+; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
+; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
+; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
+; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
+; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
+; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
+; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/BBVectorize/X86/sh-rec.ll b/test/Transforms/BBVectorize/X86/sh-rec.ll
new file mode 100644
index 0000000000000..1e0492c2a8c22
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec.ll
@@ -0,0 +1,54 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+define void @ptoa() nounwind uwtable {
+entry:
+  %call = call i8* @malloc() nounwind
+  br i1 undef, label %return, label %if.end10
+
+if.end10:                                         ; preds = %entry
+  %incdec.ptr = getelementptr inbounds i8* %call, i64 undef
+  %call17 = call i32 @ptou() nounwind
+  %incdec.ptr26.1 = getelementptr inbounds i8* %incdec.ptr, i64 -2
+  store i8 undef, i8* %incdec.ptr26.1, align 1
+  %div27.1 = udiv i32 %call17, 100
+  %rem.2 = urem i32 %div27.1, 10
+  %add2230.2 = or i32 %rem.2, 48
+  %conv25.2 = trunc i32 %add2230.2 to i8
+  %incdec.ptr26.2 = getelementptr inbounds i8* %incdec.ptr, i64 -3
+  store i8 %conv25.2, i8* %incdec.ptr26.2, align 1
+  %incdec.ptr26.3 = getelementptr inbounds i8* %incdec.ptr, i64 -4
+  store i8 undef, i8* %incdec.ptr26.3, align 1
+  %div27.3 = udiv i32 %call17, 10000
+  %rem.4 = urem i32 %div27.3, 10
+  %add2230.4 = or i32 %rem.4, 48
+  %conv25.4 = trunc i32 %add2230.4 to i8
+  %incdec.ptr26.4 = getelementptr inbounds i8* %incdec.ptr, i64 -5
+  store i8 %conv25.4, i8* %incdec.ptr26.4, align 1
+  %div27.4 = udiv i32 %call17, 100000
+  %rem.5 = urem i32 %div27.4, 10
+  %add2230.5 = or i32 %rem.5, 48
+  %conv25.5 = trunc i32 %add2230.5 to i8
+  %incdec.ptr26.5 = getelementptr inbounds i8* %incdec.ptr, i64 -6
+  store i8 %conv25.5, i8* %incdec.ptr26.5, align 1
+  %incdec.ptr26.6 = getelementptr inbounds i8* %incdec.ptr, i64 -7
+  store i8 0, i8* %incdec.ptr26.6, align 1
+  %incdec.ptr26.7 = getelementptr inbounds i8* %incdec.ptr, i64 -8
+  store i8 undef, i8* %incdec.ptr26.7, align 1
+  %div27.7 = udiv i32 %call17, 100000000
+  %rem.8 = urem i32 %div27.7, 10
+  %add2230.8 = or i32 %rem.8, 48
+  %conv25.8 = trunc i32 %add2230.8 to i8
+  %incdec.ptr26.8 = getelementptr inbounds i8* %incdec.ptr, i64 -9
+  store i8 %conv25.8, i8* %incdec.ptr26.8, align 1
+  unreachable
+
+return:                                           ; preds = %entry
+  ret void
+; CHECK: @ptoa
+}
+
+declare noalias i8* @malloc() nounwind
+
+declare i32 @ptou()
diff --git a/test/Transforms/BBVectorize/X86/sh-rec2.ll b/test/Transforms/BBVectorize/X86/sh-rec2.ll
new file mode 100644
index 0000000000000..ef2239932fa1a
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec2.ll
@@ -0,0 +1,85 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 }
+
+define void @gsm_encode(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i8* %c) nounwind uwtable {
+entry:
+  %xmc = alloca [52 x i16], align 16
+  %arraydecay5 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 0
+  call void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i16* undef, i16* null, i16* undef, i16* undef, i16* undef, i16* %arraydecay5) nounwind
+  %incdec.ptr136 = getelementptr inbounds i8* %c, i64 10
+  %incdec.ptr157 = getelementptr inbounds i8* %c, i64 11
+  store i8 0, i8* %incdec.ptr136, align 1
+  %arrayidx162 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 11
+  %0 = load i16* %arrayidx162, align 2
+  %conv1631 = trunc i16 %0 to i8
+  %and164 = shl i8 %conv1631, 3
+  %shl165 = and i8 %and164, 56
+  %incdec.ptr172 = getelementptr inbounds i8* %c, i64 12
+  store i8 %shl165, i8* %incdec.ptr157, align 1
+  %1 = load i16* inttoptr (i64 2 to i16*), align 2
+  %conv1742 = trunc i16 %1 to i8
+  %and175 = shl i8 %conv1742, 1
+  %incdec.ptr183 = getelementptr inbounds i8* %c, i64 13
+  store i8 %and175, i8* %incdec.ptr172, align 1
+  %incdec.ptr199 = getelementptr inbounds i8* %c, i64 14
+  store i8 0, i8* %incdec.ptr183, align 1
+  %arrayidx214 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 15
+  %incdec.ptr220 = getelementptr inbounds i8* %c, i64 15
+  store i8 0, i8* %incdec.ptr199, align 1
+  %2 = load i16* %arrayidx214, align 2
+  %conv2223 = trunc i16 %2 to i8
+  %and223 = shl i8 %conv2223, 6
+  %incdec.ptr235 = getelementptr inbounds i8* %c, i64 16
+  store i8 %and223, i8* %incdec.ptr220, align 1
+  %arrayidx240 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 19
+  %3 = load i16* %arrayidx240, align 2
+  %conv2414 = trunc i16 %3 to i8
+  %and242 = shl i8 %conv2414, 2
+  %shl243 = and i8 %and242, 28
+  %incdec.ptr251 = getelementptr inbounds i8* %c, i64 17
+  store i8 %shl243, i8* %incdec.ptr235, align 1
+  %incdec.ptr272 = getelementptr inbounds i8* %c, i64 18
+  store i8 0, i8* %incdec.ptr251, align 1
+  %arrayidx282 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 25
+  %4 = load i16* %arrayidx282, align 2
+  %conv2835 = trunc i16 %4 to i8
+  %and284 = and i8 %conv2835, 7
+  %incdec.ptr287 = getelementptr inbounds i8* %c, i64 19
+  store i8 %and284, i8* %incdec.ptr272, align 1
+  %incdec.ptr298 = getelementptr inbounds i8* %c, i64 20
+  store i8 0, i8* %incdec.ptr287, align 1
+  %incdec.ptr314 = getelementptr inbounds i8* %c, i64 21
+  store i8 0, i8* %incdec.ptr298, align 1
+  %arrayidx319 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 26
+  %5 = load i16* %arrayidx319, align 4
+  %conv3206 = trunc i16 %5 to i8
+  %and321 = shl i8 %conv3206, 4
+  %shl322 = and i8 %and321, 112
+  %incdec.ptr335 = getelementptr inbounds i8* %c, i64 22
+  store i8 %shl322, i8* %incdec.ptr314, align 1
+  %arrayidx340 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 29
+  %6 = load i16* %arrayidx340, align 2
+  %conv3417 = trunc i16 %6 to i8
+  %and342 = shl i8 %conv3417, 3
+  %shl343 = and i8 %and342, 56
+  %incdec.ptr350 = getelementptr inbounds i8* %c, i64 23
+  store i8 %shl343, i8* %incdec.ptr335, align 1
+  %incdec.ptr366 = getelementptr inbounds i8* %c, i64 24
+  store i8 0, i8* %incdec.ptr350, align 1
+  %arrayidx381 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 36
+  %incdec.ptr387 = getelementptr inbounds i8* %c, i64 25
+  store i8 0, i8* %incdec.ptr366, align 1
+  %7 = load i16* %arrayidx381, align 8
+  %conv3898 = trunc i16 %7 to i8
+  %and390 = shl i8 %conv3898, 6
+  store i8 %and390, i8* %incdec.ptr387, align 1
+  unreachable
+; CHECK: @gsm_encode
+}
+
+declare void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352*, i16*, i16*, i16*, i16*, i16*, i16*, i16*)
+
+declare void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/BBVectorize/X86/sh-rec3.ll b/test/Transforms/BBVectorize/X86/sh-rec3.ll
new file mode 100644
index 0000000000000..fd2cc8bdd91c3
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec3.ll
@@ -0,0 +1,170 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 }
+
+define void @gsm_encode(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i8* %c) nounwind uwtable {
+entry:
+  %LARc28 = alloca [2 x i64], align 16
+  %LARc28.sub = getelementptr inbounds [2 x i64]* %LARc28, i64 0, i64 0
+  %tmpcast = bitcast [2 x i64]* %LARc28 to [8 x i16]*
+  %Nc = alloca [4 x i16], align 2
+  %Mc = alloca [4 x i16], align 2
+  %bc = alloca [4 x i16], align 2
+  %xmc = alloca [52 x i16], align 16
+  %arraydecay = bitcast [2 x i64]* %LARc28 to i16*
+  %arraydecay1 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 0
+  %arraydecay2 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 0
+  %arraydecay3 = getelementptr inbounds [4 x i16]* %Mc, i64 0, i64 0
+  %arraydecay5 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 0
+  call void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i16* %arraydecay, i16* %arraydecay1, i16* %arraydecay2, i16* %arraydecay3, i16* undef, i16* %arraydecay5) nounwind
+  %0 = load i64* %LARc28.sub, align 16
+  %1 = trunc i64 %0 to i32
+  %conv1 = lshr i32 %1, 2
+  %and = and i32 %conv1, 15
+  %or = or i32 %and, 208
+  %conv6 = trunc i32 %or to i8
+  %incdec.ptr = getelementptr inbounds i8* %c, i64 1
+  store i8 %conv6, i8* %c, align 1
+  %conv84 = trunc i64 %0 to i8
+  %and9 = shl i8 %conv84, 6
+  %incdec.ptr15 = getelementptr inbounds i8* %c, i64 2
+  store i8 %and9, i8* %incdec.ptr, align 1
+  %2 = lshr i64 %0, 50
+  %shr226.tr = trunc i64 %2 to i8
+  %conv25 = and i8 %shr226.tr, 7
+  %incdec.ptr26 = getelementptr inbounds i8* %c, i64 3
+  store i8 %conv25, i8* %incdec.ptr15, align 1
+  %incdec.ptr42 = getelementptr inbounds i8* %c, i64 4
+  store i8 0, i8* %incdec.ptr26, align 1
+  %arrayidx52 = getelementptr inbounds [8 x i16]* %tmpcast, i64 0, i64 7
+  %3 = load i16* %arrayidx52, align 2
+  %conv537 = trunc i16 %3 to i8
+  %and54 = and i8 %conv537, 7
+  %incdec.ptr57 = getelementptr inbounds i8* %c, i64 5
+  store i8 %and54, i8* %incdec.ptr42, align 1
+  %incdec.ptr68 = getelementptr inbounds i8* %c, i64 6
+  store i8 0, i8* %incdec.ptr57, align 1
+  %4 = load i16* %arraydecay3, align 2
+  %conv748 = trunc i16 %4 to i8
+  %and75 = shl i8 %conv748, 5
+  %shl76 = and i8 %and75, 96
+  %incdec.ptr84 = getelementptr inbounds i8* %c, i64 7
+  store i8 %shl76, i8* %incdec.ptr68, align 1
+  %arrayidx94 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 1
+  %5 = load i16* %arrayidx94, align 2
+  %conv959 = trunc i16 %5 to i8
+  %and96 = shl i8 %conv959, 1
+  %shl97 = and i8 %and96, 14
+  %or103 = or i8 %shl97, 1
+  %incdec.ptr105 = getelementptr inbounds i8* %c, i64 8
+  store i8 %or103, i8* %incdec.ptr84, align 1
+  %arrayidx115 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 4
+  %6 = bitcast i16* %arrayidx115 to i32*
+  %7 = load i32* %6, align 8
+  %conv11610 = trunc i32 %7 to i8
+  %and117 = and i8 %conv11610, 7
+  %incdec.ptr120 = getelementptr inbounds i8* %c, i64 9
+  store i8 %and117, i8* %incdec.ptr105, align 1
+  %8 = lshr i32 %7, 16
+  %and12330 = shl nuw nsw i32 %8, 5
+  %and123 = trunc i32 %and12330 to i8
+  %incdec.ptr136 = getelementptr inbounds i8* %c, i64 10
+  store i8 %and123, i8* %incdec.ptr120, align 1
+  %incdec.ptr157 = getelementptr inbounds i8* %c, i64 11
+  store i8 0, i8* %incdec.ptr136, align 1
+  %incdec.ptr172 = getelementptr inbounds i8* %c, i64 12
+  store i8 0, i8* %incdec.ptr157, align 1
+  %arrayidx173 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 1
+  %9 = load i16* %arrayidx173, align 2
+  %conv17412 = zext i16 %9 to i32
+  %and175 = shl nuw nsw i32 %conv17412, 1
+  %arrayidx177 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 1
+  %10 = load i16* %arrayidx177, align 2
+  %conv17826 = zext i16 %10 to i32
+  %shr17913 = lshr i32 %conv17826, 1
+  %and180 = and i32 %shr17913, 1
+  %or181 = or i32 %and175, %and180
+  %conv182 = trunc i32 %or181 to i8
+  %incdec.ptr183 = getelementptr inbounds i8* %c, i64 13
+  store i8 %conv182, i8* %incdec.ptr172, align 1
+  %arrayidx188 = getelementptr inbounds [4 x i16]* %Mc, i64 0, i64 1
+  %11 = load i16* %arrayidx188, align 2
+  %conv18914 = trunc i16 %11 to i8
+  %and190 = shl i8 %conv18914, 5
+  %shl191 = and i8 %and190, 96
+  %incdec.ptr199 = getelementptr inbounds i8* %c, i64 14
+  store i8 %shl191, i8* %incdec.ptr183, align 1
+  %arrayidx209 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 14
+  %12 = load i16* %arrayidx209, align 4
+  %conv21015 = trunc i16 %12 to i8
+  %and211 = shl i8 %conv21015, 1
+  %shl212 = and i8 %and211, 14
+  %or218 = or i8 %shl212, 1
+  %incdec.ptr220 = getelementptr inbounds i8* %c, i64 15
+  store i8 %or218, i8* %incdec.ptr199, align 1
+  %arrayidx225 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 16
+  %13 = bitcast i16* %arrayidx225 to i64*
+  %14 = load i64* %13, align 16
+  %conv22616 = trunc i64 %14 to i8
+  %and227 = shl i8 %conv22616, 3
+  %shl228 = and i8 %and227, 56
+  %incdec.ptr235 = getelementptr inbounds i8* %c, i64 16
+  store i8 %shl228, i8* %incdec.ptr220, align 1
+  %15 = lshr i64 %14, 32
+  %and23832 = shl nuw nsw i64 %15, 5
+  %and238 = trunc i64 %and23832 to i8
+  %incdec.ptr251 = getelementptr inbounds i8* %c, i64 17
+  store i8 %and238, i8* %incdec.ptr235, align 1
+  %arrayidx266 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 23
+  %incdec.ptr272 = getelementptr inbounds i8* %c, i64 18
+  store i8 0, i8* %incdec.ptr251, align 1
+  %16 = load i16* %arrayidx266, align 2
+  %conv27418 = trunc i16 %16 to i8
+  %and275 = shl i8 %conv27418, 6
+  %incdec.ptr287 = getelementptr inbounds i8* %c, i64 19
+  store i8 %and275, i8* %incdec.ptr272, align 1
+  %arrayidx288 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 2
+  %17 = load i16* %arrayidx288, align 2
+  %conv28919 = zext i16 %17 to i32
+  %and290 = shl nuw nsw i32 %conv28919, 1
+  %arrayidx292 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 2
+  %18 = load i16* %arrayidx292, align 2
+  %conv29327 = zext i16 %18 to i32
+  %shr29420 = lshr i32 %conv29327, 1
+  %and295 = and i32 %shr29420, 1
+  %or296 = or i32 %and290, %and295
+  %conv297 = trunc i32 %or296 to i8
+  %incdec.ptr298 = getelementptr inbounds i8* %c, i64 20
+  store i8 %conv297, i8* %incdec.ptr287, align 1
+  %conv30021 = trunc i16 %18 to i8
+  %and301 = shl i8 %conv30021, 7
+  %incdec.ptr314 = getelementptr inbounds i8* %c, i64 21
+  store i8 %and301, i8* %incdec.ptr298, align 1
+  %incdec.ptr335 = getelementptr inbounds i8* %c, i64 22
+  store i8 0, i8* %incdec.ptr314, align 1
+  %arrayidx340 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 29
+  %19 = load i16* %arrayidx340, align 2
+  %conv34122 = trunc i16 %19 to i8
+  %and342 = shl i8 %conv34122, 3
+  %shl343 = and i8 %and342, 56
+  %incdec.ptr350 = getelementptr inbounds i8* %c, i64 23
+  store i8 %shl343, i8* %incdec.ptr335, align 1
+  %arrayidx355 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 32
+  %20 = bitcast i16* %arrayidx355 to i32*
+  %21 = load i32* %20, align 16
+  %conv35623 = shl i32 %21, 2
+  %shl358 = and i32 %conv35623, 28
+  %22 = lshr i32 %21, 17
+  %and363 = and i32 %22, 3
+  %or364 = or i32 %shl358, %and363
+  %conv365 = trunc i32 %or364 to i8
+  store i8 %conv365, i8* %incdec.ptr350, align 1
+  unreachable
+; CHECK: @gsm_encode
+}
+
+declare void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565*, i16*, i16*, i16*, i16*, i16*, i16*, i16*)
+
+declare void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/BBVectorize/X86/sh-types.ll b/test/Transforms/BBVectorize/X86/sh-types.ll
new file mode 100644
index 0000000000000..0bcb714d5e652
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-types.ll
@@ -0,0 +1,25 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+define <4 x float> @test7(<4 x float> %A1, <4 x float> %B1, double %C1, double %C2, double %D1, double %D2) {
+        %A2 = shufflevector <4 x float> %A1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+        %B2 = shufflevector <4 x float> %B1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+        %X1 = shufflevector <4 x float> %A2, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+        %X2 = shufflevector <4 x float> %B2, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+        %Y1 = shufflevector <2 x float> %X1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+        %Y2 = shufflevector <2 x float> %X2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+
+	%M1 = fsub double %C1, %D1
+	%M2 = fsub double %C2, %D2
+	%N1 = fmul double %M1, %C1
+	%N2 = fmul double %M2, %C2
+	%Z1 = fadd double %N1, %D1
+	%Z2 = fadd double %N2, %D2
+
+        %R = fmul <4 x float> %Y1, %Y2
+        ret <4 x float> %R
+; CHECK: @test7
+; CHECK-NOT: <8 x float>
+; CHECK: ret <4 x float>
+}
+
diff --git a/test/Transforms/BBVectorize/X86/simple-ldstr.ll b/test/Transforms/BBVectorize/X86/simple-ldstr.ll
new file mode 100644
index 0000000000000..0124399bad9da
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/simple-ldstr.ll
@@ -0,0 +1,29 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Simple 3-pair chain with loads and stores (stores through %c, so the function must not be readonly)
+define void @test1(double* %a, double* %b, double* %c) nounwind uwtable {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+}
+
diff --git a/test/Transforms/BBVectorize/X86/simple.ll b/test/Transforms/BBVectorize/X86/simple.ll
new file mode 100644
index 0000000000000..0113e38bb1c91
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/simple.ll
@@ -0,0 +1,103 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test1
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic chain
+define double @test1a(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%W1 = fadd double %Y1, %Z1
+	%W2 = fadd double %Y2, %Z2
+	%V1 = fadd double %W1, %Z1
+	%V2 = fadd double %W2, %Z2
+	%Q1 = fadd double %W1, %V1
+	%Q2 = fadd double %W2, %V2
+	%S1 = fadd double %W1, %Q1
+	%S2 = fadd double %W2, %Q2
+	%R  = fmul double %S1, %S2
+	ret double %R
+; CHECK: @test1a
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %W1 = fadd <2 x double> %Y1, %Z1
+; CHECK: %V1 = fadd <2 x double> %W1, %Z1
+; CHECK: %Q1 = fadd <2 x double> %W1, %V1
+; CHECK: %S1 = fadd <2 x double> %W1, %Q1
+; CHECK: %S1.v.r1 = extractelement <2 x double> %S1, i32 0
+; CHECK: %S1.v.r2 = extractelement <2 x double> %S1, i32 1
+; CHECK: %R = fmul double %S1.v.r1, %S1.v.r2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair permuted)
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%Z1 = fadd double %Y2, %B1
+	%Z2 = fadd double %Y1, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test2
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic depth-4 chain (internal permutation)
+define double @test4(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%Z1 = fadd double %Y2, %B1
+	%Z2 = fadd double %Y1, %B2
+	%W1 = fadd double %Y2, %Z1
+	%W2 = fadd double %Y1, %Z2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test4
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic chain with shuffles
+define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
+	%X1 = sub <8 x i8> %A1, %B1
+	%X2 = sub <8 x i8> %A2, %B2
+	%Y1 = mul <8 x i8> %X1, %A1
+	%Y2 = mul <8 x i8> %X2, %A2
+	%Z1 = add <8 x i8> %Y1, %B1
+	%Z2 = add <8 x i8> %Y2, %B2
+        %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
+        %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
+	%R  = mul <8 x i8> %Q1, %Q2
+	ret <8 x i8> %R
+; CHECK: @test6
+; CHECK-NOT: sub <16 x i8>
+; CHECK: ret <8 x i8>
+}
+
diff --git a/test/Transforms/BBVectorize/X86/vs-cast.ll b/test/Transforms/BBVectorize/X86/vs-cast.ll
new file mode 100644
index 0000000000000..be3efca925b8f
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/vs-cast.ll
@@ -0,0 +1,12 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+define void @main() nounwind uwtable {
+entry:
+  %0 = bitcast <2 x i64> undef to i128
+  %1 = bitcast <2 x i64> undef to i128
+  ret void
+; CHECK: @main
+}
+
diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll
index 32a91ceee0078..e8e82ce02479f 100644
--- a/test/Transforms/BBVectorize/cycle.ll
+++ b/test/Transforms/BBVectorize/cycle.ll
@@ -107,6 +107,6 @@ done:
   ret void
 ; CHECK: @test1
 ; CHECK: go:
-; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
+; CHECK: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
 ; FIXME: When tree pruning is deterministic, include the entire output.
 }
diff --git a/test/Transforms/BBVectorize/lit.local.cfg b/test/Transforms/BBVectorize/lit.local.cfg
index 19eebc0ac7ac3..a8ad0f1a28b23 100644
--- a/test/Transforms/BBVectorize/lit.local.cfg
+++ b/test/Transforms/BBVectorize/lit.local.cfg
@@ -1 +1,6 @@
 config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())  # whitespace-separated target names
+if 'X86' not in targets:  # mark this directory unsupported when X86 is not built
+    config.unsupported = True
+
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
index bebc91ad91a07..c22ea5852a1bd 100644
--- a/test/Transforms/BBVectorize/loop1.ll
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -42,8 +42,8 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK: %mul = fmul double %0, %0
 ; CHECK: %mul3 = fmul double %0, %1
 ; CHECK: %add = fadd double %mul, %mul3
-; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
 ; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
 ; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
 ; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
 ; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll
index d9945b563077e..aeaf98865bc97 100644
--- a/test/Transforms/BBVectorize/search-limit.ll
+++ b/test/Transforms/BBVectorize/search-limit.ll
@@ -7,8 +7,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK-SL4: @test1
 ; CHECK-SL4-NOT: <2 x double>
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
index 68449771436e6..ae1d63bfd852b 100644
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -17,8 +17,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1,
 	ret double %R
 ; CHECK: @test1
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
 ; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
@@ -43,8 +43,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 	ret double %R
 ; CHECK: @test2
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
 ; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
@@ -68,8 +68,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) {
 	ret double %R
 ; CHECK: @test3
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
 ; CHECK: %Y1 = call <2 x double> @llvm.powi.v2f64(<2 x double> %X1, i32 %P)
diff --git a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
index f992d4154779e..d46f7692b6d36 100644
--- a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
+++ b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
@@ -2,6 +2,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
 
+; FIXME: re-enable this once pointer vectors work properly
+; XFAIL: *
+
 ; Simple 3-pair chain also with loads and stores (using ptrs and gep)
 define double @test1(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly {
 entry:
@@ -79,3 +82,53 @@ entry:
 ; CHECK-AO-NOT: <2 x
 }
 
+; Simple 3-pair chain with loads and stores (using ptrs and gep)
+; using pointer vectors (<2 x i64*>); the aligned-only run must not produce any "<4 x".
+define void @test3(<2 x i64*>* %a, <2 x i64*>* %b, <2 x i64*>* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load <2 x i64*>* %a, align 8
+  %i1 = load <2 x i64*>* %b, align 8
+  %arrayidx3 = getelementptr inbounds <2 x i64*>* %a, i64 1
+  %i3 = load <2 x i64*>* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1
+  %i4 = load <2 x i64*>* %arrayidx4, align 8
+  %j1 = extractelement <2 x i64*> %i1, i32 0
+  %j4 = extractelement <2 x i64*> %i4, i32 0
+  %o1 = load i64* %j1, align 8
+  %o4 = load i64* %j4, align 8
+  %j0 = extractelement <2 x i64*> %i0, i32 0
+  %j3 = extractelement <2 x i64*> %i3, i32 0
+  %ptr0 = getelementptr inbounds i64* %j0, i64 %o1
+  %ptr3 = getelementptr inbounds i64* %j3, i64 %o4
+  %qtr0 = insertelement <2 x i64*> undef, i64* %ptr0, i32 0
+  %rtr0 = insertelement <2 x i64*> %qtr0, i64* %ptr0, i32 1
+  %qtr3 = insertelement <2 x i64*> undef, i64* %ptr3, i32 0
+  %rtr3 = insertelement <2 x i64*> %qtr3, i64* %ptr3, i32 1
+  store <2 x i64*> %rtr0, <2 x i64*>* %c, align 8
+  %arrayidx5 = getelementptr inbounds <2 x i64*>* %c, i64 1
+  store <2 x i64*> %rtr3, <2 x i64*>* %arrayidx5, align 8
+  ret void
+; CHECK: @test3
+; CHECK: %i0.v.i0 = bitcast <2 x i64*>* %a to <4 x i64*>*
+; CHECK: %i1 = load <2 x i64*>* %b, align 8
+; CHECK: %i0 = load <4 x i64*>* %i0.v.i0, align 8
+; CHECK: %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1
+; CHECK: %i4 = load <2 x i64*>* %arrayidx4, align 8
+; CHECK: %j1 = extractelement <2 x i64*> %i1, i32 0
+; CHECK: %j4 = extractelement <2 x i64*> %i4, i32 0
+; CHECK: %o1 = load i64* %j1, align 8
+; CHECK: %o4 = load i64* %j4, align 8
+; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0
+; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1
+; CHECK: %ptr0.v.i0 = shufflevector <4 x i64*> %i0, <4 x i64*> undef, <2 x i32> <i32 0, i32 2>
+; CHECK: %ptr0 = getelementptr inbounds <2 x i64*> %ptr0.v.i0, <2 x i64> %ptr0.v.i1.2
+; CHECK: %rtr0 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> zeroinitializer
+; CHECK: %rtr3 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> <i32 1, i32 1>
+; CHECK: %0 = bitcast <2 x i64*>* %c to <4 x i64*>*
+; CHECK: %1 = shufflevector <2 x i64*> %rtr0, <2 x i64*> %rtr3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: store <4 x i64*> %1, <4 x i64*>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test3
+; CHECK-AO-NOT: <4 x
+}
+
diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll
index a5397eeb1f967..7dd77c933f6dc 100644
--- a/test/Transforms/BBVectorize/simple-ldstr.ll
+++ b/test/Transforms/BBVectorize/simple-ldstr.ll
@@ -94,13 +94,13 @@ entry:
 ; CHECK-AO: @test3
 ; CHECK-AO: %i0 = load double* %a, align 8
 ; CHECK-AO: %i1 = load double* %b, align 8
-; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
-; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
 ; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
 ; CHECK-AO: %i3 = load double* %arrayidx3, align 8
 ; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
 ; CHECK-AO: %i4 = load double* %arrayidx4, align 8
+; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
 ; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
+; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
 ; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
 ; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
 ; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
@@ -108,3 +108,63 @@ entry:
 ; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8
 ; CHECK-AO: ret void
 }
+
+; Simple 3-pair chain with loads and stores (unreachable): %if.then is entered only from the dead branch after 'unreachable', and no run may vectorize it
+define void @test4(i1 %bool, double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  br i1 %bool, label %if.then1, label %if.end
+
+if.then1:
+  unreachable
+  br label %if.then ; placed after the terminator above; this is the only edge into %if.then
+
+if.then:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  br label %if.end
+
+if.end:
+  ret void
+; CHECK: @test4
+; CHECK-NOT: <2 x double>
+; CHECK-AO: @test4
+; CHECK-AO-NOT: <2 x double>
+}
+
+; Simple 3-pair chain with loads and stores; the second element is stored first and the store to %c is only 4-byte aligned
+define void @test5(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  store double %mul, double* %c, align 4
+  ret void
+; CHECK: @test5
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 4
+; CHECK: ret void
+; CHECK-AO: @test5
+; CHECK-AO-NOT: <2 x double>
+}
+
diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll
index 325792a5dca15..15ecb597025aa 100644
--- a/test/Transforms/BBVectorize/simple-sel.ll
+++ b/test/Transforms/BBVectorize/simple-sel.ll
@@ -6,8 +6,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) {
 ; CHECK: @test1
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -33,8 +33,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test2
 ; CHECK-NB: @test2
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll
index 88eb9c90f7ee3..3527ae75b457d 100644
--- a/test/Transforms/BBVectorize/simple.ll
+++ b/test/Transforms/BBVectorize/simple.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define double @test1(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test1
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -29,8 +29,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
 define double @test2(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test2
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -40,12 +40,13 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
 	%Z1 = fadd double %Y2, %B1
 	%Z2 = fadd double %Y1, %B2
-; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+; CHECK: %Z1.v.i1.1 = insertelement <2 x double> undef, double %B2, i32 0
+; CHECK: %Z1.v.i1.2 = insertelement <2 x double> %Z1.v.i1.1, double %B1, i32 1
+; CHECK: %Z2 = fadd <2 x double> %Y1, %Z1.v.i1.2
 	%R  = fmul double %Z1, %Z2
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: %Z2.v.r1 = extractelement <2 x double> %Z2, i32 0
+; CHECK: %Z2.v.r2 = extractelement <2 x double> %Z2, i32 1
+; CHECK: %R = fmul double %Z2.v.r2, %Z2.v.r1
 	ret double %R
 ; CHECK: ret double %R
 }
@@ -54,8 +55,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 define double @test3(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test3
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -79,8 +80,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2) {
 define double @test4(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test4
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -148,4 +149,51 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
 ; CHECK: ret <8 x i8> %R
 }
 
+; Basic depth-3 chain (flipped order: %Z2 is defined before %Z1)
+define double @test7(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test7
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z2 = fadd double %Y2, %B2
+	%Z1 = fadd double %Y1, %B1
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (subclass data: only %X1 carries nsw, and the fused sub is expected without it)
+define i64 @test8(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
+; CHECK: @test8
+; CHECK: %X1.v.i1.1 = insertelement <2 x i64> undef, i64 %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x i64> %X1.v.i1.1, i64 %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x i64> undef, i64 %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x i64> %X1.v.i0.1, i64 %A2, i32 1
+	%X1 = sub nsw i64 %A1, %B1
+	%X2 = sub i64 %A2, %B2
+; CHECK: %X1 = sub <2 x i64> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = mul i64 %X1, %A1
+	%Y2 = mul i64 %X2, %A2
+; CHECK: %Y1 = mul <2 x i64> %X1, %X1.v.i0.2
+	%Z1 = add i64 %Y1, %B1
+	%Z2 = add i64 %Y2, %B2
+; CHECK: %Z1 = add <2 x i64> %Y1, %X1.v.i1.2
+	%R  = mul i64 %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x i64> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x i64> %Z1, i32 1
+; CHECK: %R = mul i64 %Z1.v.r1, %Z1.v.r2
+	ret i64 %R
+; CHECK: ret i64 %R
+}
 
diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll
index 74d80aa187290..6794288a0ef23 100644
--- a/test/Transforms/ConstProp/loads.ll
+++ b/test/Transforms/ConstProp/loads.ll
@@ -1,17 +1,24 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s 
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -default-data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -default-data-layout="E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
 
+; {{ 0xDEADBEEF, 0xBA }, 0xCAFEBABE}
 @g1 = constant {{i32,i8},i32} {{i32,i8} { i32 -559038737, i8 186 }, i32 -889275714 }
 @g2 = constant double 1.0
+; { 0x7B, 0x06B1BFF8 }
 @g3 = constant {i64, i64} { i64 123, i64 112312312 }
 
 ; Simple load
 define i32 @test1() {
   %r = load i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0)
   ret i32 %r
-; CHECK: @test1
-; CHECK: ret i32 -559038737
+
+; 0xDEADBEEF
+; LE: @test1
+; LE: ret i32 -559038737
+
+; 0xDEADBEEF
+; BE: @test1
+; BE: ret i32 -559038737
 }
 
 ; PR3152
@@ -20,8 +27,13 @@ define i16 @test2() {
   %r = load i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*)
   ret i16 %r
 
-; CHECK: @test2
-; CHECK: ret i16 -16657 
+; 0xBEEF
+; LE: @test2
+; LE: ret i16 -16657
+
+; 0xDEAD
+; BE: @test2
+; BE: ret i16 -8531
 }
 
 ; Load of second 16 bits of 32-bit value.
@@ -29,16 +41,27 @@ define i16 @test3() {
   %r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 1)
   ret i16 %r
 
-; CHECK: @test3
-; CHECK: ret i16 -8531
+; 0xDEAD
+; LE: @test3
+; LE: ret i16 -8531
+
+; 0xBEEF
+; BE: @test3
+; BE: ret i16 -16657
 }
 
 ; Load of 8 bit field + tail padding.
 define i16 @test4() {
   %r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 2)
   ret i16 %r
-; CHECK: @test4
-; CHECK: ret i16 186
+
+; 0x00BA
+; LE: @test4
+; LE: ret i16 186
+
+; 0xBA00
+; BE: @test4
+; BE: ret i16 -17920
 }
 
 ; Load of double bits.
@@ -46,8 +69,13 @@ define i64 @test6() {
   %r = load i64* bitcast(double* @g2 to i64*)
   ret i64 %r
 
-; CHECK: @test6
-; CHECK: ret i64 4607182418800017408
+; 0x3FF_0000000000000
+; LE: @test6
+; LE: ret i64 4607182418800017408
+
+; 0x3FF_0000000000000
+; BE: @test6
+; BE: ret i64 4607182418800017408
 }
 
 ; Load of double bits.
@@ -55,8 +83,13 @@ define i16 @test7() {
   %r = load i16* bitcast(double* @g2 to i16*)
   ret i16 %r
 
-; CHECK: @test7
-; CHECK: ret i16 0
+; 0x0000
+; LE: @test7
+; LE: ret i16 0
+
+; 0x3FF0
+; BE: @test7
+; BE: ret i16 16368
 }
 
 ; Double load.
@@ -64,8 +97,11 @@ define double @test8() {
   %r = load double* bitcast({{i32,i8},i32}* @g1 to double*)
   ret double %r
 
-; CHECK: @test8
-; CHECK: ret double 0xBADEADBEEF
+; LE: @test8
+; LE: ret double 0xBADEADBEEF
+
+; BE: @test8
+; BE: ret double 0xDEADBEEFBA000000
 }
 
 
@@ -74,8 +110,13 @@ define i128 @test9() {
   %r = load i128* bitcast({i64, i64}* @g3 to i128*)
   ret i128 %r
 
-; CHECK: @test9
-; CHECK: ret i128 2071796475790618158476296315
+; 0x00000000_06B1BFF8_00000000_0000007B
+; LE: @test9
+; LE: ret i128 2071796475790618158476296315
+
+; 0x00000000_0000007B_00000000_06B1BFF8
+; BE: @test9
+; BE: ret i128 2268949521066387161080
 }
 
 ; vector load.
@@ -83,21 +124,30 @@ define <2 x i64> @test10() {
   %r = load <2 x i64>* bitcast({i64, i64}* @g3 to <2 x i64>*)
   ret <2 x i64> %r
 
-; CHECK: @test10
-; CHECK: ret <2 x i64> <i64 123, i64 112312312>
+; LE: @test10
+; LE: ret <2 x i64> <i64 123, i64 112312312>
+
+; BE: @test10
+; BE: ret <2 x i64> <i64 123, i64 112312312>
 }
 
 
 ; PR5287
+; { 0xA1, 0x08 }
 @g4 = internal constant { i8, i8 } { i8 -95, i8 8 }
 
 define i16 @test11() nounwind {
 entry:
   %a = load i16* bitcast ({ i8, i8 }* @g4 to i16*)
   ret i16 %a
-  
-; CHECK: @test11
-; CHECK: ret i16 2209
+
+; 0x08A1
+; LE: @test11
+; LE: ret i16 2209
+
+; 0xA108
+; BE: @test11
+; BE: ret i16 -24312
 }
 
 
@@ -107,8 +157,14 @@ entry:
 define i16 @test12() {
   %a = load i16* getelementptr inbounds ([3 x i16]* bitcast ([6 x i8]* @test12g to [3 x i16]*), i32 0, i64 1) 
   ret i16 %a
-; CHECK: @test12
-; CHECK: ret i16 98
+
+; 0x0062
+; LE: @test12
+; LE: ret i16 98
+
+; 0x6200
+; BE: @test12
+; BE: ret i16 25088
 }
 
 
@@ -117,8 +173,12 @@ define i16 @test12() {
 define i1 @test13() {
   %A = load i1* bitcast (i8* @g5 to i1*)
   ret i1 %A
-; CHECK: @test13
-; CHECK: ret i1 false
+
+; LE: @test13
+; LE: ret i1 false
+
+; BE: @test13
+; BE: ret i1 false
 }
 
 @g6 = constant [2 x i8*] [i8* inttoptr (i64 1 to i8*), i8* inttoptr (i64 2 to i8*)]
@@ -126,14 +186,22 @@ define i64 @test14() nounwind {
 entry:
   %tmp = load i64* bitcast ([2 x i8*]* @g6 to i64*)
   ret i64 %tmp
-; CHECK: @test14
-; CHECK: ret i64 1
+
+; LE: @test14
+; LE: ret i64 1
+
+; BE: @test14
+; BE: ret i64 1
 }
 
 define i64 @test15() nounwind {
 entry:
   %tmp = load i64* bitcast (i8** getelementptr inbounds ([2 x i8*]* @g6, i32 0, i64 1) to i64*)
   ret i64 %tmp
-; CHECK: @test15
-; CHECK: ret i64 2
+
+; LE: @test15
+; LE: ret i64 2
+
+; BE: @test15
+; BE: ret i64 2
 }
diff --git a/test/Transforms/CorrelatedValuePropagation/crash.ll b/test/Transforms/CorrelatedValuePropagation/crash.ll
index 80c43d0f1da51..9723d18252a7e 100644
--- a/test/Transforms/CorrelatedValuePropagation/crash.ll
+++ b/test/Transforms/CorrelatedValuePropagation/crash.ll
@@ -35,3 +35,28 @@ srf.exit.i:
 func_29.exit:
   ret void
 }
+
+; PR13972 - only %return is reachable from the entry block; the other blocks form a dead cycle around the %e.2.i phi
+define void @test3() nounwind {
+for.body:
+  br label %return
+
+for.cond.i:                                       ; preds = %if.else.i, %for.body.i
+  %e.2.i = phi i32 [ %e.2.i, %if.else.i ], [ -8, %for.body.i ]
+  br i1 undef, label %return, label %for.body.i
+
+for.body.i:                                       ; preds = %for.cond.i
+  switch i32 %e.2.i, label %for.cond3.i [
+    i32 -3, label %if.else.i
+    i32 0, label %for.cond.i
+  ]
+
+for.cond3.i:                                      ; preds = %for.cond3.i, %for.body.i
+  br label %for.cond3.i
+
+if.else.i:                                        ; preds = %for.body.i
+  br label %for.cond.i
+
+return:                                           ; preds = %for.cond.i, %for.body
+  ret void
+}
diff --git a/test/Transforms/DeadArgElim/dbginfo.ll b/test/Transforms/DeadArgElim/dbginfo.ll
new file mode 100644
index 0000000000000..dcbfaaa3d77b8
--- /dev/null
+++ b/test/Transforms/DeadArgElim/dbginfo.ll
@@ -0,0 +1,64 @@
+; RUN: opt %s -deadargelim -S | FileCheck %s
+; PR14016
+
+; Check that debug info metadata for subprograms stores pointers to
+; updated LLVM functions.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = global i32 0, align 4
+
+define void @_Z3runv() uwtable { ; caller of the two internal functions below: passes null to one and no variadic arguments to the other
+entry:
+  call void @_ZN12_GLOBAL__N_18dead_argEPv(i8* null), !dbg !10
+  call void (...)* @_ZN12_GLOBAL__N_111dead_varargEz(), !dbg !12
+  ret void, !dbg !13
+}
+
+; Argument will be deleted: %foo is never used, the body only increments @x
+define internal void @_ZN12_GLOBAL__N_18dead_argEPv(i8* %foo) nounwind uwtable {
+entry:
+  %0 = load i32* @x, align 4, !dbg !14
+  %inc = add nsw i32 %0, 1, !dbg !14
+  store i32 %inc, i32* @x, align 4, !dbg !14
+  ret void, !dbg !16
+}
+
+; Vararg will be deleted: the body never reads the variadic arguments, it only increments @x
+define internal void @_ZN12_GLOBAL__N_111dead_varargEz(...) nounwind uwtable {
+entry:
+  %0 = load i32* @x, align 4, !dbg !17
+  %inc = add nsw i32 %0, 1, !dbg !17
+  store i32 %inc, i32* @x, align 4, !dbg !17
+  ret void, !dbg !19
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di", metadata !"clang version 3.2 (trunk 165305)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/samsonov/tmp/clang-di/test.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !8, metadata !9}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"run", metadata !"run", metadata !"", metadata !6, i32 8, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [run]
+!6 = metadata !{i32 786473, metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 786478, i32 0, metadata !6, metadata !"dead_vararg", metadata !"dead_vararg", metadata !"", metadata !6, i32 5, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (...)* @_ZN12_GLOBAL__N_111dead_varargEz, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [dead_vararg]
+
+; CHECK: metadata !"dead_vararg"{{.*}}void ()* @_ZN12_GLOBAL__N_111dead_varargEz
+
+!9 = metadata !{i32 786478, i32 0, metadata !6, metadata !"dead_arg", metadata !"dead_arg", metadata !"", metadata !6, i32 4, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @_ZN12_GLOBAL__N_18dead_argEPv, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [dead_arg]
+
+; CHECK: metadata !"dead_arg"{{.*}}void ()* @_ZN12_GLOBAL__N_18dead_argEPv
+
+!10 = metadata !{i32 8, i32 14, metadata !11, null}
+!11 = metadata !{i32 786443, metadata !5, i32 8, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
+!12 = metadata !{i32 8, i32 27, metadata !11, null}
+!13 = metadata !{i32 8, i32 42, metadata !11, null}
+!14 = metadata !{i32 4, i32 28, metadata !15, null}
+!15 = metadata !{i32 786443, metadata !9, i32 4, i32 26, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
+!16 = metadata !{i32 4, i32 33, metadata !15, null}
+!17 = metadata !{i32 5, i32 25, metadata !18, null}
+!18 = metadata !{i32 786443, metadata !8, i32 5, i32 23, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
+!19 = metadata !{i32 5, i32 30, metadata !18, null}
diff --git a/test/Transforms/DeadStoreElimination/libcalls.ll b/test/Transforms/DeadStoreElimination/libcalls.ll
new file mode 100644
index 0000000000000..4639c0bc96289
--- /dev/null
+++ b/test/Transforms/DeadStoreElimination/libcalls.ll
@@ -0,0 +1,70 @@
+; RUN: opt -S -basicaa -dse < %s | FileCheck %s
+
+declare i8* @strcpy(i8* %dest, i8* %src) nounwind
+define void @test1(i8* %src) { ; strcpy into a local buffer that is never read afterwards; the call is expected to be removed
+; CHECK: @test1
+  %B = alloca [16 x i8]
+  %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK-NOT: @strcpy
+  %call = call i8* @strcpy(i8* %dest, i8* %src)
+; CHECK: ret void
+  ret void
+}
+
+declare i8* @strncpy(i8* %dest, i8* %src, i32 %n) nounwind
+define void @test2(i8* %src) { ; strncpy into a local buffer that is never read afterwards; the call is expected to be removed
+; CHECK: @test2
+  %B = alloca [16 x i8]
+  %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK-NOT: @strncpy
+  %call = call i8* @strncpy(i8* %dest, i8* %src, i32 12)
+; CHECK: ret void
+  ret void
+}
+
+declare i8* @strcat(i8* %dest, i8* %src) nounwind
+define void @test3(i8* %src) { ; strcat into a local buffer that is never read afterwards; the call is expected to be removed
+; CHECK: @test3
+  %B = alloca [16 x i8]
+  %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK-NOT: @strcat
+  %call = call i8* @strcat(i8* %dest, i8* %src)
+; CHECK: ret void
+  ret void
+}
+
+declare i8* @strncat(i8* %dest, i8* %src, i32 %n) nounwind
+define void @test4(i8* %src) { ; strncat into a local buffer that is never read afterwards; the call is expected to be removed
+; CHECK: @test4
+  %B = alloca [16 x i8]
+  %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK-NOT: @strncat
+  %call = call i8* @strncat(i8* %dest, i8* %src, i32 12)
+; CHECK: ret void
+  ret void
+}
+
+define void @test5(i8* nocapture %src) { ; the later store addresses memory through strcpy's return value, and the call is expected to stay
+; CHECK: @test5
+  %dest = alloca [100 x i8], align 16
+  %arraydecay = getelementptr inbounds [100 x i8]* %dest, i64 0, i64 0
+  %call = call i8* @strcpy(i8* %arraydecay, i8* %src)
+; CHECK: %call = call i8* @strcpy
+  %arrayidx = getelementptr inbounds i8* %call, i64 10
+  store i8 97, i8* %arrayidx, align 1
+  ret void
+}
+
+declare void @user(i8* %p)
+define void @test6(i8* %src) { ; the copied-into buffer is handed to @user afterwards, so both calls are expected to stay
+; CHECK: @test6
+  %B = alloca [16 x i8]
+  %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK: @strcpy
+  %call = call i8* @strcpy(i8* %dest, i8* %src)
+; CHECK: @user
+  call void @user(i8* %dest)
+; CHECK: ret void
+  ret void
+}
+
diff --git a/test/Transforms/DeadStoreElimination/simple.ll b/test/Transforms/DeadStoreElimination/simple.ll
index 7a8cdd531b55b..e0eb90af94373 100644
--- a/test/Transforms/DeadStoreElimination/simple.ll
+++ b/test/Transforms/DeadStoreElimination/simple.ll
@@ -310,3 +310,17 @@ define void @test24([2 x i32]* %a, i32 %b, i32 %c) nounwind {
   store i32 %c, i32* %4, align 4
   ret void
 }
+
+; Check another case like PR13547 where strdup is not like malloc.
+; CHECK: @test25
+; CHECK: load i8
+; CHECK: store i8 0
+; CHECK: store i8 %tmp
+define i8* @test25(i8* %p) nounwind { ; saves byte 4 of %p, zeroes it for the strdup call, then restores it; the load and both stores are expected to stay
+  %p.4 = getelementptr i8* %p, i64 4
+  %tmp = load i8* %p.4, align 1
+  store i8 0, i8* %p.4, align 1
+  %q = call i8* @strdup(i8* %p) nounwind optsize
+  store i8 %tmp, i8* %p.4, align 1
+  ret i8* %q
+}
diff --git a/test/Transforms/EarlyCSE/commute.ll b/test/Transforms/EarlyCSE/commute.ll
new file mode 100644
index 0000000000000..f84a7dd1aae95
--- /dev/null
+++ b/test/Transforms/EarlyCSE/commute.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -S -early-cse | FileCheck %s
+
+; CHECK: @test1
+define void @test1(float %A, float %B, float* %PA, float* %PB) {  ; fadd with swapped operands
+  ; CHECK-NEXT: fadd
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: ret
+  %C = fadd float %A, %B
+  store float %C, float* %PA
+  %D = fadd float %B, %A  ; same fadd as %C with the operands swapped; the directives allow only one fadd in the output
+  store float %D, float* %PB
+  ret void
+}
+
+; CHECK: @test2
+define void @test2(float %A, float %B, i1* %PA, i1* %PB) {  ; fcmp eq with swapped operands
+  ; CHECK-NEXT: fcmp
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: ret
+  %C = fcmp eq float %A, %B
+  store i1 %C, i1* %PA
+  %D = fcmp eq float %B, %A  ; same predicate as %C with the operands swapped; the directives allow only one fcmp
+  store i1 %D, i1* %PB
+  ret void
+}
+
+; CHECK: @test3
+define void @test3(float %A, float %B, i1* %PA, i1* %PB) {  ; fcmp with operands and predicate both swapped
+  ; CHECK-NEXT: fcmp
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: ret
+  %C = fcmp uge float %A, %B
+  store i1 %C, i1* %PA
+  %D = fcmp ule float %B, %A  ; %C with operands swapped and uge turned into ule; the directives allow only one fcmp
+  store i1 %D, i1* %PB
+  ret void
+}
+
+; CHECK: @test4
+define void @test4(i32 %A, i32 %B, i1* %PA, i1* %PB) {  ; icmp eq with swapped operands
+  ; CHECK-NEXT: icmp
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: ret
+  %C = icmp eq i32 %A, %B
+  store i1 %C, i1* %PA
+  %D = icmp eq i32 %B, %A  ; same predicate as %C with the operands swapped; the directives allow only one icmp
+  store i1 %D, i1* %PB
+  ret void
+}
+
+; CHECK: @test5
+define void @test5(i32 %A, i32 %B, i1* %PA, i1* %PB) {  ; icmp with operands and predicate both swapped
+  ; CHECK-NEXT: icmp
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: store
+  ; CHECK-NEXT: ret
+  %C = icmp sgt i32 %A, %B
+  store i1 %C, i1* %PA
+  %D = icmp slt i32 %B, %A  ; %C with operands swapped and sgt turned into slt; the directives allow only one icmp
+  store i1 %D, i1* %PB
+  ret void
+}
diff --git a/test/Transforms/GVN/crash.ll b/test/Transforms/GVN/crash.ll
index 31eae256c6efc..4a8c8e4589c8f 100644
--- a/test/Transforms/GVN/crash.ll
+++ b/test/Transforms/GVN/crash.ll
@@ -163,3 +163,39 @@ entry:
   ret i8 %1
 }
 
+
+; Test that a GEP in an unreachable block with the following form doesn't crash
+; GVN:
+;
+;    %x = gep %some.type %x, ...
+
+%struct.type = type { i64, i32, i32 }
+
+define fastcc void @func() nounwind uwtable ssp align 2 {
+entry:
+  br label %reachable.bb  ; entry never branches to unreachable.bb or u1.bb
+
+;; Unreachable code.
+
+unreachable.bb:
+  %gep.val = getelementptr inbounds %struct.type* %gep.val, i64 1  ; self-referential: %gep.val is its own base pointer
+  br i1 undef, label %u2.bb, label %u1.bb
+
+u1.bb:
+  %tmp1 = getelementptr inbounds %struct.type* %gep.val, i64 0, i32 0  ; first (i64) field of the struct at %gep.val
+  store i64 -1, i64* %tmp1, align 8
+  br label %unreachable.bb  ; only edge into unreachable.bb, forming a cycle with no entry from reachable code
+
+u2.bb:  ; note: also entered from r1.bb below, then loops on itself
+  %0 = load i32* undef, align 4
+  %conv.i.i.i.i.i = zext i32 %0 to i64
+  br label %u2.bb
+
+;; Reachable code.
+
+reachable.bb:
+  br label %r1.bb
+
+r1.bb:
+  br label %u2.bb
+}
diff --git a/test/Transforms/GVN/malloc-load-removal.ll b/test/Transforms/GVN/malloc-load-removal.ll
new file mode 100644
index 0000000000000..66b6929d3038d
--- /dev/null
+++ b/test/Transforms/GVN/malloc-load-removal.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -basicaa -gvn < %s | FileCheck %s
+; RUN: opt -S -basicaa -gvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS
+; PR13694
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare i8* @malloc(i64) nounwind
+
+define noalias i8* @test() nounwind uwtable ssp {  ; load from freshly malloc'ed memory; see the two RUN configurations
+entry:
+  %call = tail call i8* @malloc(i64 100) nounwind
+  %0 = load i8* %call, align 1  ; reads the first byte of the allocation before anything was stored to it
+  %tobool = icmp eq i8 %0, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  store i8 0, i8* %call, align 1  ; only executed when the loaded byte was non-zero
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i8* %call
+
+; CHECK: @test
+; CHECK-NOT: load
+; CHECK-NOT: icmp
+
+; CHECK_NO_LIBCALLS: @test
+; CHECK_NO_LIBCALLS: load
+; CHECK_NO_LIBCALLS: icmp
+}
diff --git a/test/Transforms/GVN/pr14166.ll b/test/Transforms/GVN/pr14166.ll
new file mode 100644
index 0000000000000..9f47e464265b6
--- /dev/null
+++ b/test/Transforms/GVN/pr14166.ll
@@ -0,0 +1,27 @@
+; RUN: opt -gvn -S < %s | FileCheck %s
+target datalayout = "e-p:32:32:32"
+target triple = "i386-pc-linux-gnu"
+define <2 x i32> @test1() {  ; integer-vector reload of a slot just stored as a pointer vector
+  %v1 = alloca <2 x i32>
+  call void @anything(<2 x i32>* %v1)  ; external call that receives the address of %v1
+  %v2 = load <2 x i32>* %v1
+  %v3 = inttoptr <2 x i32> %v2 to <2 x i8*>
+  %v4 = bitcast <2 x i32>* %v1 to <2 x i8*>*  ; same address as %v1, typed as a pointer to <2 x i8*>
+  store <2 x i8*> %v3, <2 x i8*>* %v4
+  %v5 = load <2 x i32>* %v1  ; reloads what the store above wrote; the directives expect a ptrtoint/bitcast chain from %v3 instead
+  ret <2 x i32> %v5
+; CHECK: @test1
+; CHECK: %v1 = alloca <2 x i32>
+; CHECK: call void @anything(<2 x i32>* %v1)
+; CHECK: %v2 = load <2 x i32>* %v1
+; CHECK: %v3 = inttoptr <2 x i32> %v2 to <2 x i8*>
+; CHECK: %v4 = bitcast <2 x i32>* %v1 to <2 x i8*>*
+; CHECK: store <2 x i8*> %v3, <2 x i8*>* %v4
+; CHECK: %1 = ptrtoint <2 x i8*> %v3 to <2 x i32>
+; CHECK: %2 = bitcast <2 x i32> %1 to i64
+; CHECK: %3 = bitcast i64 %2 to <2 x i32>
+; CHECK: ret <2 x i32> %3
+}
+
+declare void @anything(<2 x i32>*)
+
diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index e7641691264c1..72fa819d1c73a 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll
@@ -1,7 +1,5 @@
-; RUN: opt < %s -basicaa -gvn -S -die | FileCheck %s
-
-; 32-bit little endian target.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+; RUN: opt < %s -default-data-layout="e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -default-data-layout="E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32"      -basicaa -gvn -S -die | FileCheck %s
 
 ;; Trivial RLE test.
 define i32 @test0(i32 %V, i32* %P) {
@@ -318,7 +316,7 @@ define i8 @coerce_offset_nonlocal0(i32* %P, i1 %cond) {
   %P4 = getelementptr i8* %P3, i32 2
   br i1 %cond, label %T, label %F
 T:
-  store i32 42, i32* %P
+  store i32 57005, i32* %P
   br label %Cont
   
 F:
diff --git a/test/Transforms/GlobalOpt/blockaddress.ll b/test/Transforms/GlobalOpt/blockaddress.ll
new file mode 100644
index 0000000000000..13da76299d5db
--- /dev/null
+++ b/test/Transforms/GlobalOpt/blockaddress.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -globalopt -S | FileCheck %s
+
+@x = internal global i8* zeroinitializer
+
+define void @f() {
+; CHECK: @f
+
+; Check that we don't hit an assert in Constant::IsThreadDependent()
+; when storing this blockaddress into a global.
+
+  store i8* blockaddress(@g, %here), i8** @x, align 8  ; address of label %here in @g, written to the internal global @x
+  ret void
+}
+
+define void @g() {
+; CHECK: @g
+
+here:  ; label whose blockaddress @f stores into @x
+  ret void
+}
diff --git a/test/Transforms/GlobalOpt/load-store-global.ll b/test/Transforms/GlobalOpt/load-store-global.ll
index f824b2c11cbf9..25a53370fa091 100644
--- a/test/Transforms/GlobalOpt/load-store-global.ll
+++ b/test/Transforms/GlobalOpt/load-store-global.ll
@@ -1,15 +1,38 @@
-; RUN: opt < %s -globalopt -S | not grep G
+; RUN: opt < %s -globalopt -S | FileCheck %s
 
 @G = internal global i32 17             ; <i32*> [#uses=3]
+; CHECK-NOT: @G
 
 define void @foo() {
         %V = load i32* @G               ; <i32> [#uses=1]
         store i32 %V, i32* @G
         ret void
+; CHECK: @foo
+; CHECK-NEXT: ret void
 }
 
 define i32 @bar() {
         %X = load i32* @G               ; <i32> [#uses=1]
         ret i32 %X
+; CHECK: @bar
+; CHECK-NEXT: ret i32 17
+}
+
+@a = internal global i64* null, align 8
+; CHECK-NOT: @a
+
+; PR13968
+define void @qux() nounwind {
+  %b = bitcast i64** @a to i8*  ; uses only the address of @a, not its contents
+  %g = getelementptr i64** @a, i32 1  ; address one element past @a
+  %cmp = icmp ne i8* null, %b  ; the three compares test addresses derived from @a against null
+  %cmp2 = icmp eq i8* null, %b
+  %cmp3 = icmp eq i64** null, %g
+  store i64* inttoptr (i64 1 to i64*), i64** @a, align 8  ; the only store to @a in this function
+  %l = load i64** @a, align 8  ; result unused; the directives expect neither the store nor this load to remain
+  ret void
+; CHECK: @qux
+; CHECK-NOT: store
+; CHECK-NOT: load
 }
 
diff --git a/test/Transforms/GlobalOpt/tls.ll b/test/Transforms/GlobalOpt/tls.ll
new file mode 100644
index 0000000000000..7a410e5ed20b9
--- /dev/null
+++ b/test/Transforms/GlobalOpt/tls.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -globalopt -S | FileCheck %s
+
+declare void @wait()
+declare void @signal()
+declare void @start_thread(void ()*)
+
+@x = internal thread_local global [100 x i32] zeroinitializer, align 16
+@ip = internal global i32* null, align 8
+
+; PR14309: GlobalOpt would think that the value of @ip is always the address of
+; x[1]. However, that address is different for different threads so @ip cannot
+; be replaced with a constant.
+
+define i32 @f() {
+entry:
+  ; Set @ip to point to x[1] for thread 1.
+  store i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), i32** @ip, align 8
+
+  ; Run g on a new thread.
+  tail call void @start_thread(void ()* @g) nounwind
+  tail call void @wait() nounwind  ; external declaration; presumably blocks until @g calls @signal
+
+  ; Reset x[1] for thread 1.
+  store i32 0, i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), align 4
+
+  ; Read the value of @ip, which now points at x[1] for thread 2.
+  %0 = load i32** @ip, align 8
+
+  %1 = load i32* %0, align 4  ; dereference the pointer read from @ip; this is the return value
+  ret i32 %1
+
+; CHECK: @f
+; Make sure that the load from @ip hasn't been removed.
+; CHECK: load i32** @ip
+; CHECK: ret
+}
+
+define internal void @g() nounwind uwtable {  ; passed to @start_thread by @f
+entry:
+  ; Set @ip to point to x[1] for thread 2.
+  store i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), i32** @ip, align 8
+
+  ; Store 50 in x[1] for thread 2.
+  store i32 50, i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), align 4
+
+  tail call void @signal() nounwind  ; external declaration; presumably releases the @wait call in @f
+  ret void
+
+; CHECK: @g
+; Make sure that the store to @ip hasn't been removed.
+; CHECK: store {{.*}} @ip
+; CHECK: ret
+}
diff --git a/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll b/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll
index 708a961272b5a..0c88e83975c1c 100644
--- a/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll
+++ b/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll
@@ -39,11 +39,11 @@
 	%"struct.llvm::SymbolTable" = type opaque
 	%"struct.llvm::SymbolTableListTraits<llvm::Argument,llvm::Function,llvm::Function,llvm::ilist_traits<llvm::Argument> >" = type { %"struct.llvm::Function"*, %"struct.llvm::Function"* }
 	%"struct.llvm::SymbolTableListTraits<llvm::Instruction,llvm::BasicBlock,llvm::Function,llvm::ilist_traits<llvm::Instruction> >" = type { %"struct.llvm::Function"*, %"struct.llvm::BasicBlock"* }
-	%"struct.llvm::TargetData" = type { %"struct.llvm::FunctionPass", i1, i8, i8, i8, i8, i8, i8, i8, i8 }
+	%"struct.llvm::DataLayout" = type { %"struct.llvm::FunctionPass", i1, i8, i8, i8, i8, i8, i8, i8, i8 }
 	%"struct.llvm::TargetFrameInfo" = type { i32 (...)**, i32, i32, i32 }
 	%"struct.llvm::TargetInstrDescriptor" = type { i8*, i32, i32, i32, i1, i32, i32, i32, i32, i32, i32*, i32* }
 	%"struct.llvm::TargetInstrInfo" = type { i32 (...)**, %"struct.llvm::TargetInstrDescriptor"*, i32, i32 }
-	%"struct.llvm::TargetMachine" = type { i32 (...)**, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >", %"struct.llvm::TargetData", %"struct.llvm::IntrinsicLowering"* }
+	%"struct.llvm::TargetMachine" = type { i32 (...)**, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >", %"struct.llvm::DataLayout", %"struct.llvm::IntrinsicLowering"* }
 	%"struct.llvm::TargetRegClassInfo" = type { i32 (...)**, i32, i32, i32 }
 	%"struct.llvm::TargetRegInfo" = type { i32 (...)**, %"struct.std::vector<const llvm::TargetRegClassInfo*,std::allocator<const llvm::TargetRegClassInfo*> >", %"struct.llvm::TargetMachine"* }
 	%"struct.llvm::Type" = type { %"struct.llvm::Value", i32, i32, i1, i32, %"struct.llvm::Type"*, %"struct.std::vector<llvm::PATypeHandle,std::allocator<llvm::PATypeHandle> >" }
diff --git a/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll
new file mode 100644
index 0000000000000..5c478669d2986
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; PR12627
+define void @test1(i32 %x) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %phi1 = phi i1 [ false, %entry ], [ %cmpa, %for.body ]  ; carries the previous iteration's %cmpa
+  %phi2 = phi i1 [ false, %entry ], [ %cmpb, %for.body ]  ; carries the previous iteration's %cmpb
+  %i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ]  ; counts 0..99: the loop exits once %inc reaches 100
+  tail call void @aux(i1 %phi1, i1 %phi2) nounwind
+  %cmpa = icmp sgt i32 %i.07, 200  ; never true for %i.07 in 0..99
+  %cmpb = icmp sgt i32 %i.07, 100  ; never true for %i.07 in 0..99
+  %inc = add nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+
+; CHECK: @test1
+; CHECK-NOT: phi i1
+; CHECK: call void @aux(i1 false, i1 false)
+}
+
+declare void @aux(i1, i1)
diff --git a/test/Transforms/IndVarSimplify/crash.ll b/test/Transforms/IndVarSimplify/crash.ll
index 3335be781dfc6..1b702a3b1a3c9 100644
--- a/test/Transforms/IndVarSimplify/crash.ll
+++ b/test/Transforms/IndVarSimplify/crash.ll
@@ -87,3 +87,47 @@ entry:
 main.f.exit:                                      ; preds = %"3.i"
   unreachable
 }
+
+
+; PR13967
+
+define void @f() nounwind ssp {
+bb:
+  br label %bb4
+
+bb4:
+  %tmp = phi i64 [ %tmp5, %bb7 ], [ undef, %bb ]  ; 64-bit counter whose start value is undef
+  %tmp5 = add nsw i64 %tmp, 1
+  %extract.t1 = trunc i64 %tmp5 to i32  ; low 32 bits of the incremented counter
+  br i1 false, label %bb6, label %bb7  ; constant-false condition: the %bb6 edge is never taken
+
+bb6:
+  br label %bb7
+
+bb7:
+  %.off0 = phi i32 [ undef, %bb6 ], [ %extract.t1, %bb4 ]
+  %tmp8 = icmp eq i32 %.off0, 0  ; loop exits when the truncated value is 0
+  br i1 %tmp8, label %bb9, label %bb4
+
+bb9:
+  ret void
+}
+
+; PR12536
+define void @fn1() noreturn nounwind {  ; contains no ret: every block branches back into the loop nest
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.end, %entry
+  %b.0 = phi i32 [ undef, %entry ], [ %conv, %for.end ]  ; undef on entry, then the zext'ed compare from %for.end
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.cond1, %for.cond
+  %c.0 = phi i32 [ %b.0, %for.cond1 ], [ 0, %for.cond ]  ; 0 on first entry, %b.0 after any back edge
+  br i1 undef, label %for.cond1, label %for.end
+
+for.end:                                          ; preds = %for.cond1
+  %cmp2 = icmp slt i32 %c.0, 1
+  %conv = zext i1 %cmp2 to i32  ; feeds %b.0 on the next trip of the outer loop
+  br label %for.cond
+}
diff --git a/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/test/Transforms/IndVarSimplify/eliminate-comparison.ll
index 953bbdff5c62d..5dca712646657 100644
--- a/test/Transforms/IndVarSimplify/eliminate-comparison.ll
+++ b/test/Transforms/IndVarSimplify/eliminate-comparison.ll
@@ -106,3 +106,106 @@ loop:
 return:
   ret void
 }
+
+; PR14432
+; Indvars should not turn the second loop into an infinite one.
+
+; CHECK: @func_11
+; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10
+; CHECK-NOT: br i1 true, label %noassert68, label %unrolledend
+
+define i32 @func_11() nounwind uwtable {
+entry:
+  br label %forcond
+
+forcond:                                          ; preds = %noassert, %entry
+  %__key6.0 = phi i32 [ 2, %entry ], [ %tmp37, %noassert ]
+  %tmp5 = icmp slt i32 %__key6.0, 10
+  br i1 %tmp5, label %noassert, label %forcond38.preheader
+
+forcond38.preheader:                              ; preds = %forcond
+  br label %forcond38
+
+noassert:                                         ; preds = %forcond
+  %tmp13 = sdiv i32 -32768, %__key6.0
+  %tmp2936 = shl i32 %tmp13, 24
+  %sext23 = shl i32 %tmp13, 24  ; same expression as %tmp2936
+  %tmp32 = icmp eq i32 %tmp2936, %sext23  ; compares two identical shifts
+  %tmp37 = add i32 %__key6.0, 1
+  br i1 %tmp32, label %forcond, label %assert33
+
+assert33:                                         ; preds = %noassert
+  tail call void @llvm.trap()
+  unreachable
+
+forcond38:                                        ; preds = %noassert68, %forcond38.preheader
+  %__key8.0 = phi i32 [ %tmp81, %noassert68 ], [ 2, %forcond38.preheader ]
+  %tmp46 = icmp slt i32 %__key8.0, 10
+  br i1 %tmp46, label %noassert68, label %unrolledend
+
+noassert68:                                       ; preds = %forcond38
+  %tmp57 = sdiv i32 -32768, %__key8.0
+  %sext34 = shl i32 %tmp57, 16
+  %sext21 = shl i32 %tmp57, 16  ; same expression as %sext34
+  %tmp76 = icmp eq i32 %sext34, %sext21  ; compares two identical shifts, so the back edge to %forcond38 is taken
+  %tmp81 = add i32 %__key8.0, 1
+  br i1 %tmp76, label %forcond38, label %assert77
+
+assert77:                                         ; preds = %noassert68
+  tail call void @llvm.trap()
+  unreachable
+
+unrolledend:                                      ; preds = %forcond38
+  ret i32 0
+}
+
+declare void @llvm.trap() noreturn nounwind
+
+; In this case the second loop only has a single iteration, fold the header away
+; CHECK: @func_12
+; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10
+; CHECK: br i1 true, label %noassert68, label %unrolledend
+define i32 @func_12() nounwind uwtable {
+entry:
+  br label %forcond
+
+forcond:                                          ; preds = %noassert, %entry
+  %__key6.0 = phi i32 [ 2, %entry ], [ %tmp37, %noassert ]
+  %tmp5 = icmp slt i32 %__key6.0, 10
+  br i1 %tmp5, label %noassert, label %forcond38.preheader
+
+forcond38.preheader:                              ; preds = %forcond
+  br label %forcond38
+
+noassert:                                         ; preds = %forcond
+  %tmp13 = sdiv i32 -32768, %__key6.0
+  %tmp2936 = shl i32 %tmp13, 24
+  %sext23 = shl i32 %tmp13, 24  ; same expression as %tmp2936
+  %tmp32 = icmp eq i32 %tmp2936, %sext23  ; compares two identical shifts
+  %tmp37 = add i32 %__key6.0, 1
+  br i1 %tmp32, label %forcond, label %assert33
+
+assert33:                                         ; preds = %noassert
+  tail call void @llvm.trap()
+  unreachable
+
+forcond38:                                        ; preds = %noassert68, %forcond38.preheader
+  %__key8.0 = phi i32 [ %tmp81, %noassert68 ], [ 2, %forcond38.preheader ]
+  %tmp46 = icmp slt i32 %__key8.0, 10
+  br i1 %tmp46, label %noassert68, label %unrolledend
+
+noassert68:                                       ; preds = %forcond38
+  %tmp57 = sdiv i32 -32768, %__key8.0
+  %sext34 = shl i32 %tmp57, 16
+  %sext21 = shl i32 %tmp57, 16  ; same expression as %sext34
+  %tmp76 = icmp ne i32 %sext34, %sext21  ; 'ne' of two identical shifts (func_11 uses 'eq' here), so control goes to %assert77
+  %tmp81 = add i32 %__key8.0, 1
+  br i1 %tmp76, label %forcond38, label %assert77
+
+assert77:                                         ; preds = %noassert68
+  tail call void @llvm.trap()
+  unreachable
+
+unrolledend:                                      ; preds = %forcond38
+  ret i32 0
+}
diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
index bfdd000e38eb3..507f695e67c5c 100644
--- a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
+++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
@@ -199,7 +199,6 @@ entry:
 ; back to the loop iv.
 ;
 ; CHECK: loop:
-; CHECK: phi i32
 ; CHECK-NOT: phi
 ; CHECK: exit:
 loop:
diff --git a/test/Transforms/IndVarSimplify/verify-scev.ll b/test/Transforms/IndVarSimplify/verify-scev.ll
new file mode 100644
index 0000000000000..019f5830d520b
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/verify-scev.ll
@@ -0,0 +1,421 @@
+; RUN: opt < %s -S -indvars -verify-scev
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @test1() nounwind uwtable ssp {
+entry:
+  br i1 undef, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 false, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  br i1 undef, label %for.end11, label %for.body3
+
+for.body3:                                        ; preds = %for.end
+  unreachable
+
+for.end11:                                        ; preds = %for.end
+  br i1 undef, label %while.body, label %while.end
+
+while.body:                                       ; preds = %for.end11
+  unreachable
+
+while.end:                                        ; preds = %for.end11
+  br i1 undef, label %if.end115, label %for.cond109
+
+for.cond109:                                      ; preds = %while.end
+  unreachable
+
+if.end115:                                        ; preds = %while.end
+  br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612
+
+while.body119.lr.ph.lr.ph:                        ; preds = %if.end115
+  br i1 undef, label %for.cond612, label %if.end123.us
+
+if.end123.us:                                     ; preds = %while.body119.lr.ph.lr.ph
+  br label %for.cond132.us
+
+for.cond132.us:                                   ; preds = %for.cond132.us, %if.end123.us
+  br i1 undef, label %if.then136.us, label %for.cond132.us
+
+if.then136.us:                                    ; preds = %for.cond132.us
+  br i1 undef, label %while.end220, label %while.body211
+
+while.body211:                                    ; preds = %while.body211, %if.then136.us
+  br i1 undef, label %while.end220, label %while.body211
+
+while.end220:                                     ; preds = %while.body211, %if.then136.us
+  br label %for.cond246.outer
+
+for.cond246.outer:                                ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220
+  br label %for.cond246
+
+for.cond246:                                      ; preds = %for.cond372.loopexit, %for.cond246.outer
+  br i1 undef, label %for.end562, label %if.end250
+
+if.end250:                                        ; preds = %for.cond246
+  br i1 undef, label %if.end256, label %for.end562
+
+if.end256:                                        ; preds = %if.end250
+  %cmp272 = icmp eq i32 undef, undef
+  br i1 %cmp272, label %if.then274, label %for.cond404.preheader
+
+for.cond404.preheader:                            ; preds = %if.end256
+  br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph
+
+for.body409.lr.ph:                                ; preds = %for.cond404.preheader
+  br label %for.body409
+
+if.then274:                                       ; preds = %if.end256
+  br i1 undef, label %for.cond246.outer, label %if.end309
+
+if.end309:                                        ; preds = %if.then274
+  br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.body361:                                      ; preds = %for.body361, %if.end309
+  br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.cond372.loopexit:                             ; preds = %for.body361, %if.end309
+  br i1 undef, label %for.cond394.preheader, label %for.cond246
+
+for.cond394.preheader:                            ; preds = %for.cond372.loopexit
+  br i1 undef, label %for.cond246.outer, label %for.body397
+
+for.body397:                                      ; preds = %for.cond394.preheader
+  unreachable
+
+for.body409:                                      ; preds = %for.inc558, %for.body409.lr.ph
+  %k.029 = phi i32 [ 1, %for.body409.lr.ph ], [ %inc559, %for.inc558 ]
+  br i1 undef, label %if.then412, label %if.else433
+
+if.then412:                                       ; preds = %for.body409
+  br label %if.end440
+
+if.else433:                                       ; preds = %for.body409
+  br label %if.end440
+
+if.end440:                                        ; preds = %if.else433, %if.then412
+  br i1 undef, label %for.inc558, label %if.end461
+
+if.end461:                                        ; preds = %if.end440
+  br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.body517:                                      ; preds = %for.body517, %if.end461
+  br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.cond528.loopexit:                             ; preds = %for.body517, %if.end461
+  br label %for.inc558
+
+for.inc558:                                       ; preds = %for.cond528.loopexit, %if.end440
+  %inc559 = add nsw i32 %k.029, 1
+  %cmp407 = icmp sgt i32 %inc559, undef
+  br i1 %cmp407, label %for.cond246.outer, label %for.body409
+
+for.end562:                                       ; preds = %if.end250, %for.cond246
+  unreachable
+
+for.cond612:                                      ; preds = %while.body119.lr.ph.lr.ph, %if.end115
+  unreachable
+}
+
+define void @test2() nounwind uwtable ssp {
+entry:
+  br i1 undef, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  br i1 undef, label %for.end11, label %for.body3
+
+for.body3:                                        ; preds = %for.end
+  unreachable
+
+for.end11:                                        ; preds = %for.end
+  br i1 undef, label %while.body, label %while.end
+
+while.body:                                       ; preds = %for.end11
+  unreachable
+
+while.end:                                        ; preds = %for.end11
+  br i1 undef, label %if.end115, label %for.cond109
+
+for.cond109:                                      ; preds = %while.end
+  unreachable
+
+if.end115:                                        ; preds = %while.end
+  br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612
+
+while.body119.lr.ph.lr.ph:                        ; preds = %if.end115
+  br i1 undef, label %for.cond612, label %if.end123.us
+
+if.end123.us:                                     ; preds = %while.body119.lr.ph.lr.ph
+  br label %for.cond132.us
+
+for.cond132.us:                                   ; preds = %for.cond132.us, %if.end123.us
+  br i1 undef, label %if.then136.us, label %for.cond132.us
+
+if.then136.us:                                    ; preds = %for.cond132.us
+  br i1 undef, label %while.end220, label %while.body211
+
+while.body211:                                    ; preds = %while.body211, %if.then136.us
+  br i1 undef, label %while.end220, label %while.body211
+
+while.end220:                                     ; preds = %while.body211, %if.then136.us
+  br label %for.cond246.outer
+
+for.cond246.outer:                                ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220
+  br label %for.cond246
+
+for.cond246:                                      ; preds = %for.cond372.loopexit, %for.cond246.outer
+  br i1 undef, label %for.end562, label %if.end250
+
+if.end250:                                        ; preds = %for.cond246
+  br i1 undef, label %if.end256, label %for.end562
+
+if.end256:                                        ; preds = %if.end250
+  %0 = load i32* undef, align 4
+  br i1 undef, label %if.then274, label %for.cond404.preheader
+
+for.cond404.preheader:                            ; preds = %if.end256
+  %add406 = add i32 0, %0
+  br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph
+
+for.body409.lr.ph:                                ; preds = %for.cond404.preheader
+  br label %for.body409
+
+if.then274:                                       ; preds = %if.end256
+  br i1 undef, label %for.cond246.outer, label %if.end309
+
+if.end309:                                        ; preds = %if.then274
+  br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.body361:                                      ; preds = %for.body361, %if.end309
+  br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.cond372.loopexit:                             ; preds = %for.body361, %if.end309
+  br i1 undef, label %for.cond394.preheader, label %for.cond246
+
+for.cond394.preheader:                            ; preds = %for.cond372.loopexit
+  br i1 undef, label %for.cond246.outer, label %for.body397
+
+for.body397:                                      ; preds = %for.cond394.preheader
+  unreachable
+
+for.body409:                                      ; preds = %for.inc558, %for.body409.lr.ph
+  %k.029 = phi i32 [ 1, %for.body409.lr.ph ], [ %inc559, %for.inc558 ]
+  br i1 undef, label %if.then412, label %if.else433
+
+if.then412:                                       ; preds = %for.body409
+  br label %if.end440
+
+if.else433:                                       ; preds = %for.body409
+  br label %if.end440
+
+if.end440:                                        ; preds = %if.else433, %if.then412
+  br i1 undef, label %for.inc558, label %if.end461
+
+if.end461:                                        ; preds = %if.end440
+  br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.body517:                                      ; preds = %for.body517, %if.end461
+  br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.cond528.loopexit:                             ; preds = %for.body517, %if.end461
+  br label %for.inc558
+
+for.inc558:                                       ; preds = %for.cond528.loopexit, %if.end440
+  %inc559 = add nsw i32 %k.029, 1
+  %cmp407 = icmp sgt i32 %inc559, %add406
+  br i1 %cmp407, label %for.cond246.outer, label %for.body409
+
+for.end562:                                       ; preds = %if.end250, %for.cond246
+  unreachable
+
+for.cond612:                                      ; preds = %while.body119.lr.ph.lr.ph, %if.end115
+  unreachable
+}
+
+define void @test3() nounwind uwtable ssp {
+entry:
+  br i1 undef, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  br i1 undef, label %for.end11, label %for.body3
+
+for.body3:                                        ; preds = %for.end
+  unreachable
+
+for.end11:                                        ; preds = %for.end
+  br i1 undef, label %while.body, label %while.end
+
+while.body:                                       ; preds = %for.end11
+  unreachable
+
+while.end:                                        ; preds = %for.end11
+  br i1 undef, label %if.end115, label %for.cond109
+
+for.cond109:                                      ; preds = %while.end
+  unreachable
+
+if.end115:                                        ; preds = %while.end
+  br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612
+
+while.body119.lr.ph.lr.ph:                        ; preds = %if.end115
+  br i1 undef, label %for.cond612, label %if.end123.us
+
+if.end123.us:                                     ; preds = %while.body119.lr.ph.lr.ph
+  br label %for.cond132.us
+
+for.cond132.us:                                   ; preds = %for.cond132.us, %if.end123.us
+  br i1 undef, label %if.then136.us, label %for.cond132.us
+
+if.then136.us:                                    ; preds = %for.cond132.us
+  br i1 undef, label %while.end220, label %while.body211
+
+while.body211:                                    ; preds = %while.body211, %if.then136.us
+  br i1 undef, label %while.end220, label %while.body211
+
+while.end220:                                     ; preds = %while.body211, %if.then136.us
+  br label %for.cond246.outer
+
+for.cond246.outer:                                ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220
+  br label %for.cond246
+
+for.cond246:                                      ; preds = %for.cond372.loopexit, %for.cond246.outer
+  br i1 undef, label %for.end562, label %if.end250
+
+if.end250:                                        ; preds = %for.cond246
+  br i1 undef, label %if.end256, label %for.end562
+
+if.end256:                                        ; preds = %if.end250
+  br i1 undef, label %if.then274, label %for.cond404.preheader
+
+for.cond404.preheader:                            ; preds = %if.end256
+  br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph
+
+for.body409.lr.ph:                                ; preds = %for.cond404.preheader
+  br label %for.body409
+
+if.then274:                                       ; preds = %if.end256
+  br i1 undef, label %for.cond246.outer, label %if.end309
+
+if.end309:                                        ; preds = %if.then274
+  br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.body361:                                      ; preds = %for.body361, %if.end309
+  br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.cond372.loopexit:                             ; preds = %for.body361, %if.end309
+  br i1 undef, label %for.cond394.preheader, label %for.cond246
+
+for.cond394.preheader:                            ; preds = %for.cond372.loopexit
+  br i1 undef, label %for.cond246.outer, label %for.body397
+
+for.body397:                                      ; preds = %for.cond394.preheader
+  unreachable
+
+for.body409:                                      ; preds = %for.inc558, %for.body409.lr.ph
+  br i1 undef, label %if.then412, label %if.else433
+
+if.then412:                                       ; preds = %for.body409
+  br label %if.end440
+
+if.else433:                                       ; preds = %for.body409
+  br label %if.end440
+
+if.end440:                                        ; preds = %if.else433, %if.then412
+  br i1 undef, label %for.inc558, label %if.end461
+
+if.end461:                                        ; preds = %if.end440
+  br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.body517:                                      ; preds = %for.body517, %if.end461
+  br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.cond528.loopexit:                             ; preds = %for.body517, %if.end461
+  br label %for.inc558
+
+for.inc558:                                       ; preds = %for.cond528.loopexit, %if.end440
+  br i1 undef, label %for.cond246.outer, label %for.body409
+
+for.end562:                                       ; preds = %if.end250, %for.cond246
+  unreachable
+
+for.cond612:                                      ; preds = %while.body119.lr.ph.lr.ph, %if.end115
+  unreachable
+}
+
+define void @test4() nounwind uwtable ssp {
+entry:
+  br i1 undef, label %if.end8, label %if.else
+
+if.else:                                          ; preds = %entry
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %entry
+  br i1 undef, label %if.end26, label %if.else22
+
+if.else22:                                        ; preds = %if.end8
+  br label %if.end26
+
+if.end26:                                         ; preds = %if.else22, %if.end8
+  br i1 undef, label %if.end35, label %if.else31
+
+if.else31:                                        ; preds = %if.end26
+  br label %if.end35
+
+if.end35:                                         ; preds = %if.else31, %if.end26
+  br i1 undef, label %for.end226, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %if.end35
+  br label %for.body48
+
+for.body48:                                       ; preds = %for.inc221, %for.body.lr.ph
+  br i1 undef, label %for.inc221, label %for.body65.lr.ph
+
+for.body65.lr.ph:                                 ; preds = %for.body48
+  %0 = load i32* undef, align 4
+  br label %for.body65.us
+
+for.body65.us:                                    ; preds = %for.inc219.us, %for.body65.lr.ph
+  %k.09.us = phi i32 [ %inc.us, %for.inc219.us ], [ 1, %for.body65.lr.ph ]
+  %idxprom66.us = sext i32 %k.09.us to i64
+  br i1 undef, label %for.inc219.us, label %if.end72.us
+
+if.end72.us:                                      ; preds = %for.body65.us
+  br i1 undef, label %if.end93.us, label %if.then76.us
+
+if.then76.us:                                     ; preds = %if.end72.us
+  br label %if.end93.us
+
+if.end93.us:                                      ; preds = %if.then76.us, %if.end72.us
+  br i1 undef, label %if.end110.us, label %for.inc219.us
+
+if.end110.us:                                     ; preds = %if.end93.us
+  br i1 undef, label %for.inc219.us, label %for.body142.us
+
+for.body142.us:                                   ; preds = %for.cond139.loopexit.us, %if.end110.us
+  br label %for.cond152.us
+
+for.cond152.us:                                   ; preds = %for.cond152.us, %for.body142.us
+  br i1 undef, label %for.cond139.loopexit.us, label %for.cond152.us
+
+for.inc219.us:                                    ; preds = %for.cond139.loopexit.us, %if.end110.us, %if.end93.us, %for.body65.us
+  %inc.us = add nsw i32 %k.09.us, 1
+  %cmp64.us = icmp sgt i32 %inc.us, %0
+  br i1 %cmp64.us, label %for.inc221, label %for.body65.us
+
+for.cond139.loopexit.us:                          ; preds = %for.cond152.us
+  br i1 undef, label %for.inc219.us, label %for.body142.us
+
+for.inc221:                                       ; preds = %for.inc219.us, %for.body48
+  br label %for.body48
+
+for.end226:                                       ; preds = %if.end35
+  ret void
+}
diff --git a/test/Transforms/Inline/recursive.ll b/test/Transforms/Inline/recursive.ll
new file mode 100644
index 0000000000000..5fe8d1639ca36
--- /dev/null
+++ b/test/Transforms/Inline/recursive.ll
@@ -0,0 +1,38 @@
+; RUN: opt %s -inline -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin10.0"
+
+; rdar://10853263
+
+; Make sure that the callee is still here.
+; CHECK: define i32 @callee
+define i32 @callee(i32 %param) {
+ %yyy = alloca [100000 x i8]
+ %r = bitcast [100000 x i8]* %yyy to i8*
+ call void @foo2(i8* %r)
+ ret i32 4
+}
+
+; CHECK: define i32 @caller
+; CHECK-NEXT: entry:
+; CHECK-NOT: alloca
+; CHECK: ret
+define i32 @caller(i32 %param) {
+entry:
+  %t = call i32 @foo(i32 %param)
+  %cmp = icmp eq i32 %t, -1
+  br i1 %cmp, label %exit, label %cont
+
+cont:
+  %r = call i32 @caller(i32 %t)
+  %f = call i32 @callee(i32 %r)
+  br label %cont
+exit:
+  ret i32 4
+}
+
+declare void @foo2(i8* %in)
+
+declare i32 @foo(i32 %param)
+
diff --git a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
index 73e5a6653e802..18aab7f27efd1 100644
--- a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
+++ b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
@@ -1,12 +1,14 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -default-data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -default-data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE
 ; PR13442
 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-
 @test = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
 
 define i64 @foo() {
   %ret = load i64* bitcast (i8* getelementptr (i8* bitcast ([4 x i32]* @test to i8*), i64 2) to i64*), align 1
   ret i64 %ret
-  ; CHECK: ret i64 844424930263040
+  ; 0x00030000_00020000 in [01 00/00 00 02 00 00 00 03 00/00 00 04 00 00 00]
+  ; LE: ret i64 844424930263040
+  ; 0x00010000_00020000 in [00 00/00 01 00 00 00 02 00 00/00 03 00 00 00 04]
+  ; BE: ret i64 281474976841728
 }
diff --git a/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll b/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll
new file mode 100644
index 0000000000000..4efaf8c172556
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; rdar://12182093
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: @udiv400
+; CHECK: udiv i32 %x, 400
+; CHECK: ret
+define i32 @udiv400(i32 %x) {
+entry:
+  %div = lshr i32 %x, 2
+  %div1 = udiv i32 %div, 100
+  ret i32 %div1
+}
+
+
+; CHECK: @udiv400_no
+; CHECK: ashr
+; CHECK: div
+; CHECK: ret
+define i32 @udiv400_no(i32 %x) {
+entry:
+  %div = ashr i32 %x, 2
+  %div1 = udiv i32 %div, 100
+  ret i32 %div1
+}
+
+; CHECK: @sdiv400_yes
+; CHECK: udiv i32 %x, 400
+; CHECK: ret
+define i32 @sdiv400_yes(i32 %x) {
+entry:
+  %div = lshr i32 %x, 2
+  ; The sign bits of both operands are zero (i.e. we can prove they are
+  ; unsigned inputs), so this sdiv is turned into a udiv.
+  ; Next, it is optimized just like the udiv in @udiv400.
+  %div1 = sdiv i32 %div, 100
+  ret i32 %div1
+}
+
+
+; CHECK: @udiv_i80
+; CHECK: udiv i80 %x, 400
+; CHECK: ret
+define i80 @udiv_i80(i80 %x) {
+  %div = lshr i80 %x, 2
+  %div1 = udiv i80 %div, 100
+  ret i80 %div1
+}
+
+define i32 @no_crash_notconst_udiv(i32 %x, i32 %notconst) {
+  %div = lshr i32 %x, %notconst
+  %div1 = udiv i32 %div, 100
+  ret i32 %div1
+}
diff --git a/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll b/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll
new file mode 100644
index 0000000000000..ba025e92b0104
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; When merging zero-sized allocas, check that the requested alignments of the
+; allocas are obeyed.
+
+@x = global i8* null, align 8
+@y = global i8* null, align 8
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: @f
+; CHECK-NEXT: alloca [0 x i8], align 1024
+; CHECK-NOT: alloca
+; CHECK: ret void
+define void @f() {
+  %1 = alloca [0 x i8], align 1
+  %2 = alloca [0 x i8], align 1024
+  %3 = getelementptr inbounds [0 x i8]* %1, i64 0, i64 0
+  %4 = getelementptr inbounds [0 x i8]* %2, i64 0, i64 0
+  store i8* %3, i8** @x, align 8
+  store i8* %4, i8** @y, align 8
+  ret void
+}
diff --git a/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll b/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll
new file mode 100644
index 0000000000000..4cd60b42fbe12
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Check we don't crash due to lack of target data.
+
+@G = constant [100 x i8] zeroinitializer
+
+declare void @bar(i8*)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define void @test() {
+; CHECK: @test
+; CHECK: llvm.memcpy
+; CHECK: ret void
+  %A = alloca [100 x i8]
+  %a = getelementptr inbounds [100 x i8]* %A, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* getelementptr inbounds ([100 x i8]* @G, i64 0, i32 0), i64 100, i32 4, i1 false)
+  call void @bar(i8* %a) readonly
+  ret void
+}
diff --git a/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll b/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll
new file mode 100644
index 0000000000000..20ea282687422
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -instcombine -S
+
+; Make sure that we don't crash when optimizing vectors of pointers.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.hoge = type { double*, double*, double*, double** }
+
+define void @widget(%struct.hoge* nocapture %arg) nounwind uwtable ssp {
+bb:
+  %tmp = getelementptr inbounds %struct.hoge* %arg, i64 0, i32 0
+  br i1 undef, label %bb1, label %bb17
+
+bb1:                                              ; preds = %bb
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  br label %bb17
+
+bb3:                                              ; preds = %bb1
+  %tmp4 = bitcast double** %tmp to <2 x double*>*
+  %tmp5 = load <2 x double*>* %tmp4, align 8
+  %tmp6 = ptrtoint <2 x double*> %tmp5 to <2 x i64>
+  %tmp7 = sub <2 x i64> zeroinitializer, %tmp6
+  %tmp8 = ashr exact <2 x i64> %tmp7, <i64 3, i64 3>
+  %tmp9 = extractelement <2 x i64> %tmp8, i32 0
+  %tmp10 = add nsw i64 undef, %tmp9
+  br i1 undef, label %bb11, label %bb12
+
+bb11:                                             ; preds = %bb3
+  br label %bb13
+
+bb12:                                             ; preds = %bb3
+  br label %bb13
+
+bb13:                                             ; preds = %bb12, %bb11
+  br i1 undef, label %bb16, label %bb14
+
+bb14:                                             ; preds = %bb13
+  br i1 undef, label %bb16, label %bb15
+
+bb15:                                             ; preds = %bb14
+  br label %bb16
+
+bb16:                                             ; preds = %bb15, %bb14, %bb13
+  unreachable
+
+bb17:                                             ; preds = %bb2, %bb
+  ret void
+}
diff --git a/test/Transforms/InstCombine/align-addr.ll b/test/Transforms/InstCombine/align-addr.ll
index 27916b9860307..4ea1bd9beb3b5 100644
--- a/test/Transforms/InstCombine/align-addr.ll
+++ b/test/Transforms/InstCombine/align-addr.ll
@@ -58,3 +58,19 @@ define double @test2(double* %p, double %n) nounwind {
   store double %n, double* %p
   ret double %t
 }
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+declare void @use(i8*)
+
+%struct.s = type { i32, i32, i32, i32 }
+
+define void @test3(%struct.s* sret %a4) {
+; Check that the alignment is bumped up to the alignment of the sret type.
+; CHECK: @test3
+  %a4.cast = bitcast %struct.s* %a4 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %a4.cast, i8 0, i64 16, i32 1, i1 false)
+; CHECK: call void @llvm.memset.p0i8.i64(i8* %a4.cast, i8 0, i64 16, i32 4, i1 false)
+  call void @use(i8* %a4.cast)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll
index 50e03479f650b..68a671cec88af 100644
--- a/test/Transforms/InstCombine/alloca.ll
+++ b/test/Transforms/InstCombine/alloca.ll
@@ -94,3 +94,19 @@ entry:
   tail call void @f(i32* %b)
   ret void
 }
+
+; PR14371
+%opaque_type = type opaque
+%real_type = type { { i32, i32* } }
+
+@opaque_global = external constant %opaque_type, align 4
+
+define void @test7() {
+entry:
+  %0 = alloca %real_type, align 4
+  %1 = bitcast %real_type* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* bitcast (%opaque_type* @opaque_global to i8*), i32 8, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/Transforms/InstCombine/and-fcmp.ll b/test/Transforms/InstCombine/and-fcmp.ll
index 838c2f73fb910..40c44c09a8c01 100644
--- a/test/Transforms/InstCombine/and-fcmp.ll
+++ b/test/Transforms/InstCombine/and-fcmp.ll
@@ -10,7 +10,7 @@ define zeroext i8 @t1(float %x, float %y) nounwind {
 ; CHECK: fcmp oeq float %x, %y
 ; CHECK-NOT: fcmp ueq float %x, %y
 ; CHECK-NOT: fcmp ord float %x, %y
-; CHECK-NOW: and
+; CHECK-NOT: and
 }
 
 define zeroext i8 @t2(float %x, float %y) nounwind {
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 56e5ca3ff7209..b4eb69d4363dd 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -694,3 +694,209 @@ define i1 @test67(i1 %a, i32 %b) {
 ; CHECK: @test67
 ; CHECK: ret i1 false
 }
+
+%s = type { i32, i32, i32 }
+
+define %s @test68(%s *%p, i64 %i) {
+; CHECK: @test68
+  %o = mul i64 %i, 12
+  %q = bitcast %s* %p to i8*
+  %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr %s*
+  %r = bitcast i8* %pp to %s*
+  %l = load %s* %r
+; CHECK-NEXT: load %s*
+  ret %s %l
+; CHECK-NEXT: ret %s
+}
+
+define double @test69(double *%p, i64 %i) {
+; CHECK: @test69
+  %o = shl nsw i64 %i, 3
+  %q = bitcast double* %p to i8*
+  %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr inbounds double*
+  %r = bitcast i8* %pp to double*
+  %l = load double* %r
+; CHECK-NEXT: load double*
+  ret double %l
+; CHECK-NEXT: ret double
+}
+
+define %s @test70(%s *%p, i64 %i) {
+; CHECK: @test70
+  %o = mul nsw i64 %i, 36
+; CHECK-NEXT: mul nsw i64 %i, 3
+  %q = bitcast %s* %p to i8*
+  %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr inbounds %s*
+  %r = bitcast i8* %pp to %s*
+  %l = load %s* %r
+; CHECK-NEXT: load %s*
+  ret %s %l
+; CHECK-NEXT: ret %s
+}
+
+define double @test71(double *%p, i64 %i) {
+; CHECK: @test71
+  %o = shl i64 %i, 5
+; CHECK-NEXT: shl i64 %i, 2
+  %q = bitcast double* %p to i8*
+  %pp = getelementptr i8* %q, i64 %o
+; CHECK-NEXT: getelementptr double*
+  %r = bitcast i8* %pp to double*
+  %l = load double* %r
+; CHECK-NEXT: load double*
+  ret double %l
+; CHECK-NEXT: ret double
+}
+
+define double @test72(double *%p, i32 %i) {
+; CHECK: @test72
+  %so = mul nsw i32 %i, 8
+  %o = sext i32 %so to i64
+; CHECK-NEXT: sext i32 %i to i64
+  %q = bitcast double* %p to i8*
+  %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr inbounds double*
+  %r = bitcast i8* %pp to double*
+  %l = load double* %r
+; CHECK-NEXT: load double*
+  ret double %l
+; CHECK-NEXT: ret double
+}
+
+define double @test73(double *%p, i128 %i) {
+; CHECK: @test73
+  %lo = mul nsw i128 %i, 8
+  %o = trunc i128 %lo to i64
+; CHECK-NEXT: trunc i128 %i to i64
+  %q = bitcast double* %p to i8*
+  %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr double*
+  %r = bitcast i8* %pp to double*
+  %l = load double* %r
+; CHECK-NEXT: load double*
+  ret double %l
+; CHECK-NEXT: ret double
+}
+
+define double @test74(double *%p, i64 %i) {
+; CHECK: @test74
+  %q = bitcast double* %p to i64*
+  %pp = getelementptr inbounds i64* %q, i64 %i
+; CHECK-NEXT: getelementptr inbounds double*
+  %r = bitcast i64* %pp to double*
+  %l = load double* %r
+; CHECK-NEXT: load double*
+  ret double %l
+; CHECK-NEXT: ret double
+}
+
+define i32* @test75(i32* %p, i32 %x) {
+; CHECK: @test75
+  %y = shl i32 %x, 3
+; CHECK-NEXT: shl i32 %x, 3
+  %z = sext i32 %y to i64
+; CHECK-NEXT: sext i32 %y to i64
+  %q = bitcast i32* %p to i8*
+  %r = getelementptr i8* %q, i64 %z
+  %s = bitcast i8* %r to i32*
+  ret i32* %s
+}
+
+define %s @test76(%s *%p, i64 %i, i64 %j) {
+; CHECK: @test76
+  %o = mul i64 %i, 12
+  %o2 = mul nsw i64 %o, %j
+; CHECK-NEXT: %o2 = mul i64 %i, %j
+  %q = bitcast %s* %p to i8*
+  %pp = getelementptr inbounds i8* %q, i64 %o2
+; CHECK-NEXT: getelementptr %s* %p, i64 %o2
+  %r = bitcast i8* %pp to %s*
+  %l = load %s* %r
+; CHECK-NEXT: load %s*
+  ret %s %l
+; CHECK-NEXT: ret %s
+}
+
+define %s @test77(%s *%p, i64 %i, i64 %j) {
+; CHECK: @test77
+  %o = mul nsw i64 %i, 36
+  %o2 = mul nsw i64 %o, %j
+; CHECK-NEXT: %o = mul nsw i64 %i, 3
+; CHECK-NEXT: %o2 = mul nsw i64 %o, %j
+  %q = bitcast %s* %p to i8*
+  %pp = getelementptr inbounds i8* %q, i64 %o2
+; CHECK-NEXT: getelementptr inbounds %s* %p, i64 %o2
+  %r = bitcast i8* %pp to %s*
+  %l = load %s* %r
+; CHECK-NEXT: load %s*
+  ret %s %l
+; CHECK-NEXT: ret %s
+}
+
+define %s @test78(%s *%p, i64 %i, i64 %j, i32 %k, i32 %l, i128 %m, i128 %n) {
+; CHECK: @test78
+  %a = mul nsw i32 %k, 36
+; CHECK-NEXT: mul nsw i32 %k, 3
+  %b = mul nsw i32 %a, %l
+; CHECK-NEXT: mul nsw i32 %a, %l
+  %c = sext i32 %b to i128
+; CHECK-NEXT: sext i32 %b to i128
+  %d = mul nsw i128 %c, %m
+; CHECK-NEXT: mul nsw i128 %c, %m
+  %e = mul i128 %d, %n
+; CHECK-NEXT: mul i128 %d, %n
+  %f = trunc i128 %e to i64
+; CHECK-NEXT: trunc i128 %e to i64
+  %g = mul nsw i64 %f, %i
+; CHECK-NEXT: mul i64 %f, %i
+  %h = mul nsw i64 %g, %j
+; CHECK-NEXT: mul i64 %g, %j
+  %q = bitcast %s* %p to i8*
+  %pp = getelementptr inbounds i8* %q, i64 %h
+; CHECK-NEXT: getelementptr %s* %p, i64 %h
+  %r = bitcast i8* %pp to %s*
+  %load = load %s* %r
+; CHECK-NEXT: load %s*
+  ret %s %load
+; CHECK-NEXT: ret %s
+}
+
+define %s @test79(%s *%p, i64 %i, i32 %j) {
+; CHECK: @test79
+  %a = mul nsw i64 %i, 36
+; CHECK: mul nsw i64 %i, 36
+  %b = trunc i64 %a to i32
+  %c = mul i32 %b, %j
+  %q = bitcast %s* %p to i8*
+; CHECK: bitcast
+  %pp = getelementptr inbounds i8* %q, i32 %c
+  %r = bitcast i8* %pp to %s*
+  %l = load %s* %r
+  ret %s %l
+}
+
+define double @test80([100 x double]* %p, i32 %i) {
+; CHECK: @test80
+  %tmp = mul nsw i32 %i, 8
+; CHECK-NEXT: sext i32 %i to i64
+  %q = bitcast [100 x double]* %p to i8*
+  %pp = getelementptr i8* %q, i32 %tmp
+; CHECK-NEXT: getelementptr [100 x double]*
+  %r = bitcast i8* %pp to double*
+  %l = load double* %r
+; CHECK-NEXT: load double*
+  ret double %l
+; CHECK-NEXT: ret double
+}
+
+define double @test81(double *%p, float %f) {
+  %i = fptosi float %f to i64
+  %q = bitcast double* %p to i8*
+  %pp = getelementptr i8* %q, i64 %i
+  %r = bitcast i8* %pp to double*
+  %l = load double* %r
+  ret double %l
+}
diff --git a/test/Transforms/InstCombine/disable-simplify-libcalls.ll b/test/Transforms/InstCombine/disable-simplify-libcalls.ll
new file mode 100644
index 0000000000000..d81e9ae5bd732
--- /dev/null
+++ b/test/Transforms/InstCombine/disable-simplify-libcalls.ll
@@ -0,0 +1,236 @@
+; Test that -disable-simplify-libcalls is wired up correctly.
+;
+; RUN: opt < %s -instcombine -disable-simplify-libcalls -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@.str  = constant [1 x i8] zeroinitializer, align 1
+@.str1 = constant [13 x i8] c"hello, world\00", align 1
+@.str2 = constant [4 x i8] c"foo\00", align 1
+@.str3 = constant [4 x i8] c"bar\00", align 1
+@.str4 = constant [6 x i8] c"123.4\00", align 1
+@.str5 = constant [5 x i8] c"1234\00", align 1
+@empty = constant [1 x i8] c"\00", align 1
+
+declare double @ceil(double)
+declare double @copysign(double, double)
+declare double @cos(double)
+declare double @fabs(double)
+declare double @floor(double)
+declare i8* @strcat(i8*, i8*)
+declare i8* @strncat(i8*, i8*, i32)
+declare i8* @strchr(i8*, i32)
+declare i8* @strrchr(i8*, i32)
+declare i32 @strcmp(i8*, i8*)
+declare i32 @strncmp(i8*, i8*, i64)
+declare i8* @strcpy(i8*, i8*)
+declare i8* @stpcpy(i8*, i8*)
+declare i8* @strncpy(i8*, i8*, i64)
+declare i64 @strlen(i8*)
+declare i8* @strpbrk(i8*, i8*)
+declare i64 @strspn(i8*, i8*)
+declare double @strtod(i8*, i8**)
+declare float @strtof(i8*, i8**)
+declare x86_fp80 @strtold(i8*, i8**)
+declare i64 @strtol(i8*, i8**, i32)
+declare i64 @strtoll(i8*, i8**, i32)
+declare i64 @strtoul(i8*, i8**, i32)
+declare i64 @strtoull(i8*, i8**, i32)
+declare i64 @strcspn(i8*, i8*)
+
+define double @t1(double %x) {
+; CHECK: @t1
+  %ret = call double @ceil(double %x)
+  ret double %ret
+; CHECK: call double @ceil
+}
+
+define double @t2(double %x, double %y) {
+; CHECK: @t2
+  %ret = call double @copysign(double %x, double %y)
+  ret double %ret
+; CHECK: call double @copysign
+}
+
+define double @t3(double %x) {
+; CHECK: @t3
+  %call = call double @cos(double %x)
+  ret double %call
+; CHECK: call double @cos
+}
+
+define double @t4(double %x) {
+; CHECK: @t4
+  %ret = call double @fabs(double %x)
+  ret double %ret
+; CHECK: call double @fabs
+}
+
+define double @t5(double %x) {
+; CHECK: @t5
+  %ret = call double @floor(double %x)
+  ret double %ret
+; CHECK: call double @floor
+}
+
+define i8* @t6(i8* %x) {
+; CHECK: @t6
+  %empty = getelementptr [1 x i8]* @empty, i32 0, i32 0
+  %ret = call i8* @strcat(i8* %x, i8* %empty)
+  ret i8* %ret
+; CHECK: call i8* @strcat
+}
+
+define i8* @t7(i8* %x) {
+; CHECK: @t7
+  %empty = getelementptr [1 x i8]* @empty, i32 0, i32 0
+  %ret = call i8* @strncat(i8* %x, i8* %empty, i32 1)
+  ret i8* %ret
+; CHECK: call i8* @strncat
+}
+
+define i8* @t8() {
+; CHECK: @t8
+  %x = getelementptr inbounds [13 x i8]* @.str1, i32 0, i32 0
+  %ret = call i8* @strchr(i8* %x, i32 119)
+  ret i8* %ret
+; CHECK: call i8* @strchr
+}
+
+define i8* @t9() {
+; CHECK: @t9
+  %x = getelementptr inbounds [13 x i8]* @.str1, i32 0, i32 0
+  %ret = call i8* @strrchr(i8* %x, i32 119)
+  ret i8* %ret
+; CHECK: call i8* @strrchr
+}
+
+define i32 @t10() {
+; CHECK: @t10
+  %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+  %y = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+  %ret = call i32 @strcmp(i8* %x, i8* %y)
+  ret i32 %ret
+; CHECK: call i32 @strcmp
+}
+
+define i32 @t11() {
+; CHECK: @t11
+  %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+  %y = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+  %ret = call i32 @strncmp(i8* %x, i8* %y, i64 3)
+  ret i32 %ret
+; CHECK: call i32 @strncmp
+}
+
+define i8* @t12(i8* %x) {
+; CHECK: @t12
+  %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+  %ret = call i8* @strcpy(i8* %x, i8* %y)
+  ret i8* %ret
+; CHECK: call i8* @strcpy
+}
+
+define i8* @t13(i8* %x) {
+; CHECK: @t13
+  %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+  %ret = call i8* @stpcpy(i8* %x, i8* %y)
+  ret i8* %ret
+; CHECK: call i8* @stpcpy
+}
+
+define i8* @t14(i8* %x) {
+; CHECK: @t14
+  %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+  %ret = call i8* @strncpy(i8* %x, i8* %y, i64 3)
+  ret i8* %ret
+; CHECK: call i8* @strncpy
+}
+
+define i64 @t15() {
+; CHECK: @t15
+  %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+  %ret = call i64 @strlen(i8* %x)
+  ret i64 %ret
+; CHECK: call i64 @strlen
+}
+
+define i8* @t16(i8* %x) {
+; CHECK: @t16
+  %y = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+  %ret = call i8* @strpbrk(i8* %x, i8* %y)
+  ret i8* %ret
+; CHECK: call i8* @strpbrk
+}
+
+define i64 @t17(i8* %x) {
+; CHECK: @t17
+  %y = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+  %ret = call i64 @strspn(i8* %x, i8* %y)
+  ret i64 %ret
+; CHECK: call i64 @strspn
+}
+
+define double @t18(i8** %y) {
+; CHECK: @t18
+  %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+  %ret = call double @strtod(i8* %x, i8** %y)
+  ret double %ret
+; CHECK: call double @strtod
+}
+
+define float @t19(i8** %y) {
+; CHECK: @t19
+  %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+  %ret = call float @strtof(i8* %x, i8** %y)
+  ret float %ret
+; CHECK: call float @strtof
+}
+
+define x86_fp80 @t20(i8** %y) {
+; CHECK: @t20
+  %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+  %ret = call x86_fp80 @strtold(i8* %x, i8** %y)
+  ret x86_fp80 %ret
+; CHECK: call x86_fp80 @strtold
+}
+
+define i64 @t21(i8** %y) {
+; CHECK: @t21
+  %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+  %ret = call i64 @strtol(i8* %x, i8** %y, i32 10)
+  ret i64 %ret
+; CHECK: call i64 @strtol
+}
+
+define i64 @t22(i8** %y) {
+; CHECK: @t22
+  %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+  %ret = call i64 @strtoll(i8* %x, i8** %y, i32 10)
+  ret i64 %ret
+; CHECK: call i64 @strtoll
+}
+
+define i64 @t23(i8** %y) {
+; CHECK: @t23
+  %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+  %ret = call i64 @strtoul(i8* %x, i8** %y, i32 10)
+  ret i64 %ret
+; CHECK: call i64 @strtoul
+}
+
+define i64 @t24(i8** %y) {
+; CHECK: @t24
+  %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+  %ret = call i64 @strtoull(i8* %x, i8** %y, i32 10)
+  ret i64 %ret
+; CHECK: call i64 @strtoull
+}
+
+define i64 @t25(i8* %y) {
+; CHECK: @t25
+  %x = getelementptr [1 x i8]* @empty, i32 0, i32 0
+  %ret = call i64 @strcspn(i8* %x, i8* %y)
+  ret i64 %ret
+; CHECK: call i64 @strcspn
+}
diff --git a/test/Transforms/InstCombine/div-shift.ll b/test/Transforms/InstCombine/div-shift.ll
index a07f3ea949142..e0372ebac184f 100644
--- a/test/Transforms/InstCombine/div-shift.ll
+++ b/test/Transforms/InstCombine/div-shift.ll
@@ -21,3 +21,17 @@ define i64 @t2(i64 %x, i32 %y) nounwind  {
   %3 = udiv i64 %x, %2
   ret i64 %3
 }
+
+; PR13250
+define i64 @t3(i64 %x, i32 %y) nounwind  {
+; CHECK: t3
+; CHECK-NOT: udiv
+; CHECK-NEXT: %1 = add i32 %y, 2
+; CHECK-NEXT: %2 = zext i32 %1 to i64
+; CHECK-NEXT: %3 = lshr i64 %x, %2
+; CHECK-NEXT: ret i64 %3
+  %1 = shl i32 4, %y
+  %2 = zext i32 %1 to i64
+  %3 = udiv i64 %x, %2
+  ret i64 %3
+}
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index d08cbf574a231..376fa079d24c2 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -54,9 +54,8 @@ define i1 @test7(float %x) nounwind readnone ssp noredzone {
   %ext = fpext float %x to ppc_fp128
   %cmp = fcmp ogt ppc_fp128 %ext, 0xM00000000000000000000000000000000
   ret i1 %cmp
-; Can't convert ppc_fp128
 ; CHECK: @test7
-; CHECK-NEXT: fpext float %x to ppc_fp128
+; CHECK-NEXT: fcmp ogt float %x, 0.000000e+00
 }
 
 define float @test8(float %x) nounwind readnone optsize ssp {
@@ -69,3 +68,93 @@ define float @test8(float %x) nounwind readnone optsize ssp {
 ; CHECK: @test8
 ; CHECK-NEXT: fcmp olt float %x, 0.000000e+00
 }
+
+declare double @fabs(double) nounwind readnone
+
+define i32 @test9(double %a) nounwind {
+  %call = tail call double @fabs(double %a) nounwind
+  %cmp = fcmp olt double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK: @test9
+; CHECK-NOT: fabs
+; CHECK: ret i32 0
+}
+
+define i32 @test10(double %a) nounwind {
+  %call = tail call double @fabs(double %a) nounwind
+  %cmp = fcmp ole double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK: @test10
+; CHECK-NOT: fabs
+; CHECK: fcmp oeq double %a, 0.000000e+00
+}
+
+define i32 @test11(double %a) nounwind {
+  %call = tail call double @fabs(double %a) nounwind
+  %cmp = fcmp ogt double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK: @test11
+; CHECK-NOT: fabs
+; CHECK: fcmp one double %a, 0.000000e+00
+}
+
+define i32 @test12(double %a) nounwind {
+  %call = tail call double @fabs(double %a) nounwind
+  %cmp = fcmp oge double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK: @test12
+; CHECK-NOT: fabs
+; CHECK: fcmp ord double %a, 0.000000e+00
+}
+
+define i32 @test13(double %a) nounwind {
+  %call = tail call double @fabs(double %a) nounwind
+  %cmp = fcmp une double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK: @test13
+; CHECK-NOT: fabs
+; CHECK: fcmp une double %a, 0.000000e+00
+}
+
+define i32 @test14(double %a) nounwind {
+  %call = tail call double @fabs(double %a) nounwind
+  %cmp = fcmp oeq double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK: @test14
+; CHECK-NOT: fabs
+; CHECK: fcmp oeq double %a, 0.000000e+00
+}
+
+define i32 @test15(double %a) nounwind {
+  %call = tail call double @fabs(double %a) nounwind
+  %cmp = fcmp one double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK: @test15
+; CHECK-NOT: fabs
+; CHECK: fcmp one double %a, 0.000000e+00
+}
+
+define i32 @test16(double %a) nounwind {
+  %call = tail call double @fabs(double %a) nounwind
+  %cmp = fcmp ueq double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK: @test16
+; CHECK-NOT: fabs
+; CHECK: fcmp ueq double %a, 0.000000e+00
+}
+
+; Don't crash.
+define i32 @test17(double %a, double (double)* %p) nounwind {
+  %call = tail call double %p(double %a) nounwind
+  %cmp = fcmp ueq double %call, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
diff --git a/test/Transforms/InstCombine/fold-vector-select.ll b/test/Transforms/InstCombine/fold-vector-select.ll
index 3f22522a6ce48..2cb970bf41774 100644
--- a/test/Transforms/InstCombine/fold-vector-select.ll
+++ b/test/Transforms/InstCombine/fold-vector-select.ll
@@ -1,13 +1,148 @@
 ; RUN: opt < %s -instcombine -S | not grep select
 
-define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) {
- %r = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> zeroinitializer
- %g = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>,  <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 6, i32 9, i32 1>
- %b = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>,  <4 x i32> zeroinitializer, <4 x i32> <i32 7, i32 1, i32 4, i32 9>
- %a = select <4 x i1> zeroinitializer,  <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 8, i32 5>
- store <4 x i32> %r, <4 x i32>* %A
- store <4 x i32> %g, <4 x i32>* %B
- store <4 x i32> %b, <4 x i32>* %C
- store <4 x i32> %a, <4 x i32>* %D
+define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D,
+                 <4 x i32> *%E, <4 x i32> *%F, <4 x i32> *%G, <4 x i32> *%H,
+                 <4 x i32> *%I, <4 x i32> *%J, <4 x i32> *%K, <4 x i32> *%L,
+                 <4 x i32> *%M, <4 x i32> *%N, <4 x i32> *%O, <4 x i32> *%P,
+                 <4 x i32> *%Q, <4 x i32> *%R, <4 x i32> *%S, <4 x i32> *%T,
+                 <4 x i32> *%U, <4 x i32> *%V, <4 x i32> *%W, <4 x i32> *%X,
+                 <4 x i32> *%Y, <4 x i32> *%Z, <4 x i32> *%BA, <4 x i32> *%BB,
+                 <4 x i32> *%BC, <4 x i32> *%BD, <4 x i32> *%BE, <4 x i32> *%BF,
+                 <4 x i32> *%BG, <4 x i32> *%BH, <4 x i32> *%BI, <4 x i32> *%BJ,
+                 <4 x i32> *%BK, <4 x i32> *%BL, <4 x i32> *%BM, <4 x i32> *%BN,
+                 <4 x i32> *%BO, <4 x i32> *%BP, <4 x i32> *%BQ, <4 x i32> *%BR,
+                 <4 x i32> *%BS, <4 x i32> *%BT, <4 x i32> *%BU, <4 x i32> *%BV,
+                 <4 x i32> *%BW, <4 x i32> *%BX, <4 x i32> *%BY, <4 x i32> *%BZ,
+                 <4 x i32> *%CA, <4 x i32> *%CB, <4 x i32> *%CC, <4 x i32> *%CD,
+                 <4 x i32> *%CE, <4 x i32> *%CF, <4 x i32> *%CG, <4 x i32> *%CH,
+                 <4 x i32> *%CI, <4 x i32> *%CJ, <4 x i32> *%CK, <4 x i32> *%CL) {
+ %a = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 9, i32 87, i32 57, i32 8>
+ %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 44, i32 99, i32 49, i32 29>
+ %c = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 15, i32 18, i32 53, i32 84>
+ %d = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 29, i32 82, i32 45, i32 16>
+ %e = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 11, i32 15, i32 32, i32 99>
+ %f = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 19, i32 86, i32 29, i32 33>
+ %g = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 44, i32 10, i32 26, i32 45>
+ %h = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 88, i32 70, i32 90, i32 48>
+ %i = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 30, i32 53, i32 42, i32 12>
+ %j = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 46, i32 24, i32 93, i32 26>
+ %k = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 33, i32 99, i32 15, i32 57>
+ %l = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 51, i32 60, i32 60, i32 50>
+ %m = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 50, i32 12, i32 7, i32 45>
+ %n = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 15, i32 65, i32 36, i32 36>
+ %o = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 54, i32 0, i32 17, i32 78>
+ %p = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 56, i32 13, i32 64, i32 48>
+ %q = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> <i32 52, i32 69, i32 88, i32 11>, <4 x i32> zeroinitializer
+ %r = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> <i32 5, i32 87, i32 68, i32 14>, <4 x i32> zeroinitializer
+ %s = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> <i32 47, i32 17, i32 66, i32 63>, <4 x i32> zeroinitializer
+ %t = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 64, i32 25, i32 73, i32 81>, <4 x i32> zeroinitializer
+ %u = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> <i32 51, i32 41, i32 61, i32 63>, <4 x i32> zeroinitializer
+ %v = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> <i32 39, i32 59, i32 17, i32 0>, <4 x i32> zeroinitializer
+ %w = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 91, i32 99, i32 97, i32 29>, <4 x i32> zeroinitializer
+ %x = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> <i32 89, i32 45, i32 89, i32 10>, <4 x i32> zeroinitializer
+ %y = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 25, i32 70, i32 21, i32 27>, <4 x i32> zeroinitializer
+ %z = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> <i32 40, i32 12, i32 27, i32 88>, <4 x i32> zeroinitializer
+ %ba = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> <i32 36, i32 35, i32 90, i32 23>, <4 x i32> zeroinitializer
+ %bb = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> <i32 83, i32 3, i32 64, i32 82>, <4 x i32> zeroinitializer
+ %bc = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 15, i32 72, i32 2, i32 54>, <4 x i32> zeroinitializer
+ %bd = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> <i32 32, i32 47, i32 100, i32 84>, <4 x i32> zeroinitializer
+ %be = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> <i32 92, i32 57, i32 82, i32 1>, <4 x i32> zeroinitializer
+ %bf = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 42, i32 14, i32 22, i32 89>, <4 x i32> zeroinitializer
+ %bg = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> <i32 33, i32 10, i32 67, i32 66>, <4 x i32> <i32 42, i32 91, i32 47, i32 40>
+ %bh = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> <i32 8, i32 13, i32 48, i32 0>, <4 x i32> <i32 84, i32 66, i32 87, i32 84>
+ %bi = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> <i32 85, i32 96, i32 1, i32 94>, <4 x i32> <i32 54, i32 57, i32 7, i32 92>
+ %bj = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 55, i32 21, i32 92, i32 68>, <4 x i32> <i32 51, i32 61, i32 62, i32 39>
+ %bk = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> <i32 42, i32 18, i32 77, i32 74>, <4 x i32> <i32 82, i32 33, i32 30, i32 7>
+ %bl = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> <i32 80, i32 92, i32 61, i32 84>, <4 x i32> <i32 43, i32 89, i32 92, i32 6>
+ %bm = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 49, i32 14, i32 62, i32 62>, <4 x i32> <i32 35, i32 33, i32 92, i32 59>
+ %bn = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> <i32 3, i32 97, i32 49, i32 18>, <4 x i32> <i32 56, i32 64, i32 19, i32 75>
+ %bo = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 91, i32 57, i32 0, i32 1>, <4 x i32> <i32 43, i32 63, i32 64, i32 11>
+ %bp = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> <i32 41, i32 65, i32 18, i32 11>, <4 x i32> <i32 86, i32 26, i32 31, i32 3>
+ %bq = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> <i32 31, i32 46, i32 32, i32 68>, <4 x i32> <i32 100, i32 59, i32 62, i32 6>
+ %br = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> <i32 76, i32 67, i32 87, i32 7>, <4 x i32> <i32 63, i32 48, i32 97, i32 24>
+ %bs = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 83, i32 89, i32 19, i32 4>, <4 x i32> <i32 21, i32 2, i32 40, i32 21>
+ %bt = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> <i32 45, i32 76, i32 81, i32 100>, <4 x i32> <i32 65, i32 26, i32 100, i32 46>
+ %bu = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> <i32 16, i32 75, i32 31, i32 17>, <4 x i32> <i32 37, i32 66, i32 86, i32 65>
+ %bv = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 13, i32 25, i32 43, i32 59>, <4 x i32> <i32 82, i32 78, i32 60, i32 52>
+ %bw = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %bx = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %by = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %bz = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ca = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cb = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cc = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cd = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ce = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cf = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cg = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ch = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ci = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cj = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ck = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cl = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ store <4 x i32> %a, <4 x i32>* %A
+ store <4 x i32> %b, <4 x i32>* %B
+ store <4 x i32> %c, <4 x i32>* %C
+ store <4 x i32> %d, <4 x i32>* %D
+ store <4 x i32> %e, <4 x i32>* %E
+ store <4 x i32> %f, <4 x i32>* %F
+ store <4 x i32> %g, <4 x i32>* %G
+ store <4 x i32> %h, <4 x i32>* %H
+ store <4 x i32> %i, <4 x i32>* %I
+ store <4 x i32> %j, <4 x i32>* %J
+ store <4 x i32> %k, <4 x i32>* %K
+ store <4 x i32> %l, <4 x i32>* %L
+ store <4 x i32> %m, <4 x i32>* %M
+ store <4 x i32> %n, <4 x i32>* %N
+ store <4 x i32> %o, <4 x i32>* %O
+ store <4 x i32> %p, <4 x i32>* %P
+ store <4 x i32> %q, <4 x i32>* %Q
+ store <4 x i32> %r, <4 x i32>* %R
+ store <4 x i32> %s, <4 x i32>* %S
+ store <4 x i32> %t, <4 x i32>* %T
+ store <4 x i32> %u, <4 x i32>* %U
+ store <4 x i32> %v, <4 x i32>* %V
+ store <4 x i32> %w, <4 x i32>* %W
+ store <4 x i32> %x, <4 x i32>* %X
+ store <4 x i32> %y, <4 x i32>* %Y
+ store <4 x i32> %z, <4 x i32>* %Z
+ store <4 x i32> %ba, <4 x i32>* %BA
+ store <4 x i32> %bb, <4 x i32>* %BB
+ store <4 x i32> %bc, <4 x i32>* %BC
+ store <4 x i32> %bd, <4 x i32>* %BD
+ store <4 x i32> %be, <4 x i32>* %BE
+ store <4 x i32> %bf, <4 x i32>* %BF
+ store <4 x i32> %bg, <4 x i32>* %BG
+ store <4 x i32> %bh, <4 x i32>* %BH
+ store <4 x i32> %bi, <4 x i32>* %BI
+ store <4 x i32> %bj, <4 x i32>* %BJ
+ store <4 x i32> %bk, <4 x i32>* %BK
+ store <4 x i32> %bl, <4 x i32>* %BL
+ store <4 x i32> %bm, <4 x i32>* %BM
+ store <4 x i32> %bn, <4 x i32>* %BN
+ store <4 x i32> %bo, <4 x i32>* %BO
+ store <4 x i32> %bp, <4 x i32>* %BP
+ store <4 x i32> %bq, <4 x i32>* %BQ
+ store <4 x i32> %br, <4 x i32>* %BR
+ store <4 x i32> %bs, <4 x i32>* %BS
+ store <4 x i32> %bt, <4 x i32>* %BT
+ store <4 x i32> %bu, <4 x i32>* %BU
+ store <4 x i32> %bv, <4 x i32>* %BV
+ store <4 x i32> %bw, <4 x i32>* %BW
+ store <4 x i32> %bx, <4 x i32>* %BX
+ store <4 x i32> %by, <4 x i32>* %BY
+ store <4 x i32> %bz, <4 x i32>* %BZ
+ store <4 x i32> %ca, <4 x i32>* %CA
+ store <4 x i32> %cb, <4 x i32>* %CB
+ store <4 x i32> %cc, <4 x i32>* %CC
+ store <4 x i32> %cd, <4 x i32>* %CD
+ store <4 x i32> %ce, <4 x i32>* %CE
+ store <4 x i32> %cf, <4 x i32>* %CF
+ store <4 x i32> %cg, <4 x i32>* %CG
+ store <4 x i32> %ch, <4 x i32>* %CH
+ store <4 x i32> %ci, <4 x i32>* %CI
+ store <4 x i32> %cj, <4 x i32>* %CJ
+ store <4 x i32> %ck, <4 x i32>* %CK
+ store <4 x i32> %cl, <4 x i32>* %CL
  ret void
 }
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index eaff87d695ed9..8e064a4f2fc94 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -659,3 +659,21 @@ define i1 @test64(i8 %a, i32 %b) nounwind {
 ; CHECK-NEXT: %c = icmp eq i8 %1, %a
 ; CHECK-NEXT: ret i1 %c
 }
+
+define i1 @test65(i64 %A, i64 %B) {
+  %s1 = add i64 %A, %B
+  %s2 = add i64 %A, %B
+  %cmp = icmp eq i64 %s1, %s2
+; CHECK: @test65
+; CHECK-NEXT: ret i1 true
+  ret i1 %cmp
+}
+
+define i1 @test66(i64 %A, i64 %B) {
+  %s1 = add i64 %A, %B
+  %s2 = add i64 %B, %A
+  %cmp = icmp eq i64 %s1, %s2
+; CHECK: @test66
+; CHECK-NEXT: ret i1 true
+  ret i1 %cmp
+}
diff --git a/test/Transforms/InstCombine/memcmp-1.ll b/test/Transforms/InstCombine/memcmp-1.ll
new file mode 100644
index 0000000000000..4238c5f8fb153
--- /dev/null
+++ b/test/Transforms/InstCombine/memcmp-1.ll
@@ -0,0 +1,72 @@
+; Test that the memcmp library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@foo = constant [4 x i8] c"foo\00"
+@hel = constant [4 x i8] c"hel\00"
+@hello_u = constant [8 x i8] c"hello_u\00"
+
+declare i32 @memcmp(i8*, i8*, i32)
+
+; Check memcmp(mem, mem, size) -> 0.
+
+define i32 @test_simplify1(i8* %mem, i32 %size) {
+; CHECK: @test_simplify1
+  %ret = call i32 @memcmp(i8* %mem, i8* %mem, i32 %size)
+  ret i32 %ret
+; CHECK: ret i32 0
+}
+
+; Check memcmp(mem1, mem2, 0) -> 0.
+
+define i32 @test_simplify2(i8* %mem1, i8* %mem2) {
+; CHECK: @test_simplify2
+  %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 0)
+  ret i32 %ret
+; CHECK: ret i32 0
+}
+
+; Check memcmp(mem1, mem2, 1) -> *(unsigned char*)mem1 - *(unsigned char*)mem2.
+
+define i32 @test_simplify3(i8* %mem1, i8* %mem2) {
+; CHECK: @test_simplify3
+  %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 1)
+; CHECK: [[LOAD1:%[a-z]+]] = load i8* %mem1, align 1
+; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
+; CHECK: [[LOAD2:%[a-z]+]] = load i8* %mem2, align 1
+; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
+; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+  ret i32 %ret
+; CHECK: ret i32 [[RET]]
+}
+
+; Check memcmp(mem1, mem2, size) -> constant, where all arguments are constants.
+
+define i32 @test_simplify4() {
+; CHECK: @test_simplify4
+  %mem1 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+  %mem2 = getelementptr [8 x i8]* @hello_u, i32 0, i32 0
+  %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+  ret i32 %ret
+; CHECK: ret i32 0
+}
+
+define i32 @test_simplify5() {
+; CHECK: @test_simplify5
+  %mem1 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+  %mem2 = getelementptr [4 x i8]* @foo, i32 0, i32 0
+  %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+  ret i32 %ret
+; CHECK: ret i32 {{[0-9]+}}
+}
+
+define i32 @test_simplify6() {
+; CHECK: @test_simplify6
+  %mem1 = getelementptr [4 x i8]* @foo, i32 0, i32 0
+  %mem2 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+  %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+  ret i32 %ret
+; CHECK: ret i32 {{-[0-9]+}}
+}
diff --git a/test/Transforms/InstCombine/memcmp-2.ll b/test/Transforms/InstCombine/memcmp-2.ll
new file mode 100644
index 0000000000000..3796117bc24c6
--- /dev/null
+++ b/test/Transforms/InstCombine/memcmp-2.ll
@@ -0,0 +1,17 @@
+; Test that the memcmp library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i32* @memcmp(i8*, i8*, i32)
+
+; Check that memcmp functions with the wrong prototype aren't simplified.
+
+define i32* @test_no_simplify1(i8* %mem, i32 %size) {
+; CHECK: @test_no_simplify1
+  %ret = call i32* @memcmp(i8* %mem, i8* %mem, i32 %size)
+; CHECK-NEXT: call i32* @memcmp
+  ret i32* %ret
+; CHECK-NEXT: ret i32* %ret
+}
diff --git a/test/Transforms/InstCombine/memcpy-1.ll b/test/Transforms/InstCombine/memcpy-1.ll
new file mode 100644
index 0000000000000..65b79ad03df4b
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy-1.ll
@@ -0,0 +1,17 @@
+; Test that the memcpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memcpy(i8*, i8*, i32)
+
+; Check memcpy(mem1, mem2, size) -> llvm.memcpy(mem1, mem2, size, 1).
+
+define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_simplify1
+  %ret = call i8* @memcpy(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call void @llvm.memcpy
+  ret i8* %ret
+; CHECK: ret i8* %mem1
+}
diff --git a/test/Transforms/InstCombine/memcpy-2.ll b/test/Transforms/InstCombine/memcpy-2.ll
new file mode 100644
index 0000000000000..4a8a02018f5e7
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy-2.ll
@@ -0,0 +1,17 @@
+; Test that the memcpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memcpy(i8*, i8*, i32)
+
+; Check that memcpy functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_no_simplify1
+  %ret = call i8 @memcpy(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call i8 @memcpy
+  ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/ScalarRepl/memcpy-from-global.ll b/test/Transforms/InstCombine/memcpy-from-global.ll
index 5557a8fd87540..83c893e17dd6d 100644
--- a/test/Transforms/ScalarRepl/memcpy-from-global.ll
+++ b/test/Transforms/InstCombine/memcpy-from-global.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalarrepl -S | FileCheck %s
+; RUN: opt < %s -instcombine -S | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 @C.0.1248 = internal constant [128 x float] [ float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 
1.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 ], align 32		; <[128 x float]*> [#uses=1]
 
@@ -6,13 +6,11 @@ define float @test1(i32 %hash, float %x, float %y, float %z, float %w) {
 entry:
 	%lookupTable = alloca [128 x float], align 16		; <[128 x float]*> [#uses=5]
 	%lookupTable1 = bitcast [128 x float]* %lookupTable to i8*		; <i8*> [#uses=1]
-	call void @llvm.memcpy.i32( i8* %lookupTable1, i8* bitcast ([128 x float]* @C.0.1248 to i8*), i32 512, i32 16 )
+	call void @llvm.memcpy.p0i8.p0i8.i64(i8* %lookupTable1, i8* bitcast ([128 x float]* @C.0.1248 to i8*), i64 512, i32 16, i1 false)
         
 ; CHECK: @test1
 ; CHECK-NOT: alloca
 ; CHECK-NOT: call{{.*}}@llvm.memcpy
-; CHECK: %lookupTable1 = bitcast [128 x float]* @C.0.1248 to i8*
-; CHECK-NOT: call{{.*}}@llvm.memcpy
         
 	%tmp3 = shl i32 %hash, 2		; <i32> [#uses=1]
 	%tmp5 = and i32 %tmp3, 124		; <i32> [#uses=4]
@@ -38,10 +36,6 @@ entry:
 	ret float %tmp43
 }
 
-declare void @llvm.memcpy.i32(i8*, i8*, i32, i32)
-
-
-
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
 
 %T = type { i8, [123 x i8] }
@@ -59,10 +53,11 @@ define void @test2() {
 ; CHECK: @test2
 
 ; %A alloca is deleted
-; CHECK-NEXT: %B = alloca %T
+; CHECK-NEXT: alloca [124 x i8]
+; CHECK-NEXT: getelementptr inbounds [124 x i8]*
 
 ; use @G instead of %A
-; CHECK-NEXT: %a = bitcast %T* @G to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* getelementptr inbounds (%T* @G, i64 0, i32 0)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* %a, i64 124, i32 4, i1 false)
   call void @bar(i8* %b)
@@ -79,8 +74,7 @@ define void @test3() {
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
   call void @bar(i8* %a) readonly
 ; CHECK: @test3
-; CHECK-NEXT: %a = bitcast %T* @G to i8*
-; CHECK-NEXT: call void @bar(i8* %a)
+; CHECK-NEXT: call void @bar(i8* getelementptr inbounds (%T* @G, i64 0, i32 0))
   ret void
 }
 
@@ -90,8 +84,7 @@ define void @test4() {
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
   call void @baz(i8* byval %a) 
 ; CHECK: @test4
-; CHECK-NEXT: %a = bitcast %T* @G to i8*
-; CHECK-NEXT: call void @baz(i8* byval %a)
+; CHECK-NEXT: call void @baz(i8* byval getelementptr inbounds (%T* @G, i64 0, i32 0))
   ret void
 }
 
@@ -103,8 +96,7 @@ define void @test5() {
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
   call void @baz(i8* byval %a) 
 ; CHECK: @test5
-; CHECK-NEXT: %a = bitcast %T* @G to i8*
-; CHECK-NEXT: call void @baz(i8* byval %a)
+; CHECK-NEXT: call void @baz(i8* byval getelementptr inbounds (%T* @G, i64 0, i32 0))
   ret void
 }
 
@@ -118,8 +110,7 @@ define void @test6() {
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([2 x %U]* @H to i8*), i64 20, i32 16, i1 false)
   call void @bar(i8* %a) readonly
 ; CHECK: @test6
-; CHECK-NEXT: %a = bitcast
-; CHECK-NEXT: call void @bar(i8* %a)
+; CHECK-NEXT: call void @bar(i8* bitcast ([2 x %U]* @H to i8*))
   ret void
 }
 
@@ -129,8 +120,7 @@ define void @test7() {
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%U* getelementptr ([2 x %U]* @H, i64 0, i32 0) to i8*), i64 20, i32 4, i1 false)
   call void @bar(i8* %a) readonly
 ; CHECK: @test7
-; CHECK-NEXT: %a = bitcast
-; CHECK-NEXT: call void @bar(i8* %a)
+; CHECK-NEXT: call void @bar(i8* bitcast ([2 x %U]* @H to i8*))
   ret void
 }
 
diff --git a/test/Transforms/InstCombine/memcpy_chk-1.ll b/test/Transforms/InstCombine/memcpy_chk-1.ll
new file mode 100644
index 0000000000000..7c7d91808a374
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy_chk-1.ll
@@ -0,0 +1,60 @@
+; Test lib call simplification of __memcpy_chk calls with various values
+; for dstlen and len.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T3 = type { [100 x i32], [100 x i32], [2048 x i8] }
+
+@t1 = common global %struct.T1 zeroinitializer
+@t2 = common global %struct.T2 zeroinitializer
+@t3 = common global %struct.T3 zeroinitializer
+
+; Check cases where dstlen >= len.
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+  %dst = bitcast %struct.T1* @t1 to i8*
+  %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
+  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+  %dst = bitcast %struct.T1* @t1 to i8*
+  %src = bitcast %struct.T3* @t3 to i8*
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
+  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
+  ret void
+}
+
+; Check cases where dstlen < len.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %dst = bitcast %struct.T3* @t3 to i8*
+  %src = bitcast %struct.T1* @t1 to i8*
+
+; CHECK-NEXT: call i8* @__memcpy_chk
+  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
+  ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+  %dst = bitcast %struct.T1* @t1 to i8*
+  %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call i8* @__memcpy_chk
+  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1024, i64 0)
+  ret void
+}
+
+declare i8* @__memcpy_chk(i8*, i8*, i64, i64)
diff --git a/test/Transforms/InstCombine/memcpy_chk-2.ll b/test/Transforms/InstCombine/memcpy_chk-2.ll
new file mode 100644
index 0000000000000..aa43029d47fc2
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy_chk-2.ll
@@ -0,0 +1,24 @@
+; Test that lib call simplification doesn't simplify __memcpy_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] }
+
+@t1 = common global %struct.T1 zeroinitializer
+@t2 = common global %struct.T2 zeroinitializer
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+  %dst = bitcast %struct.T1* @t1 to i8*
+  %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call i8* @__memcpy_chk
+  call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824)
+  ret void
+}
+
+declare i8* @__memcpy_chk(i8*, i8*, i64)
diff --git a/test/Transforms/InstCombine/memmove-1.ll b/test/Transforms/InstCombine/memmove-1.ll
new file mode 100644
index 0000000000000..53f2f116c7775
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove-1.ll
@@ -0,0 +1,17 @@
+; Test that the memmove library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memmove(i8*, i8*, i32)
+
+; Check memmove(mem1, mem2, size) -> llvm.memmove(mem1, mem2, size, 1).
+
+define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_simplify1
+  %ret = call i8* @memmove(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call void @llvm.memmove
+  ret i8* %ret
+; CHECK: ret i8* %mem1
+}
diff --git a/test/Transforms/InstCombine/memmove-2.ll b/test/Transforms/InstCombine/memmove-2.ll
new file mode 100644
index 0000000000000..23887bce31d8c
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove-2.ll
@@ -0,0 +1,17 @@
+; Test that the memmove library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memmove(i8*, i8*, i32)
+
+; Check that memmove functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_no_simplify1
+  %ret = call i8 @memmove(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call i8 @memmove
+  ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/memmove_chk-1.ll b/test/Transforms/InstCombine/memmove_chk-1.ll
new file mode 100644
index 0000000000000..f9ff9a103a305
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove_chk-1.ll
@@ -0,0 +1,60 @@
+; Test lib call simplification of __memmove_chk calls with various values
+; for dstlen and len.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T3 = type { [100 x i32], [100 x i32], [2048 x i8] }
+
+@t1 = common global %struct.T1 zeroinitializer
+@t2 = common global %struct.T2 zeroinitializer
+@t3 = common global %struct.T3 zeroinitializer
+
+; Check cases where dstlen >= len.
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+  %dst = bitcast %struct.T1* @t1 to i8*
+  %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64
+  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+  %dst = bitcast %struct.T1* @t1 to i8*
+  %src = bitcast %struct.T3* @t3 to i8*
+
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64
+  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
+  ret void
+}
+
+; Check cases where dstlen < len.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %dst = bitcast %struct.T3* @t3 to i8*
+  %src = bitcast %struct.T1* @t1 to i8*
+
+; CHECK-NEXT: call i8* @__memmove_chk
+  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
+  ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+  %dst = bitcast %struct.T1* @t1 to i8*
+  %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call i8* @__memmove_chk
+  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1024, i64 0)
+  ret void
+}
+
+declare i8* @__memmove_chk(i8*, i8*, i64, i64)
diff --git a/test/Transforms/InstCombine/memmove_chk-2.ll b/test/Transforms/InstCombine/memmove_chk-2.ll
new file mode 100644
index 0000000000000..f0a915fde2e93
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove_chk-2.ll
@@ -0,0 +1,24 @@
+; Test that lib call simplification doesn't simplify __memmove_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] }
+
+@t1 = common global %struct.T1 zeroinitializer
+@t2 = common global %struct.T2 zeroinitializer
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+  %dst = bitcast %struct.T1* @t1 to i8*
+  %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call i8* @__memmove_chk
+  call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824) ; three arguments only (no dstlen), matching the wrong prototype declared below
+  ret void
+}
+
+declare i8* @__memmove_chk(i8*, i8*, i64)
diff --git a/test/Transforms/InstCombine/memset-1.ll b/test/Transforms/InstCombine/memset-1.ll
new file mode 100644
index 0000000000000..48b433e137c08
--- /dev/null
+++ b/test/Transforms/InstCombine/memset-1.ll
@@ -0,0 +1,17 @@
+; Test that the memset library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memset(i8*, i32, i32)
+
+; Check memset(mem1, val, size) -> llvm.memset(mem1, val, size, 1).
+
+define i8* @test_simplify1(i8* %mem, i32 %val, i32 %size) {
+; CHECK: @test_simplify1
+  %ret = call i8* @memset(i8* %mem, i32 %val, i32 %size)
+; CHECK: call void @llvm.memset
+  ret i8* %ret
+; CHECK: ret i8* %mem
+}
diff --git a/test/Transforms/InstCombine/memset-2.ll b/test/Transforms/InstCombine/memset-2.ll
new file mode 100644
index 0000000000000..8a9033302d044
--- /dev/null
+++ b/test/Transforms/InstCombine/memset-2.ll
@@ -0,0 +1,17 @@
+; Test that the memset library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memset(i8*, i32, i32)
+
+; Check that memset functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem, i32 %val, i32 %size) {
+; CHECK: @test_no_simplify1
+  %ret = call i8 @memset(i8* %mem, i32 %val, i32 %size)
+; CHECK: call i8 @memset
+  ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/memset_chk-1.ll b/test/Transforms/InstCombine/memset_chk-1.ll
new file mode 100644
index 0000000000000..be4c1cfccdb28
--- /dev/null
+++ b/test/Transforms/InstCombine/memset_chk-1.ll
@@ -0,0 +1,61 @@
+; Test lib call simplification of __memset_chk calls with various values
+; for dstlen and len.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; rdar://7719085
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T = type { [100 x i32], [100 x i32], [1024 x i8] }
+@t = common global %struct.T zeroinitializer
+
+; Check cases where dstlen >= len.
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+  %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 1824) ; len = 1824 (400 + 400 + 1024, the size of T), dstlen = 1824: dstlen == len
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+  %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 3648) ; len = 1824, dstlen = 3648 (twice len): dstlen > len
+  ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+  %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 -1) ; len = 1824, dstlen = -1 (all bits set); grouped with the dstlen >= len cases, so simplification is expected
+  ret void
+}
+
+; Check cases where dstlen < len.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call i8* @__memset_chk
+  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 400) ; len = 1824, dstlen = 400: dstlen < len, so the checked call is expected to remain
+  ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+  %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call i8* @__memset_chk
+  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 0) ; len = 1824, dstlen = 0: dstlen < len, so the checked call is expected to remain
+  ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
diff --git a/test/Transforms/InstCombine/memset_chk-2.ll b/test/Transforms/InstCombine/memset_chk-2.ll
new file mode 100644
index 0000000000000..60fbf163c212f
--- /dev/null
+++ b/test/Transforms/InstCombine/memset_chk-2.ll
@@ -0,0 +1,20 @@
+; Test that lib call simplification doesn't simplify __memset_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T = type { [100 x i32], [100 x i32], [1024 x i8] }
+@t = common global %struct.T zeroinitializer
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+  %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call i8* @__memset_chk
+  call i8* @__memset_chk(i8* %dst, i32 0, i64 1824) ; three arguments only (no dstlen), matching the wrong prototype declared below
+  ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64)
diff --git a/test/Transforms/InstCombine/memset_chk.ll b/test/Transforms/InstCombine/memset_chk.ll
deleted file mode 100644
index 58ecda582fd17..0000000000000
--- a/test/Transforms/InstCombine/memset_chk.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-; rdar://7719085
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-
-%struct.data = type { [100 x i32], [100 x i32], [1024 x i8] }
-
-define i32 @t() nounwind ssp {
-; CHECK: @t
-; CHECK: @llvm.memset.p0i8.i64
-entry:
-  %0 = alloca %struct.data, align 8               ; <%struct.data*> [#uses=1]
-  %1 = bitcast %struct.data* %0 to i8*            ; <i8*> [#uses=1]
-  %2 = call i8* @__memset_chk(i8* %1, i32 0, i64 1824, i64 1824) nounwind ; <i8*> [#uses=0]
-  ret i32 0
-}
-
-declare i8* @__memset_chk(i8*, i32, i64, i64) nounwind
diff --git a/test/Transforms/InstCombine/obfuscated_splat.ll b/test/Transforms/InstCombine/obfuscated_splat.ll
new file mode 100644
index 0000000000000..c25dade168a42
--- /dev/null
+++ b/test/Transforms/InstCombine/obfuscated_splat.ll
@@ -0,0 +1,11 @@
+; RUN: opt -instcombine -S %s | FileCheck %s
+
+define void @test(<4 x float> *%in_ptr, <4 x float> *%out_ptr) {
+  %A = load <4 x float>* %in_ptr, align 16
+  %B = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+  %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
+  %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK:  %D = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> zeroinitializer
+  store <4 x float> %D, <4 x float> *%out_ptr
+  ret void
+}
diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll
index d7e292155cd79..31a3cb46e4595 100644
--- a/test/Transforms/InstCombine/objsize.ll
+++ b/test/Transforms/InstCombine/objsize.ll
@@ -247,7 +247,8 @@ entry:
 
 ; technically reachable, but this malformed IR may appear as a result of constant propagation
 xpto:
-  %gep = getelementptr i8* %gep, i32 1
+  %gep2 = getelementptr i8* %gep, i32 1
+  %gep = getelementptr i8* %gep2, i32 1
   %o = call i32 @llvm.objectsize.i32(i8* %gep, i1 true)
 ; CHECK: ret i32 undef
   ret i32 %o
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index 4baae2618dde2..cc3aacdce3c87 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -829,3 +829,37 @@ define i1 @test63(i1 %A, i1 %B) {
 ; CHECK: %C = or i1 %B, %not
 ; CHECK: ret i1 %C
 }
+
+; PR14131
+define void @test64(i32 %p, i16 %b) noreturn nounwind {
+entry:
+  %p.addr.0.insert.mask = and i32 %p, -65536
+  %conv2 = and i32 %p, 65535
+  br i1 undef, label %lor.rhs, label %lor.end
+
+lor.rhs:
+  %p.addr.0.extract.trunc = trunc i32 %p.addr.0.insert.mask to i16
+  %phitmp = zext i16 %p.addr.0.extract.trunc to i32
+  br label %lor.end
+
+lor.end:
+  %t.1 = phi i32 [ 0, %entry ], [ %phitmp, %lor.rhs ]
+  %conv6 = zext i16 %b to i32
+  %div = udiv i32 %conv6, %t.1
+  %tobool8 = icmp eq i32 %div, 0
+  %cmp = icmp eq i32 %t.1, 0
+  %cmp12 = icmp ult i32 %conv2, 2
+  %cmp.sink = select i1 %tobool8, i1 %cmp12, i1 %cmp
+  br i1 %cmp.sink, label %cond.end17, label %cond.false16
+
+cond.false16:
+  br label %cond.end17
+
+cond.end17:
+  br label %while.body
+
+while.body:
+  br label %while.body
+; CHECK: @test64
+; CHECK-NOT: select
+}
diff --git a/test/Transforms/InstCombine/stpcpy-1.ll b/test/Transforms/InstCombine/stpcpy-1.ll
new file mode 100644
index 0000000000000..8b6bb0e0d509b
--- /dev/null
+++ b/test/Transforms/InstCombine/stpcpy-1.ll
@@ -0,0 +1,46 @@
+; Test that the stpcpy library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+@b = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @stpcpy(i8*, i8*)
+
+define i8* @test_simplify1() {
+; CHECK: @test_simplify1
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+  %ret = call i8* @stpcpy(i8* %dst, i8* %src)
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32
+; CHECK-NEXT: getelementptr inbounds ([32 x i8]* @a, i32 0, i32 5)
+  ret i8* %ret
+}
+
+define i8* @test_simplify2() {
+; CHECK: @test_simplify2
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+
+  %ret = call i8* @stpcpy(i8* %dst, i8* %dst)
+; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen
+; CHECK-NEXT: getelementptr inbounds [32 x i8]* @a, i32 0, i32 [[LEN]]
+  ret i8* %ret
+}
+
+define i8* @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [32 x i8]* @b, i32 0, i32 0
+
+  %ret = call i8* @stpcpy(i8* %dst, i8* %src)
+; CHECK: call i8* @stpcpy
+  ret i8* %ret
+}
diff --git a/test/Transforms/InstCombine/stpcpy-2.ll b/test/Transforms/InstCombine/stpcpy-2.ll
new file mode 100644
index 0000000000000..2e92c0895ed48
--- /dev/null
+++ b/test/Transforms/InstCombine/stpcpy-2.ll
@@ -0,0 +1,22 @@
+; Test that the stpcpy library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @stpcpy(i8*, i8*)
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+  call i16* @stpcpy(i8* %dst, i8* %src)
+; CHECK: call i16* @stpcpy
+  ret void
+}
diff --git a/test/Transforms/InstCombine/stpcpy_chk-1.ll b/test/Transforms/InstCombine/stpcpy_chk-1.ll
new file mode 100644
index 0000000000000..05603918c6424
--- /dev/null
+++ b/test/Transforms/InstCombine/stpcpy_chk-1.ll
@@ -0,0 +1,96 @@
+; Test lib call simplification of __stpcpy_chk calls with various values
+; for src, dst, and slen.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i8] zeroinitializer, align 1
+@b = common global [60 x i8] zeroinitializer, align 1
+@.str = private constant [12 x i8] c"abcdefghijk\00"
+
+; Check cases where slen >= strlen (src).
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 60)
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 12)
+  ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
+  ret void
+}
+
+; Check cases where there are no string constants.
+
+define void @test_simplify4() {
+; CHECK: @test_simplify4
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @stpcpy
+  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
+  ret void
+}
+
+; Check case where slen is not a constant (it comes from llvm.objectsize).
+
+define i8* @test_simplify5() {
+; CHECK: @test_simplify5
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK: @__memcpy_chk
+  %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+  %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 %len)
+; CHECK: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
+  ret i8* %ret
+}
+
+; Check case where the source and destination are the same.
+
+define i8* @test_simplify6() {
+; CHECK: @test_simplify6
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+
+; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen
+; CHECK-NEXT: getelementptr inbounds [60 x i8]* @a, i32 0, i32 [[LEN]]
+  %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+  %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %dst, i32 %len)
+  ret i8* %ret
+}
+
+; Check case where slen < strlen (src).
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @__stpcpy_chk
+  call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 8)
+  ret void
+}
+
+declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
+declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
diff --git a/test/Transforms/InstCombine/stpcpy_chk-2.ll b/test/Transforms/InstCombine/stpcpy_chk-2.ll
new file mode 100644
index 0000000000000..46c2139276e21
--- /dev/null
+++ b/test/Transforms/InstCombine/stpcpy_chk-2.ll
@@ -0,0 +1,21 @@
+; Test that lib call simplification doesn't simplify __stpcpy_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i16] zeroinitializer, align 1
+@.str = private constant [8 x i8] c"abcdefg\00"
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+  %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0 ; i16* destination: deliberately not the i8* used by the real prototype
+  %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call i16* @__stpcpy_chk
+  call i16* @__stpcpy_chk(i16* %dst, i8* %src, i32 8) ; must be left untouched because the prototype is wrong
+  ret void
+}
+
+declare i16* @__stpcpy_chk(i16*, i8*, i32) ; wrong prototype on purpose: i16* where i8* belongs
diff --git a/test/Transforms/InstCombine/strcat-1.ll b/test/Transforms/InstCombine/strcat-1.ll
new file mode 100644
index 0000000000000..3c05d6b06fa01
--- /dev/null
+++ b/test/Transforms/InstCombine/strcat-1.ll
@@ -0,0 +1,38 @@
+; Test that the strcat libcall simplifier works correctly per the
+; bug found in PR3661.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@null = constant [1 x i8] zeroinitializer
+@null_hello = constant [7 x i8] c"\00hello\00"
+
+declare i8* @strcat(i8*, i8*)
+declare i32 @puts(i8*)
+
+define i32 @main() {
+; CHECK: @main
+; CHECK-NOT: call i8* @strcat
+; CHECK: call i32 @puts
+
+  %target = alloca [1024 x i8]
+  %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
+  store i8 0, i8* %arg1
+
+  ; rslt1 = strcat(target, "hello\00")
+  %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  %rslt1 = call i8* @strcat(i8* %arg1, i8* %arg2)
+
+  ; rslt2 = strcat(rslt1, "\00")
+  %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0
+  %rslt2 = call i8* @strcat(i8* %rslt1, i8* %arg3)
+
+  ; rslt3 = strcat(rslt2, "\00hello\00")
+  %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0
+  %rslt3 = call i8* @strcat(i8* %rslt2, i8* %arg4)
+
+  call i32 @puts( i8* %rslt3 )
+  ret i32 0
+}
diff --git a/test/Transforms/InstCombine/strcat-2.ll b/test/Transforms/InstCombine/strcat-2.ll
new file mode 100644
index 0000000000000..379ee74953172
--- /dev/null
+++ b/test/Transforms/InstCombine/strcat-2.ll
@@ -0,0 +1,32 @@
+; Test that the strcat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@empty = constant [1 x i8] c"\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strcat(i8*, i8*)
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+; CHECK-NOT: call i8* @strcat
+; CHECK: ret void
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  call i8* @strcat(i8* %dst, i8* %src)
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+; CHECK-NEXT: ret void
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [1 x i8]* @empty, i32 0, i32 0
+  call i8* @strcat(i8* %dst, i8* %src)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strcat-3.ll b/test/Transforms/InstCombine/strcat-3.ll
new file mode 100644
index 0000000000000..15aff2f1aa289
--- /dev/null
+++ b/test/Transforms/InstCombine/strcat-3.ll
@@ -0,0 +1,22 @@
+; Test that the strcat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@empty = constant [1 x i8] c"\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @strcat(i8*, i8*)
+
+define void @test_nosimplify1() {
+; CHECK: @test_nosimplify1
+; CHECK: call i16* @strcat
+; CHECK: ret void
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  call i16* @strcat(i8* %dst, i8* %src)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strchr-1.ll b/test/Transforms/InstCombine/strchr-1.ll
new file mode 100644
index 0000000000000..5efab9ec4bee0
--- /dev/null
+++ b/test/Transforms/InstCombine/strchr-1.ll
@@ -0,0 +1,54 @@
+; Test that the strchr library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [14 x i8] c"hello world\5Cn\00"
+@null = constant [1 x i8] zeroinitializer
+@chp = global i8* zeroinitializer
+
+declare i8* @strchr(i8*, i32)
+
+define void @test_simplify1() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 6)
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+  %str = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8* @strchr(i8* %str, i32 119)
+  store i8* %dst, i8** @chp
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: store i8* null, i8** @chp, align 4
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+  %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+  %dst = call i8* @strchr(i8* %str, i32 119)
+  store i8* %dst, i8** @chp
+  ret void
+}
+
+define void @test_simplify3() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 13)
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+  %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8* @strchr(i8* %src, i32 0)
+  store i8* %dst, i8** @chp
+  ret void
+}
+
+define void @test_simplify4(i32 %chr) {
+; CHECK: call i8* @memchr
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+  %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8* @strchr(i8* %src, i32 %chr)
+  store i8* %dst, i8** @chp
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strchr-2.ll b/test/Transforms/InstCombine/strchr-2.ll
new file mode 100644
index 0000000000000..35bbd23e6d4d1
--- /dev/null
+++ b/test/Transforms/InstCombine/strchr-2.ll
@@ -0,0 +1,21 @@
+; Test that the strchr libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [14 x i8] c"hello world\5Cn\00"
+@chr = global i8 zeroinitializer
+
+declare i8 @strchr(i8*, i32)
+
+define void @test_nosimplify1() {
+; CHECK: @test_nosimplify1
+; CHECK: call i8 @strchr
+; CHECK: ret void
+
+  %str = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8 @strchr(i8* %str, i32 119) ; declared to return i8 rather than a pointer; the call is expected to be left alone
+  store i8 %dst, i8* @chr
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strcmp-1.ll b/test/Transforms/InstCombine/strcmp-1.ll
new file mode 100644
index 0000000000000..0679246e0915b
--- /dev/null
+++ b/test/Transforms/InstCombine/strcmp-1.ll
@@ -0,0 +1,82 @@
+; Test that the strcmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@hell = constant [5 x i8] c"hell\00"
+@bell = constant [5 x i8] c"bell\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i32 @strcmp(i8*, i8*)
+
+; strcmp("", x) -> -*x
+define i32 @test1(i8* %str2) {
+; CHECK: @test1
+; CHECK: %strcmpload = load i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: %2 = sub i32 0, %1
+; CHECK: ret i32 %2
+
+  %str1 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+
+}
+
+; strcmp(x, "") -> *x
+define i32 @test2(i8* %str1) {
+; CHECK: @test2
+; CHECK: %strcmpload = load i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: ret i32 %1
+
+  %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+; strcmp(x, y)  -> cnst
+define i32 @test3() {
+; CHECK: @test3
+; CHECK: ret i32 -1
+
+  %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = call i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+define i32 @test4() {
+; CHECK: @test4
+; CHECK: ret i32 1
+
+  %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+; strcmp(x, y)   -> memcmp(x, y, <known length>)
+; (This transform is rather difficult to trigger in a useful manner)
+define i32 @test5(i1 %b) {
+; CHECK: @test5
+; CHECK: %memcmp = call i32 @memcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %str2, i32 5)
+; CHECK: ret i32 %memcmp
+
+  %str1 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+  %temp2 = getelementptr inbounds [5 x i8]* @bell, i32 0, i32 0
+  %str2 = select i1 %b, i8* %temp1, i8* %temp2
+  %temp3 = call i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp3
+}
+
+; strcmp(x,x)  -> 0
+define i32 @test6(i8* %str) {
+; CHECK: @test6
+; CHECK: ret i32 0
+
+  %temp1 = call i32 @strcmp(i8* %str, i8* %str)
+  ret i32 %temp1
+}
diff --git a/test/Transforms/InstCombine/strcmp-2.ll b/test/Transforms/InstCombine/strcmp-2.ll
new file mode 100644
index 0000000000000..20518960f302e
--- /dev/null
+++ b/test/Transforms/InstCombine/strcmp-2.ll
@@ -0,0 +1,20 @@
+; Test that the strcmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@hell = constant [5 x i8] c"hell\00"
+
+declare i16 @strcmp(i8*, i8*)
+
+define i16 @test_nosimplify() {
+; CHECK: @test_nosimplify
+; CHECK: call i16 @strcmp
+; CHECK: ret i16 %temp1
+
+  %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = call i16 @strcmp(i8* %str1, i8* %str2)
+  ret i16 %temp1
+}
diff --git a/test/Transforms/InstCombine/strcpy-1.ll b/test/Transforms/InstCombine/strcpy-1.ll
new file mode 100644
index 0000000000000..b6cf048b2a81c
--- /dev/null
+++ b/test/Transforms/InstCombine/strcpy-1.ll
@@ -0,0 +1,45 @@
+; Test that the strcpy library call simplifier works correctly.
+; rdar://6839935
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+@b = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strcpy(i8*, i8*)
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+  call i8* @strcpy(i8* %dst, i8* %src)
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32
+  ret void
+}
+
+define i8* @test_simplify2() {
+; CHECK: @test_simplify2
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+
+  %ret = call i8* @strcpy(i8* %dst, i8* %dst)
+; CHECK: ret i8* getelementptr inbounds ([32 x i8]* @a, i32 0, i32 0)
+  ret i8* %ret
+}
+
+define i8* @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [32 x i8]* @b, i32 0, i32 0
+
+  %ret = call i8* @strcpy(i8* %dst, i8* %src)
+; CHECK: call i8* @strcpy
+  ret i8* %ret
+}
diff --git a/test/Transforms/InstCombine/strcpy-2.ll b/test/Transforms/InstCombine/strcpy-2.ll
new file mode 100644
index 0000000000000..779e9fdd9598d
--- /dev/null
+++ b/test/Transforms/InstCombine/strcpy-2.ll
@@ -0,0 +1,22 @@
+; Test that the strcpy library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @strcpy(i8*, i8*)
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+  call i16* @strcpy(i8* %dst, i8* %src)
+; CHECK: call i16* @strcpy
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll
new file mode 100644
index 0000000000000..3e48f4fd3057a
--- /dev/null
+++ b/test/Transforms/InstCombine/strcpy_chk-1.ll
@@ -0,0 +1,94 @@
+; Test lib call simplification of __strcpy_chk calls with various values
+; for src, dst, and slen.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i8] zeroinitializer, align 1
+@b = common global [60 x i8] zeroinitializer, align 1
+@.str = private constant [12 x i8] c"abcdefghijk\00"
+
+; Check cases where slen >= strlen (src).
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60)
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12)
+  ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
+  ret void
+}
+
+; Check cases where there are no string constants.
+
+define void @test_simplify4() {
+; CHECK: @test_simplify4
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @strcpy
+  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
+  ret void
+}
+
+; Check case where slen is not a constant (it comes from llvm.objectsize).
+
+define void @test_simplify5() {
+; CHECK: @test_simplify5
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK: @__memcpy_chk
+  %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len)
+  ret void
+}
+
+; Check case where the source and destination are the same.
+
+define i8* @test_simplify6() {
+; CHECK: @test_simplify6
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+
+; CHECK: getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+  %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+  %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len)
+  ret i8* %ret
+}
+
+; Check case where slen < strlen (src).
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @__strcpy_chk
+  call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8)
+  ret void
+}
+
+declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
+declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
diff --git a/test/Transforms/InstCombine/strcpy_chk-2.ll b/test/Transforms/InstCombine/strcpy_chk-2.ll
new file mode 100644
index 0000000000000..d76ea5d068bcd
--- /dev/null
+++ b/test/Transforms/InstCombine/strcpy_chk-2.ll
@@ -0,0 +1,21 @@
+; Test that lib call simplification doesn't simplify __strcpy_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i16] zeroinitializer, align 1
+@.str = private constant [8 x i8] c"abcdefg\00"
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+  %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call i16* @__strcpy_chk
+  call i16* @__strcpy_chk(i16* %dst, i8* %src, i32 8)
+  ret void
+}
+
+declare i16* @__strcpy_chk(i16*, i8*, i32)
diff --git a/test/Transforms/InstCombine/strcpy_chk.ll b/test/Transforms/InstCombine/strcpy_chk.ll
deleted file mode 100644
index 8835a0ba467cd..0000000000000
--- a/test/Transforms/InstCombine/strcpy_chk.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-@a = common global [60 x i8] zeroinitializer, align 1 ; <[60 x i8]*> [#uses=1]
-@.str = private constant [8 x i8] c"abcdefg\00"   ; <[8 x i8]*> [#uses=1]
-
-define i8* @foo() nounwind {
-; CHECK: @foo
-; CHECK-NEXT: call i8* @strcpy
-  %call = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 60) ; <i8*> [#uses=1]
-  ret i8* %call
-}
-
-declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
diff --git a/test/Transforms/InstCombine/strcspn-1.ll b/test/Transforms/InstCombine/strcspn-1.ll
new file mode 100644
index 0000000000000..60fad897b2c80
--- /dev/null
+++ b/test/Transforms/InstCombine/strcspn-1.ll
@@ -0,0 +1,57 @@
+; Test that the strcspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@abcba = constant [6 x i8] c"abcba\00"
+@abc = constant [4 x i8] c"abc\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i64 @strcspn(i8*, i8*)
+
+; Check strcspn(s, "") -> strlen(s).
+
+define i64 @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+  %pat = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+  %ret = call i64 @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: [[VAR:%[a-z]+]] = call i64 @strlen(i8* %str)
+  ret i64 %ret
+; CHECK-NEXT: ret i64 [[VAR]]
+}
+
+; Check strcspn("", s) -> 0.
+
+define i64 @test_simplify2(i8* %pat) {
+; CHECK: @test_simplify2
+  %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+  %ret = call i64 @strcspn(i8* %str, i8* %pat)
+  ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strcspn(s1, s2), where s1 and s2 are constants.
+
+define i64 @test_simplify3() {
+; CHECK: @test_simplify3
+  %str = getelementptr [6 x i8]* @abcba, i32 0, i32 0
+  %pat = getelementptr [4 x i8]* @abc, i32 0, i32 0
+
+  %ret = call i64 @strcspn(i8* %str, i8* %pat)
+  ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check cases that shouldn't be simplified.
+
+define i64 @test_no_simplify1(i8* %str, i8* %pat) {
+; CHECK: @test_no_simplify1
+
+  %ret = call i64 @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i64 @strcspn(i8* %str, i8* %pat)
+  ret i64 %ret
+; CHECK-NEXT: ret i64 %ret
+}
diff --git a/test/Transforms/InstCombine/strcspn-2.ll b/test/Transforms/InstCombine/strcspn-2.ll
new file mode 100644
index 0000000000000..4e2393686c7d6
--- /dev/null
+++ b/test/Transforms/InstCombine/strcspn-2.ll
@@ -0,0 +1,21 @@
+; Test that the strcspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@null = constant [1 x i8] zeroinitializer
+
+declare double @strcspn(i8*, i8*)
+
+; Check that strcspn functions with the wrong prototype aren't simplified.
+
+define double @test_no_simplify1(i8* %pat) {
+; CHECK: @test_no_simplify1
+  %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+  %ret = call double @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: call double @strcspn
+  ret double %ret
+; CHECK-NEXT: ret double %ret
+}
diff --git a/test/Transforms/InstCombine/strlen-1.ll b/test/Transforms/InstCombine/strlen-1.ll
new file mode 100644
index 0000000000000..6d7464a4cc802
--- /dev/null
+++ b/test/Transforms/InstCombine/strlen-1.ll
@@ -0,0 +1,97 @@
+; Test that the strlen library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@null = constant [1 x i8] zeroinitializer
+@null_hello = constant [7 x i8] c"\00hello\00"
+@nullstring = constant i8 0
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i32 @strlen(i8*)
+
+; Check strlen(string constant) -> integer constant.
+
+define i32 @test_simplify1() {
+; CHECK: @test_simplify1
+  %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  %hello_l = call i32 @strlen(i8* %hello_p)
+  ret i32 %hello_l
+; CHECK-NEXT: ret i32 5
+}
+
+define i32 @test_simplify2() {
+; CHECK: @test_simplify2
+  %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
+  %null_l = call i32 @strlen(i8* %null_p)
+  ret i32 %null_l
+; CHECK-NEXT: ret i32 0
+}
+
+define i32 @test_simplify3() {
+; CHECK: @test_simplify3
+  %null_hello_p = getelementptr [7 x i8]* @null_hello, i32 0, i32 0
+  %null_hello_l = call i32 @strlen(i8* %null_hello_p)
+  ret i32 %null_hello_l
+; CHECK-NEXT: ret i32 0
+}
+
+define i32 @test_simplify4() {
+; CHECK: @test_simplify4
+  %len = tail call i32 @strlen(i8* @nullstring) nounwind
+  ret i32 %len
+; CHECK-NEXT: ret i32 0
+}
+
+; Check strlen(x) == 0 --> *x == 0.
+
+define i1 @test_simplify5() {
+; CHECK: @test_simplify5
+  %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  %hello_l = call i32 @strlen(i8* %hello_p)
+  %eq_hello = icmp eq i32 %hello_l, 0
+  ret i1 %eq_hello
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @test_simplify6() {
+; CHECK: @test_simplify6
+  %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
+  %null_l = call i32 @strlen(i8* %null_p)
+  %eq_null = icmp eq i32 %null_l, 0
+  ret i1 %eq_null
+; CHECK-NEXT: ret i1 true
+}
+
+; Check strlen(x) != 0 --> *x != 0.
+
+define i1 @test_simplify7() {
+; CHECK: @test_simplify7
+  %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  %hello_l = call i32 @strlen(i8* %hello_p)
+  %ne_hello = icmp ne i32 %hello_l, 0
+  ret i1 %ne_hello
+; CHECK-NEXT: ret i1 true
+}
+
+define i1 @test_simplify8() {
+; CHECK: @test_simplify8
+  %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
+  %null_l = call i32 @strlen(i8* %null_p)
+  %ne_null = icmp ne i32 %null_l, 0
+  ret i1 %ne_null
+; CHECK-NEXT: ret i1 false
+}
+
+; Check cases that shouldn't be simplified.
+
+define i32 @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %a_p = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %a_l = call i32 @strlen(i8* %a_p)
+; CHECK-NEXT: %a_l = call i32 @strlen
+  ret i32 %a_l
+; CHECK-NEXT: ret i32 %a_l
+}
diff --git a/test/Transforms/InstCombine/strlen-2.ll b/test/Transforms/InstCombine/strlen-2.ll
new file mode 100644
index 0000000000000..c4fd54c06db95
--- /dev/null
+++ b/test/Transforms/InstCombine/strlen-2.ll
@@ -0,0 +1,18 @@
+; Test that the strlen library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+
+declare i32 @strlen(i8*, i32)
+
+define i32 @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  %hello_l = call i32 @strlen(i8* %hello_p, i32 187)
+; CHECK-NEXT: %hello_l = call i32 @strlen
+  ret i32 %hello_l
+; CHECK-NEXT: ret i32 %hello_l
+}
diff --git a/test/Transforms/InstCombine/strncat-1.ll b/test/Transforms/InstCombine/strncat-1.ll
new file mode 100644
index 0000000000000..ad2a18b1465d7
--- /dev/null
+++ b/test/Transforms/InstCombine/strncat-1.ll
@@ -0,0 +1,37 @@
+; Test that the strncat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@null = constant [1 x i8] zeroinitializer
+@null_hello = constant [7 x i8] c"\00hello\00"
+
+declare i8* @strncat(i8*, i8*, i32)
+declare i32 @puts(i8*)
+
+define i32 @main() {
+; CHECK: @main
+; CHECK-NOT: call i8* @strncat
+; CHECK: call i32 @puts
+
+  %target = alloca [1024 x i8]
+  %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
+  store i8 0, i8* %arg1
+
+  ; rslt1 = strncat(target, "hello\00", 6)
+  %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  %rslt1 = call i8* @strncat(i8* %arg1, i8* %arg2, i32 6)
+
+  ; rslt2 = strncat(rslt1, "\00", 42)
+  %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0
+  %rslt2 = call i8* @strncat(i8* %rslt1, i8* %arg3, i32 42)
+
+  ; rslt3 = strncat(rslt2, "\00hello\00", 42)
+  %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0
+  %rslt3 = call i8* @strncat(i8* %rslt2, i8* %arg4, i32 42)
+
+  call i32 @puts(i8* %rslt3)
+  ret i32 0
+}
diff --git a/test/Transforms/InstCombine/strncat-2.ll b/test/Transforms/InstCombine/strncat-2.ll
new file mode 100644
index 0000000000000..c56deacd39bb5
--- /dev/null
+++ b/test/Transforms/InstCombine/strncat-2.ll
@@ -0,0 +1,53 @@
+; Test that the strncat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@empty = constant [1 x i8] c"\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strncat(i8*, i8*, i32)
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+; CHECK-NOT: call i8* @strncat
+; CHECK: ret void
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  call i8* @strncat(i8* %dst, i8* %src, i32 13)
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+; CHECK-NEXT: ret void
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [1 x i8]* @empty, i32 0, i32 0
+  call i8* @strncat(i8* %dst, i8* %src, i32 13)
+  ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+; CHECK-NEXT: ret void
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  call i8* @strncat(i8* %dst, i8* %src, i32 0)
+  ret void
+}
+
+define void @test_nosimplify1() {
+; CHECK: @test_nosimplify1
+; CHECK: call i8* @strncat
+; CHECK: ret void
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  call i8* @strncat(i8* %dst, i8* %src, i32 1)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strncat-3.ll b/test/Transforms/InstCombine/strncat-3.ll
new file mode 100644
index 0000000000000..3cd797168705f
--- /dev/null
+++ b/test/Transforms/InstCombine/strncat-3.ll
@@ -0,0 +1,22 @@
+; Test that the strncat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@empty = constant [1 x i8] c"\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @strncat(i8*, i8*, i32)
+
+define void @test_nosimplify1() {
+; CHECK: @test_nosimplify1
+; CHECK: call i16* @strncat
+; CHECK: ret void
+
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  call i16* @strncat(i8* %dst, i8* %src, i32 13)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strncmp-1.ll b/test/Transforms/InstCombine/strncmp-1.ll
new file mode 100644
index 0000000000000..187c2fa50e82a
--- /dev/null
+++ b/test/Transforms/InstCombine/strncmp-1.ll
@@ -0,0 +1,99 @@
+; Test that the strncmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@hell = constant [5 x i8] c"hell\00"
+@bell = constant [5 x i8] c"bell\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i32 @strncmp(i8*, i8*, i32)
+
+; strncmp("", x, n) -> -*x
+define i32 @test1(i8* %str2) {
+; CHECK: @test1
+; CHECK: %strcmpload = load i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: %2 = sub i32 0, %1
+; CHECK: ret i32 %2
+
+  %str1 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10)
+  ret i32 %temp1
+}
+
+; strncmp(x, "", n) -> *x
+define i32 @test2(i8* %str1) {
+; CHECK: @test2
+; CHECK: %strcmpload = load i8* %str1
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: ret i32 %1
+
+  %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10)
+  ret i32 %temp1
+}
+
+; strncmp(x, y, n) -> constant, when x and y are constant strings
+define i32 @test3() {
+; CHECK: @test3
+; CHECK: ret i32 -1
+
+  %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10)
+  ret i32 %temp1
+}
+
+define i32 @test4() {
+; CHECK: @test4
+; CHECK: ret i32 1
+
+  %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10)
+  ret i32 %temp1
+}
+
+define i32 @test5() {
+; CHECK: @test5
+; CHECK: ret i32 0
+
+  %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 4)
+  ret i32 %temp1
+}
+
+; strncmp(x,y,1) -> memcmp(x,y,1)
+define i32 @test6(i8* %str1, i8* %str2) {
+; CHECK: @test6
+; CHECK: [[LOAD1:%[a-z]+]] = load i8* %str1, align 1
+; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
+; CHECK: [[LOAD2:%[a-z]+]] = load i8* %str2, align 1
+; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
+; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+; CHECK: ret i32 [[RET]]
+
+  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 1)
+  ret i32 %temp1
+}
+
+; strncmp(x,y,0)   -> 0
+define i32 @test7(i8* %str1, i8* %str2) {
+; CHECK: @test7
+; CHECK: ret i32 0
+
+  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 0)
+  ret i32 %temp1
+}
+
+; strncmp(x,x,n)  -> 0
+define i32 @test8(i8* %str, i32 %n) {
+; CHECK: @test8
+; CHECK: ret i32 0
+
+  %temp1 = call i32 @strncmp(i8* %str, i8* %str, i32 %n)
+  ret i32 %temp1
+}
diff --git a/test/Transforms/InstCombine/strncmp-2.ll b/test/Transforms/InstCombine/strncmp-2.ll
new file mode 100644
index 0000000000000..3fc43a6fd4f5c
--- /dev/null
+++ b/test/Transforms/InstCombine/strncmp-2.ll
@@ -0,0 +1,20 @@
+; Test that the strncmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@hell = constant [5 x i8] c"hell\00"
+
+declare i16 @strncmp(i8*, i8*, i32)
+
+define i16 @test_nosimplify() {
+; CHECK: @test_nosimplify
+; CHECK: call i16 @strncmp
+; CHECK: ret i16 %temp1
+
+  %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = call i16 @strncmp(i8* %str1, i8* %str2, i32 10)
+  ret i16 %temp1
+}
diff --git a/test/Transforms/InstCombine/strncpy-1.ll b/test/Transforms/InstCombine/strncpy-1.ll
new file mode 100644
index 0000000000000..3ce2b9b5eecc3
--- /dev/null
+++ b/test/Transforms/InstCombine/strncpy-1.ll
@@ -0,0 +1,95 @@
+; Test that the strncpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@null = constant [1 x i8] zeroinitializer
+@null_hello = constant [7 x i8] c"\00hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+@b = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strncpy(i8*, i8*, i32)
+declare i32 @puts(i8*)
+
+; Check a bunch of strncpy invocations together.
+
+define i32 @test_simplify1() {
+; CHECK: @test_simplify1
+; CHECK-NOT: call i8* @strncpy
+; CHECK: call i32 @puts
+  %target = alloca [1024 x i8]
+  %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
+  store i8 0, i8* %arg1
+
+  %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
+  %rslt1 = call i8* @strncpy(i8* %arg1, i8* %arg2, i32 6)
+
+  %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0
+  %rslt2 = call i8* @strncpy(i8* %rslt1, i8* %arg3, i32 42)
+
+  %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0
+  %rslt3 = call i8* @strncpy(i8* %rslt2, i8* %arg4, i32 42)
+
+  call i32 @puts( i8* %rslt3 )
+  ret i32 0
+}
+
+; Check strncpy(x, "", y) -> memset(x, '\0', y, 1).
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+  call i8* @strncpy(i8* %dst, i8* %src, i32 32)
+; CHECK: call void @llvm.memset.p0i8.i32
+  ret void
+}
+
+; Check strncpy(x, y, 0) -> x.
+
+define i8* @test_simplify3() {
+; CHECK: @test_simplify3
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+  %ret = call i8* @strncpy(i8* %dst, i8* %src, i32 0)
+  ret i8* %ret
+; CHECK: ret i8* getelementptr inbounds ([32 x i8]* @a, i32 0, i32 0)
+}
+
+; Check strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant].
+
+define void @test_simplify4() {
+; CHECK: @test_simplify4
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+  call i8* @strncpy(i8* %dst, i8* %src, i32 6)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32
+  ret void
+}
+
+; Check cases that shouldn't be simplified.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [32 x i8]* @b, i32 0, i32 0
+
+  call i8* @strncpy(i8* %dst, i8* %src, i32 32)
+; CHECK: call i8* @strncpy
+  ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+  call i8* @strncpy(i8* %dst, i8* %src, i32 8)
+; CHECK: call i8* @strncpy
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strncpy-2.ll b/test/Transforms/InstCombine/strncpy-2.ll
new file mode 100644
index 0000000000000..ac28ea6550097
--- /dev/null
+++ b/test/Transforms/InstCombine/strncpy-2.ll
@@ -0,0 +1,22 @@
+; Test that the strncpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @strncpy(i8*, i8*, i32)
+
+; Check that 'strncpy' functions with the wrong prototype aren't simplified.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+  call i16* @strncpy(i8* %dst, i8* %src, i32 6)
+; CHECK: call i16* @strncpy
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strncpy_chk-1.ll b/test/Transforms/InstCombine/strncpy_chk-1.ll
new file mode 100644
index 0000000000000..aadff4268ec2b
--- /dev/null
+++ b/test/Transforms/InstCombine/strncpy_chk-1.ll
@@ -0,0 +1,66 @@
+; Test lib call simplification of __strncpy_chk calls with various values
+; for len and dstlen.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i8] zeroinitializer, align 1
+@b = common global [60 x i8] zeroinitializer, align 1
+@.str = private constant [12 x i8] c"abcdefghijk\00"
+
+; Check cases where dstlen >= len
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 12)
+  ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @strncpy
+  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
+  ret void
+}
+
+; Check cases where dstlen < len
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @__strncpy_chk
+  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 4)
+  ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+  %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @__strncpy_chk
+  call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 0)
+  ret void
+}
+
+declare i8* @__strncpy_chk(i8*, i8*, i32, i32)
diff --git a/test/Transforms/InstCombine/strncpy_chk-2.ll b/test/Transforms/InstCombine/strncpy_chk-2.ll
new file mode 100644
index 0000000000000..a0f132ebf63bc
--- /dev/null
+++ b/test/Transforms/InstCombine/strncpy_chk-2.ll
@@ -0,0 +1,21 @@
+; Test that lib call simplification doesn't simplify __strncpy_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i16] zeroinitializer, align 1
+@b = common global [60 x i16] zeroinitializer, align 1
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+  %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0
+  %src = getelementptr inbounds [60 x i16]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i16* @__strncpy_chk
+  call i16* @__strncpy_chk(i16* %dst, i16* %src, i32 60, i32 60)
+  ret void
+}
+
+declare i16* @__strncpy_chk(i16*, i16*, i32, i32)
diff --git a/test/Transforms/InstCombine/strpbrk-1.ll b/test/Transforms/InstCombine/strpbrk-1.ll
new file mode 100644
index 0000000000000..a5d0d86501b14
--- /dev/null
+++ b/test/Transforms/InstCombine/strpbrk-1.ll
@@ -0,0 +1,68 @@
+; Test that the strpbrk library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [12 x i8] c"hello world\00"
+@w = constant [2 x i8] c"w\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i8* @strpbrk(i8*, i8*)
+
+; Check strpbrk(s, "") -> NULL.
+
+define i8* @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+  %pat = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+  %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+  ret i8* %ret
+; CHECK-NEXT: ret i8* null
+}
+
+; Check strpbrk("", s) -> NULL.
+
+define i8* @test_simplify2(i8* %pat) {
+; CHECK: @test_simplify2
+  %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+  %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+  ret i8* %ret
+; CHECK-NEXT: ret i8* null
+}
+
+; Check strpbrk(s1, s2), where s1 and s2 are constants.
+
+define i8* @test_simplify3() {
+; CHECK: @test_simplify3
+  %str = getelementptr [12 x i8]* @hello, i32 0, i32 0
+  %pat = getelementptr [2 x i8]* @w, i32 0, i32 0
+
+  %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+  ret i8* %ret
+; CHECK-NEXT: ret i8* getelementptr inbounds ([12 x i8]* @hello, i32 0, i32 6)
+}
+
+; Check strpbrk(s, "a") -> strchr(s, 'a').
+
+define i8* @test_simplify4(i8* %str) {
+; CHECK: @test_simplify4
+  %pat = getelementptr [2 x i8]* @w, i32 0, i32 0
+
+  %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+; CHECK-NEXT: [[VAR:%[a-z]+]] = call i8* @strchr(i8* %str, i32 119)
+  ret i8* %ret
+; CHECK-NEXT: ret i8* [[VAR]]
+}
+
+; Check cases that shouldn't be simplified.
+
+define i8* @test_no_simplify1(i8* %str, i8* %pat) {
+; CHECK: @test_no_simplify1
+
+  %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+  ret i8* %ret
+; CHECK-NEXT: ret i8* %ret
+}
diff --git a/test/Transforms/InstCombine/strpbrk-2.ll b/test/Transforms/InstCombine/strpbrk-2.ll
new file mode 100644
index 0000000000000..31ac2905df2c8
--- /dev/null
+++ b/test/Transforms/InstCombine/strpbrk-2.ll
@@ -0,0 +1,23 @@
+; Test that the strpbrk library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [12 x i8] c"hello world\00"
+@w = constant [2 x i8] c"w\00"
+
+declare i16* @strpbrk(i8*, i8*)
+
+; Check that 'strpbrk' functions with the wrong prototype aren't simplified.
+
+define i16* @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+  %str = getelementptr [12 x i8]* @hello, i32 0, i32 0
+  %pat = getelementptr [2 x i8]* @w, i32 0, i32 0
+
+  %ret = call i16* @strpbrk(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i16* @strpbrk
+  ret i16* %ret
+; CHECK-NEXT: ret i16* %ret
+}
diff --git a/test/Transforms/InstCombine/strrchr-1.ll b/test/Transforms/InstCombine/strrchr-1.ll
new file mode 100644
index 0000000000000..854ce45bffb24
--- /dev/null
+++ b/test/Transforms/InstCombine/strrchr-1.ll
@@ -0,0 +1,54 @@
+; Test that the strrchr library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [14 x i8] c"hello world\5Cn\00"
+@null = constant [1 x i8] zeroinitializer
+@chp = global i8* zeroinitializer
+
+declare i8* @strrchr(i8*, i32)
+
+define void @test_simplify1() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 6)
+; CHECK-NOT: call i8* @strrchr
+; CHECK: ret void
+
+  %str = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8* @strrchr(i8* %str, i32 119)
+  store i8* %dst, i8** @chp
+  ret void
+}
+
+define void @test_simplify2() {
+; CHECK: store i8* null, i8** @chp, align 4
+; CHECK-NOT: call i8* @strrchr
+; CHECK: ret void
+
+  %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+  %dst = call i8* @strrchr(i8* %str, i32 119)
+  store i8* %dst, i8** @chp
+  ret void
+}
+
+define void @test_simplify3() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 13)
+; CHECK-NOT: call i8* @strrchr
+; CHECK: ret void
+
+  %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8* @strrchr(i8* %src, i32 0)
+  store i8* %dst, i8** @chp
+  ret void
+}
+
+define void @test_nosimplify1(i32 %chr) {
+; CHECK: @test_nosimplify1
+; CHECK: call i8* @strrchr
+; CHECK: ret void
+
+  %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8* @strrchr(i8* %src, i32 %chr)
+  store i8* %dst, i8** @chp
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strrchr-2.ll b/test/Transforms/InstCombine/strrchr-2.ll
new file mode 100644
index 0000000000000..1974f6ca6033d
--- /dev/null
+++ b/test/Transforms/InstCombine/strrchr-2.ll
@@ -0,0 +1,21 @@
+; Test that the strrchr libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [14 x i8] c"hello world\5Cn\00"
+@chr = global i8 zeroinitializer
+
+declare i8 @strrchr(i8*, i32)
+
+define void @test_nosimplify1() {
+; CHECK: test_nosimplify1
+; CHECK: call i8 @strrchr
+; CHECK: ret void
+
+  %str = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8 @strrchr(i8* %str, i32 119)
+  store i8 %dst, i8* @chr
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strspn-1.ll b/test/Transforms/InstCombine/strspn-1.ll
new file mode 100644
index 0000000000000..393f88735bd42
--- /dev/null
+++ b/test/Transforms/InstCombine/strspn-1.ll
@@ -0,0 +1,56 @@
+; Test that the strspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@abcba = constant [6 x i8] c"abcba\00"
+@abc = constant [4 x i8] c"abc\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i64 @strspn(i8*, i8*)
+
+; Check strspn(s, "") -> 0.
+
+define i64 @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+  %pat = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+  %ret = call i64 @strspn(i8* %str, i8* %pat)
+  ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strspn("", s) -> 0.
+
+define i64 @test_simplify2(i8* %pat) {
+; CHECK: @test_simplify2
+  %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+  %ret = call i64 @strspn(i8* %str, i8* %pat)
+  ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strspn(s1, s2), where s1 and s2 are constants.
+
+define i64 @test_simplify3() {
+; CHECK: @test_simplify3
+  %str = getelementptr [6 x i8]* @abcba, i32 0, i32 0
+  %pat = getelementptr [4 x i8]* @abc, i32 0, i32 0
+
+  %ret = call i64 @strspn(i8* %str, i8* %pat)
+  ret i64 %ret
+; CHECK-NEXT: ret i64 5
+}
+
+; Check cases that shouldn't be simplified.
+
+define i64 @test_no_simplify1(i8* %str, i8* %pat) {
+; CHECK: @test_no_simplify1
+
+  %ret = call i64 @strspn(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i64 @strspn(i8* %str, i8* %pat)
+  ret i64 %ret
+; CHECK-NEXT: ret i64 %ret
+}
diff --git a/test/Transforms/InstCombine/strstr-1.ll b/test/Transforms/InstCombine/strstr-1.ll
new file mode 100644
index 0000000000000..81f52718747dd
--- /dev/null
+++ b/test/Transforms/InstCombine/strstr-1.ll
@@ -0,0 +1,65 @@
+; Test that the strstr library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@.str = private constant [1 x i8] zeroinitializer
+@.str1 = private constant [2 x i8] c"a\00"
+@.str2 = private constant [6 x i8] c"abcde\00"
+@.str3 = private constant [4 x i8] c"bcd\00"
+
+declare i8* @strstr(i8*, i8*)
+
+; Check strstr(str, "") -> str.
+
+define i8* @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+  %pat = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+  %ret = call i8* @strstr(i8* %str, i8* %pat)
+  ret i8* %ret
+; CHECK-NEXT: ret i8* %str
+}
+
+; Check strstr(str, "a") -> strchr(str, 'a').
+
+define i8* @test_simplify2(i8* %str) {
+; CHECK: @test_simplify2
+  %pat = getelementptr inbounds [2 x i8]* @.str1, i32 0, i32 0
+  %ret = call i8* @strstr(i8* %str, i8* %pat)
+  ret i8* %ret
+; CHECK-NEXT: @strchr(i8* %str, i32 97)
+}
+
+; Check strstr("abcde", "bcd") -> "abcde" + 1.
+
+define i8* @test_simplify3() {
+; CHECK: @test_simplify3
+  %str = getelementptr inbounds [6 x i8]* @.str2, i32 0, i32 0
+  %pat = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+  %ret = call i8* @strstr(i8* %str, i8* %pat)
+  ret i8* %ret
+; CHECK-NEXT: getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 1)
+}
+
+; Check strstr(str, str) -> str.
+
+define i8* @test_simplify4(i8* %str) {
+; CHECK: @test_simplify4
+  %ret = call i8* @strstr(i8* %str, i8* %str)
+  ret i8* %ret
+; CHECK-NEXT: ret i8* %str
+}
+
+; Check strstr(str, pat) == str -> strncmp(str, pat, strlen(str)) == 0.
+
+define i1 @test_simplify5(i8* %str, i8* %pat) {
+; CHECK: @test_simplify5
+  %ret = call i8* @strstr(i8* %str, i8* %pat)
+  %cmp = icmp eq i8* %ret, %str
+  ret i1 %cmp
+; CHECK: [[LEN:%[a-z]+]] = call {{i[0-9]+}} @strlen(i8* %pat)
+; CHECK: [[NCMP:%[a-z]+]] = call {{i[0-9]+}} @strncmp(i8* %str, i8* %pat, {{i[0-9]+}} [[LEN]])
+; CHECK: icmp eq {{i[0-9]+}} [[NCMP]], 0
+; CHECK: ret i1
+}
diff --git a/test/Transforms/InstCombine/strstr-2.ll b/test/Transforms/InstCombine/strstr-2.ll
new file mode 100644
index 0000000000000..5092f9b4f8031
--- /dev/null
+++ b/test/Transforms/InstCombine/strstr-2.ll
@@ -0,0 +1,18 @@
+; Test that the strstr library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@null = private constant [1 x i8] zeroinitializer
+
+declare i8 @strstr(i8*, i8*)
+
+define i8 @test_no_simplify1(i8* %str) {
+; CHECK: @test_no_simplify1
+  %pat = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+  %ret = call i8 @strstr(i8* %str, i8* %pat)
+; CHECK-NEXT: call i8 @strstr
+  ret i8 %ret
+; CHECK-NEXT: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/strto-1.ll b/test/Transforms/InstCombine/strto-1.ll
new file mode 100644
index 0000000000000..16c0c67970db8
--- /dev/null
+++ b/test/Transforms/InstCombine/strto-1.ll
@@ -0,0 +1,82 @@
+; Test that the strto* library call simplifiers work correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i64 @strtol(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare i64 @strtol(i8*, i8**, i32)
+
+declare double @strtod(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare double @strtod(i8*, i8**, i32)
+
+declare float @strtof(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare float @strtof(i8*, i8**, i32)
+
+declare i64 @strtoul(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare i64 @strtoul(i8*, i8**, i32)
+
+declare i64 @strtoll(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare i64 @strtoll(i8*, i8**, i32)
+
+declare double @strtold(i8* %s, i8** %endptr)
+; CHECK: declare double @strtold(i8*, i8**)
+
+declare i64 @strtoull(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare i64 @strtoull(i8*, i8**, i32)
+
+define void @test_simplify1(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify1
+  call i64 @strtol(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call i64 @strtol(i8* nocapture %x, i8** null, i32 10)
+  ret void
+}
+
+define void @test_simplify2(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify2
+  call double @strtod(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call double @strtod(i8* nocapture %x, i8** null, i32 10)
+  ret void
+}
+
+define void @test_simplify3(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify3
+  call float @strtof(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call float @strtof(i8* nocapture %x, i8** null, i32 10)
+  ret void
+}
+
+define void @test_simplify4(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify4
+  call i64 @strtoul(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call i64 @strtoul(i8* nocapture %x, i8** null, i32 10)
+  ret void
+}
+
+define void @test_simplify5(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify5
+  call i64 @strtoll(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call i64 @strtoll(i8* nocapture %x, i8** null, i32 10)
+  ret void
+}
+
+define void @test_simplify6(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify6
+  call double @strtold(i8* %x, i8** null)
+; CHECK-NEXT: call double @strtold(i8* nocapture %x, i8** null)
+  ret void
+}
+
+define void @test_simplify7(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify7
+  call i64 @strtoull(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call i64 @strtoull(i8* nocapture %x, i8** null, i32 10)
+  ret void
+}
+
+define void @test_no_simplify1(i8* %x, i8** %endptr) {
+; CHECK: @test_no_simplify1
+  call i64 @strtol(i8* %x, i8** %endptr, i32 10)
+; CHECK-NEXT: call i64 @strtol(i8* %x, i8** %endptr, i32 10)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/struct-assign-tbaa.ll b/test/Transforms/InstCombine/struct-assign-tbaa.ll
new file mode 100644
index 0000000000000..33a771e6d8b6f
--- /dev/null
+++ b/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -0,0 +1,44 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+; Verify that instcombine preserves TBAA tags when converting a memcpy into
+; a scalar load and store.
+
+%struct.test1 = type { float }
+
+; CHECK: @test
+; CHECK: %2 = load float* %0, align 4, !tbaa !0
+; CHECK: store float %2, float* %1, align 4, !tbaa !0
+; CHECK: ret
+define void @test1(%struct.test1* nocapture %a, %struct.test1* nocapture %b) {
+entry:
+  %0 = bitcast %struct.test1* %a to i8*
+  %1 = bitcast %struct.test1* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 4, i32 4, i1 false), !tbaa.struct !3
+  ret void
+}
+
+%struct.test2 = type { i32 (i8*, i32*, double*)** }
+
+define i32 (i8*, i32*, double*)*** @test2() {
+; CHECK: @test2
+; CHECK-NOT: memcpy
+; CHECK: ret
+  %tmp = alloca %struct.test2, align 8
+  %tmp1 = bitcast %struct.test2* %tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* undef, i64 8, i32 8, i1 false), !tbaa.struct !4
+  %tmp2 = getelementptr %struct.test2* %tmp, i32 0, i32 0
+  %tmp3 = load i32 (i8*, i32*, double*)*** %tmp2
+  ret i32 (i8*, i32*, double*)*** %tmp2
+}
+
+; CHECK: !0 = metadata !{metadata !"float", metadata !1}
+
+!0 = metadata !{metadata !"Simple C/C++ TBAA"}
+!1 = metadata !{metadata !"omnipotent char", metadata !0}
+!2 = metadata !{metadata !"float", metadata !0}
+!3 = metadata !{i64 0, i64 4, metadata !2}
+!4 = metadata !{i64 0, i64 8, null}
diff --git a/test/Transforms/InstCombine/udiv-simplify-bug-1.ll b/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
index d95e8f8359085..74f2fdd7cc63e 100644
--- a/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
+++ b/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
@@ -6,9 +6,9 @@
 ; The udiv instructions shouldn't be optimized away, and the
 ; sext instructions should be optimized to zext.
 
-define i64 @bar(i32 %x) nounwind {
+define i64 @bar(i32 %x, i32 %g) nounwind {
   %y = lshr i32 %x, 30
-  %r = udiv i32 %y, 3
+  %r = udiv i32 %y, %g
   %z = sext i32 %r to i64
   ret i64 %z
 }
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 0019a57627cb2..2d90750a2f1e6 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -196,7 +196,7 @@ define <4 x float> @test_select(float %f, float %g) {
 ; CHECK-NOT: insertelement
 ; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
 ; CHECK-NOT: insertelement
-; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
+; CHECK: shufflevector <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
   %a0 = insertelement <4 x float> undef, float %f, i32 0
   %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
   %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index 8f78c2e6bd505..14f532195d7c3 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -153,3 +153,46 @@ define <8 x i8> @test12a(<8 x i8> %tmp6, <8 x i8> %tmp2) nounwind {
   ret <8 x i8> %tmp3
 }
 
+; We should form a shuffle out of a select with constant condition.
+define <4 x i16> @test13a(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13a
+; CHECK-NEXT: shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret
+  %A = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>,
+           <4 x i16> %lhs, <4 x i16> %rhs
+  ret <4 x i16> %A
+}
+
+define <4 x i16> @test13b(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13b
+; CHECK-NEXT: ret <4 x i16> %lhs
+  %A = select <4 x i1> <i1 true, i1 undef, i1 true, i1 true>,
+           <4 x i16> %lhs, <4 x i16> %rhs
+  ret <4 x i16> %A
+}
+
+define <4 x i16> @test13c(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13c
+; CHECK-NEXT: shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+; CHECK-NEXT: ret
+  %A = select <4 x i1> <i1 true, i1 undef, i1 true, i1 false>,
+           <4 x i16> %lhs, <4 x i16> %rhs
+  ret <4 x i16> %A
+}
+
+define <4 x i16> @test13d(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13d
+; CHECK: select
+; CHECK-NEXT: ret
+  %A = select <4 x i1> <i1 true, i1 icmp ugt (<4 x i16>(<4 x i16>, <4 x i16>)* @test13a, <4 x i16>(<4 x i16>, <4 x i16>)* @test13b), i1 true, i1 false>,
+           <4 x i16> %lhs, <4 x i16> %rhs
+  ret <4 x i16> %A
+}
+
+define <4 x i16> @test13e(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13e
+; CHECK-NEXT: ret <4 x i16> %rhs
+  %A = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>,
+           <4 x i16> %lhs, <4 x i16> %rhs
+  ret <4 x i16> %A
+}
diff --git a/test/Transforms/InstCombine/vector_gep2.ll b/test/Transforms/InstCombine/vector_gep2.ll
new file mode 100644
index 0000000000000..20165b1100160
--- /dev/null
+++ b/test/Transforms/InstCombine/vector_gep2.ll
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <2 x i8*> @testa(<2 x i8*> %a) {
+; CHECK: @testa
+  %g = getelementptr <2 x i8*> %a, <2 x i32> <i32 0, i32 1>
+; CHECK: getelementptr <2 x i8*> %a, <2 x i64> <i64 0, i64 1>
+  ret <2 x i8*> %g
+}
diff --git a/test/Transforms/InstCombine/weak-symbols.ll b/test/Transforms/InstCombine/weak-symbols.ll
new file mode 100644
index 0000000000000..0039b5962f748
--- /dev/null
+++ b/test/Transforms/InstCombine/weak-symbols.ll
@@ -0,0 +1,33 @@
+; PR4738 - Test that the library call simplifier doesn't assume anything about
+; weak symbols.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+@real_init = weak_odr constant [2 x i8] c"y\00"
+@fake_init = weak constant [2 x i8] c"y\00"
+@.str = private constant [2 x i8] c"y\00"
+
+define i32 @foo() nounwind {
+; CHECK: define i32 @foo
+; CHECK: call i32 @strcmp
+; CHECK: ret i32 %temp1
+
+entry:
+  %str1 = getelementptr inbounds [2 x i8]* @fake_init, i64 0, i64 0
+  %str2 = getelementptr inbounds [2 x i8]* @.str, i64 0, i64 0
+  %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) nounwind readonly
+  ret i32 %temp1
+}
+
+define i32 @bar() nounwind {
+; CHECK: define i32 @bar
+; CHECK: ret i32 0
+
+entry:
+  %str1 = getelementptr inbounds [2 x i8]* @real_init, i64 0, i64 0
+  %str2 = getelementptr inbounds [2 x i8]* @.str, i64 0, i64 0
+  %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) nounwind readonly
+  ret i32 %temp1
+}
+
+declare i32 @strcmp(i8*, i8*) nounwind readonly
diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index ced74bd4be9ba..ce2bb799c813a 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll
@@ -266,6 +266,15 @@ define i1 @add5(i32 %x, i32 %y) {
 ; CHECK: ret i1 true
 }
 
+define i1 @add6(i64 %A, i64 %B) {
+; CHECK: @add6
+  %s1 = add i64 %A, %B
+  %s2 = add i64 %B, %A
+  %cmp = icmp eq i64 %s1, %s2
+  ret i1 %cmp
+; CHECK: ret i1 true
+}
+
 define i1 @addpowtwo(i32 %x, i32 %y) {
 ; CHECK: @addpowtwo
   %l = lshr i32 %x, 1
diff --git a/test/Transforms/Internalize/2008-05-09-AllButMain.ll b/test/Transforms/Internalize/2008-05-09-AllButMain.ll
index a85e834582d7e..c07abb0c63652 100644
--- a/test/Transforms/Internalize/2008-05-09-AllButMain.ll
+++ b/test/Transforms/Internalize/2008-05-09-AllButMain.ll
@@ -1,27 +1,55 @@
-; No arguments means internalize all but main
-; RUN: opt < %s -internalize -S | grep internal | count 4
+; No arguments means internalize everything
+; RUN: opt < %s -internalize -S | FileCheck --check-prefix=NOARGS %s
+
 ; Internalize all but foo and j
-; RUN: opt < %s -internalize -internalize-public-api-list foo -internalize-public-api-list j -S | grep internal | count 3
-; Non existent files should be treated as if they were empty (so internalize all but main)
-; RUN: opt < %s -internalize -internalize-public-api-file /nonexistent/file 2> /dev/null -S | grep internal | count 4
-; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-list foo -internalize-public-api-file /nonexistent/file 2> /dev/null -S | grep internal | count 3
+; RUN: opt < %s -internalize -internalize-public-api-list foo -internalize-public-api-list j -S | FileCheck --check-prefix=LIST %s
+
+; Nonexistent files should be treated as if they were empty (so internalize
+; everything)
+; RUN: opt < %s -internalize -internalize-public-api-file /nonexistent/file 2> /dev/null -S | FileCheck --check-prefix=EMPTYFILE %s
+
+; RUN: opt < %s -S -internalize -internalize-public-api-list bar -internalize-public-api-list foo -internalize-public-api-file /nonexistent/file  2> /dev/null | FileCheck --check-prefix=LIST2 %s
+
 ; -file and -list options should be merged, the .apifile contains foo and j
-; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-file %s.apifile -S | grep internal | count 2
+; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-file %s.apifile -S | FileCheck --check-prefix=MERGE %s
+
+; NOARGS: @i = internal global
+; LIST: @i = internal global
+; EMPTYFILE: @i = internal global
+; LIST2: @i = internal global
+; MERGE: @i = internal global
+@i = global i32 0
 
-@i = weak global i32 0          ; <i32*> [#uses=0]
-@j = weak global i32 0          ; <i32*> [#uses=0]
+; NOARGS: @j = internal global
+; LIST: @j = global
+; EMPTYFILE: @j = internal global
+; LIST2: @j = internal global
+; MERGE: @j = global
+@j = global i32 0
 
-define void @main(...) {
-entry:  
+; NOARGS: define internal void @main
+; LIST: define internal void @main
+; EMPTYFILE: define internal void @main
+; LIST2: define internal void @main
+; MERGE: define internal void @main
+define void @main() {
         ret void
 }
 
-define void @foo(...) {
-entry:  
+; NOARGS: define internal void @foo
+; LIST: define void @foo
+; EMPTYFILE: define internal void @foo
+; LIST2: define void @foo
+; MERGE: define void @foo
+define void @foo() {
         ret void
 }
 
-define void @bar(...) {
-entry:  
+; NOARGS: define internal void @bar
+; LIST: define internal void @bar
+; EMPTYFILE: define internal void @bar
+; LIST2: define void @bar
+; MERGE: define void @bar
+define void @bar() {
         ret void
 }
diff --git a/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll b/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll
index 7b18a04e1160f..47cf3f0373e42 100644
--- a/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll
+++ b/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -internalize -S | grep internal | count 3
+; RUN: opt < %s -internalize -internalize-public-api-list main -S | grep internal | count 3
 
 @A = global i32 0
 @B = alias i32* @A
diff --git a/test/Transforms/JumpThreading/crash.ll b/test/Transforms/JumpThreading/crash.ll
index b9c03544db81b..2fe87464c117e 100644
--- a/test/Transforms/JumpThreading/crash.ll
+++ b/test/Transforms/JumpThreading/crash.ll
@@ -511,3 +511,56 @@ lbl_260:                                          ; preds = %for.cond, %entry
 if.end:                                           ; preds = %for.cond
   ret void
 }
+
+define void @PR14233(i1 %cmp, i1 %cmp2, i1 %cmp3, i1 %cmp4) {
+entry:
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  br label %if.end
+
+cond.false:
+  br label %if.end
+
+if.end:
+  %A = phi i64 [ 0, %cond.true ], [ 1, %cond.false ]
+  br i1 %cmp2, label %bb, label %if.end2
+
+bb:
+  br label %if.end2
+
+if.end2:
+  %B = phi i64 [ ptrtoint (i8* ()* @PR14233.f1 to i64), %bb ], [ %A, %if.end ]
+  %cmp.ptr = icmp eq i64 %B, ptrtoint (i8* ()* @PR14233.f2 to i64)
+  br i1 %cmp.ptr, label %cond.true2, label %if.end3
+
+cond.true2:
+  br i1 %cmp3, label %bb2, label %ur
+
+bb2:
+  br i1 %cmp4, label %if.end4, label %if.end3
+
+if.end4:
+  unreachable
+
+if.end3:
+  %cmp.ptr2 = icmp eq i64 %B, ptrtoint (i8* ()* @PR14233.f2 to i64)
+  br i1 %cmp.ptr2, label %ur, label %if.then601
+
+if.then601:
+  %C = icmp eq i64 %B, 0
+  br i1 %C, label %bb3, label %bb4
+
+bb3:
+  unreachable
+
+bb4:
+  unreachable
+
+ur:
+  unreachable
+}
+
+declare i8* @PR14233.f1()
+
+declare i8* @PR14233.f2()
diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll
index 8a81857736a7e..9676efec9df27 100644
--- a/test/Transforms/JumpThreading/select.ll
+++ b/test/Transforms/JumpThreading/select.ll
@@ -121,3 +121,39 @@ L4:
   call void @quux()
   br label %L0
 }
+
+; Make sure the edge value of %0 from entry to L2 includes 0 and L3 is
+; reachable.
+; CHECK: test_switch_default
+; CHECK: entry:
+; CHECK: load
+; CHECK: switch
+; CHECK: [[THREADED:[A-Za-z.0-9]+]]:
+; CHECK: store
+; CHECK: br
+; CHECK: L2:
+; CHECK: icmp
+define void @test_switch_default(i32* nocapture %status) nounwind {
+entry:
+  %0 = load i32* %status, align 4
+  switch i32 %0, label %L2 [
+    i32 5061, label %L1
+    i32 0, label %L2
+  ]
+
+L1:
+  store i32 10025, i32* %status, align 4
+  br label %L2
+
+L2:
+  %1 = load i32* %status, align 4
+  %cmp57.i = icmp eq i32 %1, 0
+  br i1 %cmp57.i, label %L3, label %L4
+
+L3:
+  store i32 10000, i32* %status, align 4
+  br label %L4
+
+L4:
+  ret void
+}
diff --git a/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll b/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll
index 67c3951d74e4a..fe8d445313228 100644
--- a/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll
+++ b/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -licm | lli
+; RUN: opt < %s -licm | lli %defaultjit
 
 define i32 @main() {
 entry:
diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll
index 6f28d53af66ea..98f93345e3c3a 100644
--- a/test/Transforms/LICM/hoisting.ll
+++ b/test/Transforms/LICM/hoisting.ll
@@ -29,7 +29,7 @@ Out:		; preds = %LoopTail
 }
 
 
-declare void @foo2(i32)
+declare void @foo2(i32) nounwind
 
 
 ;; It is ok and desirable to hoist this potentially trapping instruction.
@@ -64,3 +64,29 @@ Out:		; preds = %Loop
 	%C = sub i32 %A, %B		; <i32> [#uses=1]
 	ret i32 %C
 }
+
+; CHECK: @test4
+; CHECK: call
+; CHECK: sdiv
+; CHECK: ret
+define i32 @test4(i32 %x, i32 %y) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %n.01 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  call void @foo_may_call_exit(i32 0)
+  %div = sdiv i32 %x, %y
+  %add = add nsw i32 %n.01, %div
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %n.0.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %n.0.lcssa
+}
+
+declare void @foo_may_call_exit(i32)
+
diff --git a/test/Transforms/LoopIdiom/basic.ll b/test/Transforms/LoopIdiom/basic.ll
index 46ab7e5542b6b..06a5bd90864db 100644
--- a/test/Transforms/LoopIdiom/basic.ll
+++ b/test/Transforms/LoopIdiom/basic.ll
@@ -383,4 +383,37 @@ for.end:                                          ; preds = %for.inc
 
 }
 
+define void @PR14241(i32* %s, i64 %size) {
+; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught
+; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy
+; instead of a memmove. If we get the memmove transform back, this will catch
+; regressions.
+;
+; CHECK: @PR14241
 
+entry:
+  %end.idx = add i64 %size, -1
+  %end.ptr = getelementptr inbounds i32* %s, i64 %end.idx
+  br label %while.body
+; CHECK-NOT: memcpy
+;
+; FIXME: When we regain the ability to form a memmove here, this test should be
+; reversed and turned into a positive assertion.
+; CHECK-NOT: memmove
+
+while.body:
+  %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
+  %src.ptr = getelementptr inbounds i32* %phi.ptr, i64 1
+  %val = load i32* %src.ptr, align 4
+; CHECK: load
+  %dst.ptr = getelementptr inbounds i32* %phi.ptr, i64 0
+  store i32 %val, i32* %dst.ptr, align 4
+; CHECK: store
+  %next.ptr = getelementptr inbounds i32* %phi.ptr, i64 1
+  %cmp = icmp eq i32* %next.ptr, %end.ptr
+  br i1 %cmp, label %exit, label %while.body
+
+exit:
+  ret void
+; CHECK: ret void
+}
diff --git a/test/Transforms/LoopIdiom/crash.ll b/test/Transforms/LoopIdiom/crash.ll
new file mode 100644
index 0000000000000..969adbcd7635d
--- /dev/null
+++ b/test/Transforms/LoopIdiom/crash.ll
@@ -0,0 +1,25 @@
+; RUN: opt -basicaa -loop-idiom -S < %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Don't crash inside DependenceAnalysis
+; PR14219
+define void @test1(i64* %iwork, i64 %x)  {
+bb0:
+  %mul116 = mul nsw i64 %x, %x
+  %incdec.ptr6.sum175 = add i64 42, %x
+  %arrayidx135 = getelementptr inbounds i64* %iwork, i64 %incdec.ptr6.sum175
+  br label %bb1
+bb1:
+  %storemerge4226 = phi i64 [ 0, %bb0 ], [ %inc139, %bb1 ]
+  store i64 1, i64* %arrayidx135, align 8
+  %incdec.ptr6.sum176 = add i64 %mul116, %storemerge4226
+  %arrayidx137 = getelementptr inbounds i64* %iwork, i64 %incdec.ptr6.sum176
+  store i64 1, i64* %arrayidx137, align 8
+  %inc139 = add nsw i64 %storemerge4226, 1
+  %cmp131 = icmp sgt i64 %storemerge4226, 42
+  br i1 %cmp131, label %bb2, label %bb1
+bb2:
+  ret void
+}
+
diff --git a/test/Transforms/LoopIdiom/non-canonical-loop.ll b/test/Transforms/LoopIdiom/non-canonical-loop.ll
new file mode 100644
index 0000000000000..a6a4f9227f9a4
--- /dev/null
+++ b/test/Transforms/LoopIdiom/non-canonical-loop.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -loop-idiom < %s
+; Don't crash
+; PR13892
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test(i32* %currMB) nounwind uwtable {
+entry:
+  br i1 undef, label %start.exit, label %if.then.i
+
+if.then.i:                                        ; preds = %entry
+  unreachable
+
+start.exit:                       ; preds = %entry
+  indirectbr i8* undef, [label %0, label %for.bodyprime]
+
+; <label>:0                                       ; preds = %start.exit
+  unreachable
+
+for.bodyprime:                                    ; preds = %for.bodyprime, %start.exit
+  %i.057375 = phi i32 [ 0, %start.exit ], [ %1, %for.bodyprime ]
+  %arrayidx8prime = getelementptr inbounds i32* %currMB, i32 %i.057375
+  store i32 0, i32* %arrayidx8prime, align 4
+  %1 = add i32 %i.057375, 1
+  %cmp5prime = icmp slt i32 %1, 4
+  br i1 %cmp5prime, label %for.bodyprime, label %for.endprime
+
+for.endprime:                                     ; preds = %for.bodyprime
+  br label %for.body23prime
+
+for.body23prime:                                  ; preds = %for.body23prime, %for.endprime
+  br label %for.body23prime
+}
diff --git a/test/Transforms/LoopIdiom/scev-invalidation.ll b/test/Transforms/LoopIdiom/scev-invalidation.ll
new file mode 100644
index 0000000000000..a244d9a280b9b
--- /dev/null
+++ b/test/Transforms/LoopIdiom/scev-invalidation.ll
@@ -0,0 +1,74 @@
+; RUN: opt -S -indvars -loop-idiom < %s
+; PR14214
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @quote_arg() nounwind {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %backslashes.0 = phi i32 [ undef, %entry ], [ %backslashes.2, %for.inc ]
+  %p.0 = phi i8* [ undef, %entry ], [ %incdec.ptr3, %for.inc ]
+  %q.0 = phi i8* [ undef, %entry ], [ %q.2, %for.inc ]
+  %0 = load i8* %p.0, align 1
+  switch i8 %0, label %while.cond.preheader [
+    i8 0, label %for.cond4.preheader
+    i8 92, label %for.inc
+  ]
+
+while.cond.preheader:                             ; preds = %for.cond
+  %tobool210 = icmp eq i32 %backslashes.0, 0
+  br i1 %tobool210, label %for.inc.loopexit, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %while.cond.preheader
+  %1 = add i32 %backslashes.0, -1
+  %2 = zext i32 %1 to i64
+  br label %while.body
+
+for.cond4.preheader:                              ; preds = %for.cond
+  %tobool57 = icmp eq i32 %backslashes.0, 0
+  br i1 %tobool57, label %for.end10, label %for.body6.lr.ph
+
+for.body6.lr.ph:                                  ; preds = %for.cond4.preheader
+  br label %for.body6
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.body
+  %q.112 = phi i8* [ %q.0, %while.body.lr.ph ], [ %incdec.ptr, %while.body ]
+  %backslashes.111 = phi i32 [ %backslashes.0, %while.body.lr.ph ], [ %dec, %while.body ]
+  %incdec.ptr = getelementptr inbounds i8* %q.112, i64 1
+  store i8 92, i8* %incdec.ptr, align 1
+  %dec = add nsw i32 %backslashes.111, -1
+  %tobool2 = icmp eq i32 %dec, 0
+  br i1 %tobool2, label %while.cond.for.inc.loopexit_crit_edge, label %while.body
+
+while.cond.for.inc.loopexit_crit_edge:            ; preds = %while.body
+  %scevgep.sum = add i64 %2, 1
+  %scevgep13 = getelementptr i8* %q.0, i64 %scevgep.sum
+  br label %for.inc.loopexit
+
+for.inc.loopexit:                                 ; preds = %while.cond.for.inc.loopexit_crit_edge, %while.cond.preheader
+  %q.1.lcssa = phi i8* [ %scevgep13, %while.cond.for.inc.loopexit_crit_edge ], [ %q.0, %while.cond.preheader ]
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.loopexit, %for.cond
+  %backslashes.2 = phi i32 [ %backslashes.0, %for.cond ], [ 0, %for.inc.loopexit ]
+  %q.2 = phi i8* [ %q.0, %for.cond ], [ %q.1.lcssa, %for.inc.loopexit ]
+  %incdec.ptr3 = getelementptr inbounds i8* %p.0, i64 1
+  br label %for.cond
+
+for.body6:                                        ; preds = %for.body6.lr.ph, %for.body6
+  %q.39 = phi i8* [ %q.0, %for.body6.lr.ph ], [ %incdec.ptr7, %for.body6 ]
+  %backslashes.38 = phi i32 [ %backslashes.0, %for.body6.lr.ph ], [ %dec9, %for.body6 ]
+  %incdec.ptr7 = getelementptr inbounds i8* %q.39, i64 1
+  store i8 92, i8* %incdec.ptr7, align 1
+  %dec9 = add nsw i32 %backslashes.38, -1
+  %tobool5 = icmp eq i32 %dec9, 0
+  br i1 %tobool5, label %for.cond4.for.end10_crit_edge, label %for.body6
+
+for.cond4.for.end10_crit_edge:                    ; preds = %for.body6
+  br label %for.end10
+
+for.end10:                                        ; preds = %for.cond4.for.end10_crit_edge, %for.cond4.preheader
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopRotate/multiple-exits.ll b/test/Transforms/LoopRotate/multiple-exits.ll
new file mode 100644
index 0000000000000..675d71f60da42
--- /dev/null
+++ b/test/Transforms/LoopRotate/multiple-exits.ll
@@ -0,0 +1,236 @@
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; PR7447
+define i32 @test1([100 x i32]* nocapture %a) nounwind readonly {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond1, %entry
+  %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ]
+  %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ]
+  br i1 %i.0, label %for.cond1, label %return
+
+for.cond1:                                        ; preds = %for.cond, %land.rhs
+  %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ]
+  %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ]
+  %cmp2 = icmp ult i32 %i.1, 100
+  br i1 %cmp2, label %land.rhs, label %for.cond
+
+land.rhs:                                         ; preds = %for.cond1
+  %conv = zext i32 %i.1 to i64
+  %arrayidx = getelementptr inbounds [100 x i32]* %a, i64 0, i64 %conv
+  %0 = load i32* %arrayidx, align 4
+  %add = add i32 %0, %sum.1
+  %cmp4 = icmp ugt i32 %add, 1000
+  %inc = add i32 %i.1, 1
+  br i1 %cmp4, label %return, label %for.cond1
+
+return:                                           ; preds = %for.cond, %land.rhs
+  %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
+  ret i32 %retval.0
+
+; CHECK: @test1
+; CHECK: for.cond1.preheader:
+; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ]
+; CHECK: br label %for.cond1
+
+; CHECK: for.cond1:
+; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ]
+; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ]
+; CHECK: %cmp2 = icmp ult i32 %i.1, 100
+; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit
+}
+
+define void @test2(i32 %x) nounwind {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+  %cmp = icmp eq i32 %i.0, %x
+  br i1 %cmp, label %return.loopexit, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  %call = tail call i32 @foo(i32 %i.0) nounwind
+  %tobool = icmp eq i32 %call, 0
+  br i1 %tobool, label %if.end, label %a
+
+if.end:                                           ; preds = %for.body
+  %call1 = tail call i32 @foo(i32 42) nounwind
+  %inc = add i32 %i.0, 1
+  br label %for.cond
+
+a:                                                ; preds = %for.body
+  %call2 = tail call i32 @bar(i32 1) nounwind
+  br label %return
+
+return.loopexit:                                  ; preds = %for.cond
+  br label %return
+
+return:                                           ; preds = %return.loopexit, %a
+  ret void
+
+; CHECK: @test2
+; CHECK: if.end:
+; CHECK: %inc = add i32 %i.02, 1
+; CHECK: %cmp = icmp eq i32 %inc, %x
+; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body
+}
+
+declare i32 @foo(i32)
+
+declare i32 @bar(i32)
+
+@_ZTIi = external constant i8*
+
+; Verify dominators.
+define void @test3(i32 %x) {
+entry:
+  %cmp2 = icmp eq i32 0, %x
+  br i1 %cmp2, label %try.cont.loopexit, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc
+  %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  invoke void @_Z3fooi(i32 %i.03)
+          to label %for.inc unwind label %lpad
+
+for.inc:                                          ; preds = %for.body
+  %inc = add i32 %i.03, 1
+  %cmp = icmp eq i32 %inc, %x
+  br i1 %cmp, label %for.cond.try.cont.loopexit_crit_edge, label %for.body
+
+lpad:                                             ; preds = %for.body
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %1 = extractvalue { i8*, i32 } %0, 0
+  %2 = extractvalue { i8*, i32 } %0, 1
+  %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %matches = icmp eq i32 %2, %3
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %4 = tail call i8* @__cxa_begin_catch(i8* %1) nounwind
+  br i1 true, label %invoke.cont2.loopexit, label %for.body.i.lr.ph
+
+for.body.i.lr.ph:                                 ; preds = %catch
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.body.i.lr.ph, %for.inc.i
+  %i.0.i1 = phi i32 [ 0, %for.body.i.lr.ph ], [ %inc.i, %for.inc.i ]
+  invoke void @_Z3fooi(i32 %i.0.i1)
+          to label %for.inc.i unwind label %lpad.i
+
+for.inc.i:                                        ; preds = %for.body.i
+  %inc.i = add i32 %i.0.i1, 1
+  %cmp.i = icmp eq i32 %inc.i, 0
+  br i1 %cmp.i, label %for.cond.i.invoke.cont2.loopexit_crit_edge, label %for.body.i
+
+lpad.i:                                           ; preds = %for.body.i
+  %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %6 = extractvalue { i8*, i32 } %5, 0
+  %7 = extractvalue { i8*, i32 } %5, 1
+  %matches.i = icmp eq i32 %7, %3
+  br i1 %matches.i, label %catch.i, label %lpad1.body
+
+catch.i:                                          ; preds = %lpad.i
+  %8 = tail call i8* @__cxa_begin_catch(i8* %6) nounwind
+  invoke void @test3(i32 0)
+          to label %invoke.cont2.i unwind label %lpad1.i
+
+invoke.cont2.i:                                   ; preds = %catch.i
+  tail call void @__cxa_end_catch() nounwind
+  br label %invoke.cont2
+
+lpad1.i:                                          ; preds = %catch.i
+  %9 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  %10 = extractvalue { i8*, i32 } %9, 0
+  %11 = extractvalue { i8*, i32 } %9, 1
+  tail call void @__cxa_end_catch() nounwind
+  br label %lpad1.body
+
+for.cond.i.invoke.cont2.loopexit_crit_edge:       ; preds = %for.inc.i
+  br label %invoke.cont2.loopexit
+
+invoke.cont2.loopexit:                            ; preds = %for.cond.i.invoke.cont2.loopexit_crit_edge, %catch
+  br label %invoke.cont2
+
+invoke.cont2:                                     ; preds = %invoke.cont2.loopexit, %invoke.cont2.i
+  tail call void @__cxa_end_catch() nounwind
+  br label %try.cont
+
+for.cond.try.cont.loopexit_crit_edge:             ; preds = %for.inc
+  br label %try.cont.loopexit
+
+try.cont.loopexit:                                ; preds = %for.cond.try.cont.loopexit_crit_edge, %entry
+  br label %try.cont
+
+try.cont:                                         ; preds = %try.cont.loopexit, %invoke.cont2
+  ret void
+
+lpad1.body:                                       ; preds = %lpad1.i, %lpad.i
+  %exn.slot.0.i = phi i8* [ %10, %lpad1.i ], [ %6, %lpad.i ]
+  %ehselector.slot.0.i = phi i32 [ %11, %lpad1.i ], [ %7, %lpad.i ]
+  tail call void @__cxa_end_catch() nounwind
+  br label %eh.resume
+
+eh.resume:                                        ; preds = %lpad1.body, %lpad
+  %exn.slot.0 = phi i8* [ %exn.slot.0.i, %lpad1.body ], [ %1, %lpad ]
+  %ehselector.slot.0 = phi i32 [ %ehselector.slot.0.i, %lpad1.body ], [ %2, %lpad ]
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0
+  %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1
+  resume { i8*, i32 } %lpad.val5
+}
+
+declare void @_Z3fooi(i32)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+define void @test4() nounwind uwtable {
+entry:
+  br label %"7"
+
+"3":                                              ; preds = %"7"
+  br i1 undef, label %"31", label %"4"
+
+"4":                                              ; preds = %"3"
+  %. = select i1 undef, float 0x3F50624DE0000000, float undef
+  %0 = add i32 %1, 1
+  br label %"7"
+
+"7":                                              ; preds = %"4", %entry
+  %1 = phi i32 [ %0, %"4" ], [ 0, %entry ]
+  %2 = icmp slt i32 %1, 100
+  br i1 %2, label %"3", label %"8"
+
+"8":                                              ; preds = %"7"
+  br i1 undef, label %"9", label %"31"
+
+"9":                                              ; preds = %"8"
+  br label %"33"
+
+"27":                                             ; preds = %"31"
+  unreachable
+
+"31":                                             ; preds = %"8", %"3"
+  br i1 undef, label %"27", label %"32"
+
+"32":                                             ; preds = %"31"
+  br label %"33"
+
+"33":                                             ; preds = %"32", %"9"
+  ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll b/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
index a6996a81fb074..af3a53708b49f 100644
--- a/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
+++ b/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
@@ -1,15 +1,15 @@
 ; RUN: opt -loop-reduce -S < %s | FileCheck %s
 ;
 ; Test LSR's use of SplitCriticalEdge during phi rewriting.
-; Verify that identical edges are merged. rdar://problem/6453893
 
 target triple = "x86-apple-darwin"
 
-; CHECK: @test
+; Verify that identical edges are merged. rdar://problem/6453893
+; CHECK: @test1
 ; CHECK: bb89:
 ; CHECK: phi i8* [ %lsr.iv.next1, %bbA.bb89_crit_edge ], [ %lsr.iv.next1, %bbB.bb89_crit_edge ]{{$}}
 
-define i8* @test() {
+define i8* @test1() {
 entry:
   br label %loop
 
@@ -41,3 +41,41 @@ bb89:
 exit:
   ret i8* %tmp75phi
 }
+
+; Handle single-predecessor phis: PR13756
+; CHECK: @test2
+; CHECK: bb89:
+; CHECK: phi i8* [ %lsr.iv.next1, %bbA ], [ %lsr.iv.next1, %bbA ], [ %lsr.iv.next1, %bbA ]{{$}}
+define i8* @test2() {
+entry:
+  br label %loop
+
+loop:
+  %rec = phi i32 [ %next, %loop ], [ 0, %entry ]
+  %next = add i32 %rec, 1
+  %tmp75 = getelementptr i8* null, i32 %next
+  br i1 false, label %loop, label %loopexit
+
+loopexit:
+  br i1 false, label %bbA, label %bbB
+
+bbA:
+  switch i32 0, label %bb89 [
+    i32 47, label %bb89
+    i32 58, label %bb89
+  ]
+
+bbB:
+  switch i8 0, label %exit [
+    i8 47, label %exit
+    i8 58, label %exit
+  ]
+
+bb89:
+  %tmp75phi = phi i8* [ %tmp75, %bbA ], [ %tmp75, %bbA ], [ %tmp75, %bbA ]
+  br label %exit
+
+exit:
+  %result = phi i8* [ %tmp75phi, %bb89 ], [ %tmp75, %bbB ], [ %tmp75, %bbB ], [ %tmp75, %bbB ]
+  ret i8* %result
+}
diff --git a/test/Transforms/LoopUnroll/pr11361.ll b/test/Transforms/LoopUnroll/pr11361.ll
index 7ce7f5fe46005..62de2f728d239 100644
--- a/test/Transforms/LoopUnroll/pr11361.ll
+++ b/test/Transforms/LoopUnroll/pr11361.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-unroll -disable-output
+; RUN: opt -loop-unroll -disable-output < %s
 ; PR11361
 
 ; This tests for an iterator invalidation issue.
diff --git a/test/Transforms/LoopUnroll/pr14167.ll b/test/Transforms/LoopUnroll/pr14167.ll
new file mode 100644
index 0000000000000..205ae44b72e48
--- /dev/null
+++ b/test/Transforms/LoopUnroll/pr14167.ll
@@ -0,0 +1,44 @@
+; RUN: opt < %s -S -loop-unroll -unroll-runtime | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define void @test1() nounwind {
+; Ensure that we don't crash when the trip count == -1.
+; CHECK: @test1
+entry:
+  br label %for.cond2.preheader
+
+for.cond2.preheader:                              ; preds = %for.end, %entry
+  br i1 false, label %middle.block, label %vector.ph
+
+vector.ph:                                        ; preds = %for.cond2.preheader
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  br i1 undef, label %middle.block.loopexit, label %vector.body
+
+middle.block.loopexit:                            ; preds = %vector.body
+  br label %middle.block
+
+middle.block:                                     ; preds = %middle.block.loopexit, %for.cond2.preheader
+  br i1 true, label %for.end, label %scalar.preheader
+
+scalar.preheader:                                 ; preds = %middle.block
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body4, %scalar.preheader
+  %indvars.iv = phi i64 [ 16000, %scalar.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 16000
+  br i1 %exitcond, label %for.body4, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body4
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %middle.block
+  br i1 undef, label %for.cond2.preheader, label %for.end15
+
+for.end15:                                        ; preds = %for.end
+  ret void
+}
diff --git a/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
index 61c54ddb156b4..609520064a7a7 100644
--- a/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
+++ b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-unswitch -disable-output
+; RUN: opt -loop-unswitch -disable-output < %s
 ; PR10031
 
 define i32 @test(i32 %command) {
diff --git a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
new file mode 100644
index 0000000000000..0176c9a189666
--- /dev/null
+++ b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce
+
+; Check that we don't fall into an infinite loop.
+define void @test() nounwind {
+entry:
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ br label %for.body
+}
+
+
+
+define void @test2() nounwind {
+entry:
+ br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+ %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ %indvars.iv.next48 = add i64 %indvars.iv47, 1
+ br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+ unreachable
+}
diff --git a/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
new file mode 100644
index 0000000000000..2516e248bc964
--- /dev/null
+++ b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s  -loop-vectorize -dce -force-vector-width=4 
+
+; Check that we don't crash.
+
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm "\09.ident\09\22GCC: (GNU) 4.6.3 LLVM: 3.2svn\22"
+
+@b = common global [32000 x float] zeroinitializer, align 16
+
+define i32 @set1ds(i32 %_n, float* nocapture %arr, float %value, i32 %stride) nounwind uwtable {
+entry:
+  %0 = icmp sgt i32 %_n, 0
+  br i1 %0, label %"3.lr.ph", label %"5"
+
+"3.lr.ph":                                        ; preds = %entry
+  %1 = bitcast float* %arr to i8*
+  %2 = sext i32 %stride to i64
+  br label %"3"
+
+"3":                                              ; preds = %"3.lr.ph", %"3"
+  %indvars.iv = phi i64 [ 0, %"3.lr.ph" ], [ %indvars.iv.next, %"3" ]
+  %3 = shl nsw i64 %indvars.iv, 2
+  %4 = getelementptr inbounds i8* %1, i64 %3
+  %5 = bitcast i8* %4 to float*
+  store float %value, float* %5, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, %2
+  %6 = trunc i64 %indvars.iv.next to i32
+  %7 = icmp slt i32 %6, %_n
+  br i1 %7, label %"3", label %"5"
+
+"5":                                              ; preds = %"3", %entry
+  ret i32 0
+}
+
+define i32 @init(i8* nocapture %name) unnamed_addr nounwind uwtable {
+entry:
+  br label %"3"
+
+"3":                                              ; preds = %"3", %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %"3" ]
+  %0 = shl nsw i64 %indvars.iv, 2
+  %1 = getelementptr inbounds i8* bitcast (float* getelementptr inbounds ([32000 x float]* @b, i64 0, i64 16000) to i8*), i64 %0
+  %2 = bitcast i8* %1 to float*
+  store float -1.000000e+00, float* %2, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 16000
+  br i1 %exitcond, label %"5", label %"3"
+
+"5":                                              ; preds = %"3"
+  ret i32 0
+}
+
+!0 = metadata !{metadata !"alias set 7: float", metadata !1}
+!1 = metadata !{metadata !1}
diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll
new file mode 100644
index 0000000000000..a2d176a534c93
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @read_mod_write_single_ptr
+;CHECK: load <8 x float>
+;CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds float* %a, i64 %indvars.iv
+  %3 = load float* %2, align 4
+  %4 = fmul float %3, 3.000000e+00
+  store float %4, float* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+
+;CHECK: @read_mod_i64
+;CHECK: load <8 x i64>
+;CHECK: ret i32
+define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i64* %a, i64 %indvars.iv
+  %3 = load i64* %2, align 4
+  %4 = mul i64 %3, 3
+  store i64 %4, i64* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
new file mode 100644
index 0000000000000..8f1bb545fa019
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @conversion_cost1
+;CHECK: store <2 x i8>
+;CHECK: ret
+define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 3
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
+  %2 = trunc i64 %indvars.iv to i8
+  %3 = getelementptr inbounds i8* %A, i64 %indvars.iv
+  store i8 %2, i8* %3, align 1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+;CHECK: @conversion_cost2
+;CHECK: <2 x float>
+;CHECK: ret
+define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 9
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+  %2 = add nsw i64 %indvars.iv, 3
+  %3 = trunc i64 %2 to i32
+  %4 = sitofp i32 %3 to float
+  %5 = getelementptr inbounds float* %B, i64 %indvars.iv
+  store float %4, float* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
new file mode 100644
index 0000000000000..628f9912c8c94
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@c = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@d = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK: cost_model_1
+;CHECK: <4 x i32>
+;CHECK: ret void
+define void @cost_model_1() nounwind uwtable noinline ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %0
+  %1 = load i32* %arrayidx, align 8
+  %idxprom1 = sext i32 %1 to i64
+  %arrayidx2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %idxprom1
+  %2 = load i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv
+  %3 = load i32* %arrayidx4, align 4
+  %idxprom5 = sext i32 %3 to i64
+  %arrayidx6 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %idxprom5
+  store i32 %2, i32* %arrayidx6, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
new file mode 100644
index 0000000000000..574c529834ac6
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF=8.
+;CHECK: @example1
+;CHECK: load <8 x i32>
+;CHECK: add nsw <8 x i32>
+;CHECK: store <8 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+
+; Select VF=4 because sext <8 x i16> to <8 x i32> is expensive.
+;CHECK: @example10b
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+  %3 = load i16* %2, align 2
+  %4 = sext i16 %3 to i32
+  %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+  store i32 %4, i32* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %6, label %1
+
+; <label>:6                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/X86/lit.local.cfg b/test/Transforms/LoopVectorize/X86/lit.local.cfg
new file mode 100644
index 0000000000000..a8ad0f1a28b23
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/LoopVectorize/cpp-new-array.ll b/test/Transforms/LoopVectorize/cpp-new-array.ll
new file mode 100644
index 0000000000000..26902eba9e295
--- /dev/null
+++ b/test/Transforms/LoopVectorize/cpp-new-array.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @cpp_new_arrays
+;CHECK: insertelement <4 x i32>
+;CHECK: load <4 x float>
+;CHECK: fadd <4 x float>
+;CHECK: ret i32
+define i32 @cpp_new_arrays() uwtable ssp {
+entry:
+  %call = call noalias i8* @_Znwm(i64 4)
+  %0 = bitcast i8* %call to float*
+  store float 1.000000e+03, float* %0, align 4
+  %call1 = call noalias i8* @_Znwm(i64 4)
+  %1 = bitcast i8* %call1 to float*
+  store float 1.000000e+03, float* %1, align 4
+  %call3 = call noalias i8* @_Znwm(i64 4)
+  %2 = bitcast i8* %call3 to float*
+  store float 1.000000e+03, float* %2, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.01 to i64
+  %arrayidx = getelementptr inbounds float* %0, i64 %idxprom
+  %3 = load float* %arrayidx, align 4
+  %idxprom5 = sext i32 %i.01 to i64
+  %arrayidx6 = getelementptr inbounds float* %1, i64 %idxprom5
+  %4 = load float* %arrayidx6, align 4
+  %add = fadd float %3, %4
+  %idxprom7 = sext i32 %i.01 to i64
+  %arrayidx8 = getelementptr inbounds float* %2, i64 %idxprom7
+  store float %add, float* %arrayidx8, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 1000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %5 = load float* %2, align 4
+  %conv10 = fptosi float %5 to i32
+  ret i32 %conv10
+}
+
+declare noalias i8* @_Znwm(i64)
diff --git a/test/Transforms/LoopVectorize/flags.ll b/test/Transforms/LoopVectorize/flags.ll
new file mode 100644
index 0000000000000..2f22a764572f9
--- /dev/null
+++ b/test/Transforms/LoopVectorize/flags.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @flags1
+;CHECK: load <4 x i32>
+;CHECK: mul nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags1(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 9
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = mul nsw i32 %3, 3
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+
+;CHECK: @flags2
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 9
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = mul i32 %3, 3
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll
new file mode 100644
index 0000000000000..fce29d2404873
--- /dev/null
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -0,0 +1,650 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+@G = common global [32 x [1024 x i32]] zeroinitializer, align 16
+@ub = common global [1024 x i32] zeroinitializer, align 16
+@uc = common global [1024 x i32] zeroinitializer, align 16
+@d = common global [2048 x i32] zeroinitializer, align 16
+@fa = common global [1024 x float] zeroinitializer, align 16
+@fb = common global [1024 x float] zeroinitializer, align 16
+@ic = common global [1024 x i32] zeroinitializer, align 16
+@da = common global [1024 x float] zeroinitializer, align 16
+@db = common global [1024 x float] zeroinitializer, align 16
+@dc = common global [1024 x float] zeroinitializer, align 16
+@dd = common global [1024 x float] zeroinitializer, align 16
+@dj = common global [1024 x i32] zeroinitializer, align 16
+
+;CHECK: @example1
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+;CHECK: @example2
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph5, label %.preheader
+
+..preheader_crit_edge:                            ; preds = %.lr.ph5
+  %phitmp = sext i32 %n to i64
+  br label %.preheader
+
+.preheader:                                       ; preds = %..preheader_crit_edge, %0
+  %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ]
+  %2 = icmp eq i32 %n, 0
+  br i1 %2, label %._crit_edge, label %.lr.ph
+
+.lr.ph5:                                          ; preds = %0, %.lr.ph5
+  %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
+  %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6
+  store i32 %x, i32* %3, align 4
+  %indvars.iv.next7 = add i64 %indvars.iv6, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5
+
+.lr.ph:                                           ; preds = %.preheader, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]
+  %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]
+  %4 = add nsw i32 %.02, -1
+  %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %6 = load i32* %5, align 4
+  %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %8 = load i32* %7, align 4
+  %9 = and i32 %8, %6
+  %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %9, i32* %10, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %11 = icmp eq i32 %4, 0
+  br i1 %11, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %.preheader
+  ret void
+}
+
+; We can't vectorize this loop because it has non-constant loop bounds.
+;CHECK: @example3
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
+  %1 = icmp eq i32 %n, 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
+  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
+  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
+  %2 = add nsw i32 %.05, -1
+  %3 = getelementptr inbounds i32* %.023, i64 1
+  %4 = load i32* %.023, align 16
+  %5 = getelementptr inbounds i32* %.014, i64 1
+  store i32 %4, i32* %.014, align 16
+  %6 = icmp eq i32 %2, 0
+  br i1 %6, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+;CHECK: @example4
+;CHECK: load <4 x i32>
+;CHECK: ret void
+define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
+  %1 = add nsw i32 %n, -1
+  %2 = icmp eq i32 %n, 0
+  br i1 %2, label %.preheader4, label %.lr.ph10
+
+.preheader4:                                      ; preds = %0
+  %3 = icmp sgt i32 %1, 0
+  br i1 %3, label %.lr.ph6, label %._crit_edge
+
+.lr.ph10:                                         ; preds = %0, %.lr.ph10
+  %4 = phi i32 [ %9, %.lr.ph10 ], [ %1, %0 ]
+  %.018 = phi i32* [ %8, %.lr.ph10 ], [ %p, %0 ]
+  %.027 = phi i32* [ %5, %.lr.ph10 ], [ %q, %0 ]
+  %5 = getelementptr inbounds i32* %.027, i64 1
+  %6 = load i32* %.027, align 16
+  %7 = add nsw i32 %6, 5
+  %8 = getelementptr inbounds i32* %.018, i64 1
+  store i32 %7, i32* %.018, align 16
+  %9 = add nsw i32 %4, -1
+  %10 = icmp eq i32 %4, 0
+  br i1 %10, label %._crit_edge, label %.lr.ph10
+
+.preheader:                                       ; preds = %.lr.ph6
+  br i1 %3, label %.lr.ph, label %._crit_edge
+
+.lr.ph6:                                          ; preds = %.preheader4, %.lr.ph6
+  %indvars.iv11 = phi i64 [ %indvars.iv.next12, %.lr.ph6 ], [ 0, %.preheader4 ]
+  %indvars.iv.next12 = add i64 %indvars.iv11, 1
+  %11 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv.next12
+  %12 = load i32* %11, align 4
+  %13 = add nsw i64 %indvars.iv11, 3
+  %14 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %13
+  %15 = load i32* %14, align 4
+  %16 = add nsw i32 %15, %12
+  %17 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv11
+  store i32 %16, i32* %17, align 4
+  %lftr.wideiv13 = trunc i64 %indvars.iv.next12 to i32
+  %exitcond14 = icmp eq i32 %lftr.wideiv13, %1
+  br i1 %exitcond14, label %.preheader, label %.lr.ph6
+
+.lr.ph:                                           ; preds = %.preheader, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.preheader ]
+  %18 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %19 = load i32* %18, align 4
+  %20 = icmp sgt i32 %19, 4
+  %21 = select i1 %20, i32 4, i32 0
+  %22 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  store i32 %21, i32* %22, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %1
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph10, %.preheader4, %.lr.ph, %.preheader
+  ret void
+}
+
+;CHECK: @example8
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example8(i32 %x) nounwind uwtable ssp {
+  br label %.preheader
+
+.preheader:                                       ; preds = %3, %0
+  %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %3 ]
+  br label %1
+
+; <label>:1                                       ; preds = %1, %.preheader
+  %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [32 x [1024 x i32]]* @G, i64 0, i64 %indvars.iv3, i64 %indvars.iv
+  store i32 %x, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %3, label %1
+
+; <label>:3                                       ; preds = %1
+  %indvars.iv.next4 = add i64 %indvars.iv3, 1
+  %lftr.wideiv5 = trunc i64 %indvars.iv.next4 to i32
+  %exitcond6 = icmp eq i32 %lftr.wideiv5, 32
+  br i1 %exitcond6, label %4, label %.preheader
+
+; <label>:4                                       ; preds = %3
+  ret void
+}
+
+;CHECK: @example9
+;CHECK: phi <4 x i32>
+;CHECK: ret i32
+define i32 @example9() nounwind uwtable readonly ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %diff.01 = phi i32 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds [1024 x i32]* @ub, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [1024 x i32]* @uc, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add i32 %3, %diff.01
+  %7 = sub i32 %6, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret i32 %7
+}
+
+;CHECK: @example10a
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: load <4 x i16>
+;CHECK: add <4 x i16>
+;CHECK: store <4 x i16>
+;CHECK: ret void
+define void @example10a(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32* %ib, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %ic, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %8 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+  %9 = load i16* %8, align 2
+  %10 = getelementptr inbounds i16* %sc, i64 %indvars.iv
+  %11 = load i16* %10, align 2
+  %12 = add i16 %11, %9
+  %13 = getelementptr inbounds i16* %sa, i64 %indvars.iv
+  store i16 %12, i16* %13, align 2
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %14, label %1
+
+; <label>:14                                      ; preds = %1
+  ret void
+}
+
+;CHECK: @example10b
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+  %3 = load i16* %2, align 2
+  %4 = sext i16 %3 to i32
+  %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+  store i32 %4, i32* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %6, label %1
+
+; <label>:6                                       ; preds = %1
+  ret void
+}
+
+;CHECK: @example11
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: ret void
+define void @example11() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = shl nsw i64 %indvars.iv, 1
+  %3 = or i64 %2, 1
+  %4 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %3
+  %5 = load i32* %4, align 4
+  %6 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %3
+  %7 = load i32* %6, align 4
+  %8 = mul nsw i32 %7, %5
+  %9 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %2
+  %10 = load i32* %9, align 8
+  %11 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %2
+  %12 = load i32* %11, align 8
+  %13 = mul nsw i32 %12, %10
+  %14 = sub nsw i32 %8, %13
+  %15 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %14, i32* %15, align 4
+  %16 = mul nsw i32 %7, %10
+  %17 = mul nsw i32 %12, %5
+  %18 = add nsw i32 %17, %16
+  %19 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv
+  store i32 %18, i32* %19, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 512
+  br i1 %exitcond, label %20, label %1
+
+; <label>:20                                      ; preds = %1
+  ret void
+}
+
+;CHECK: @example12
+;CHECK: trunc <4 x i64>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example12() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = trunc i64 %indvars.iv to i32
+  store i32 %3, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %4, label %1
+
+; <label>:4                                       ; preds = %1
+  ret void
+}
+
+; Can't vectorize because of reductions.
+;CHECK: @example13
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example13(i32** nocapture %A, i32** nocapture %B, i32* nocapture %out) nounwind uwtable ssp {
+  br label %.preheader
+
+.preheader:                                       ; preds = %14, %0
+  %indvars.iv4 = phi i64 [ 0, %0 ], [ %indvars.iv.next5, %14 ]
+  %1 = getelementptr inbounds i32** %A, i64 %indvars.iv4
+  %2 = load i32** %1, align 8
+  %3 = getelementptr inbounds i32** %B, i64 %indvars.iv4
+  %4 = load i32** %3, align 8
+  br label %5
+
+; <label>:5                                       ; preds = %.preheader, %5
+  %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %5 ]
+  %diff.02 = phi i32 [ 0, %.preheader ], [ %11, %5 ]
+  %6 = getelementptr inbounds i32* %2, i64 %indvars.iv
+  %7 = load i32* %6, align 4
+  %8 = getelementptr inbounds i32* %4, i64 %indvars.iv
+  %9 = load i32* %8, align 4
+  %10 = add i32 %7, %diff.02
+  %11 = sub i32 %10, %9
+  %indvars.iv.next = add i64 %indvars.iv, 8
+  %12 = trunc i64 %indvars.iv.next to i32
+  %13 = icmp slt i32 %12, 1024
+  br i1 %13, label %5, label %14
+
+; <label>:14                                      ; preds = %5
+  %15 = getelementptr inbounds i32* %out, i64 %indvars.iv4
+  store i32 %11, i32* %15, align 4
+  %indvars.iv.next5 = add i64 %indvars.iv4, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next5 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 32
+  br i1 %exitcond, label %16, label %.preheader
+
+; <label>:16                                      ; preds = %14
+  ret void
+}
+
+; Can't vectorize because of reductions.
+;CHECK: @example14
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocapture %out) nounwind uwtable ssp {
+.preheader3:
+  br label %.preheader
+
+.preheader:                                       ; preds = %11, %.preheader3
+  %indvars.iv7 = phi i64 [ 0, %.preheader3 ], [ %indvars.iv.next8, %11 ]
+  %sum.05 = phi i32 [ 0, %.preheader3 ], [ %10, %11 ]
+  br label %0
+
+; <label>:0                                       ; preds = %0, %.preheader
+  %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %0 ]
+  %sum.12 = phi i32 [ %sum.05, %.preheader ], [ %10, %0 ]
+  %1 = getelementptr inbounds i32** %in, i64 %indvars.iv
+  %2 = load i32** %1, align 8
+  %3 = getelementptr inbounds i32* %2, i64 %indvars.iv7
+  %4 = load i32* %3, align 4
+  %5 = getelementptr inbounds i32** %coeff, i64 %indvars.iv
+  %6 = load i32** %5, align 8
+  %7 = getelementptr inbounds i32* %6, i64 %indvars.iv7
+  %8 = load i32* %7, align 4
+  %9 = mul nsw i32 %8, %4
+  %10 = add nsw i32 %9, %sum.12
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %11, label %0
+
+; <label>:11                                      ; preds = %0
+  %indvars.iv.next8 = add i64 %indvars.iv7, 1
+  %lftr.wideiv9 = trunc i64 %indvars.iv.next8 to i32
+  %exitcond10 = icmp eq i32 %lftr.wideiv9, 32
+  br i1 %exitcond10, label %.preheader3.1, label %.preheader
+
+.preheader3.1:                                    ; preds = %11
+  store i32 %10, i32* %out, align 4
+  br label %.preheader.1
+
+.preheader.1:                                     ; preds = %24, %.preheader3.1
+  %indvars.iv7.1 = phi i64 [ 0, %.preheader3.1 ], [ %indvars.iv.next8.1, %24 ]
+  %sum.05.1 = phi i32 [ 0, %.preheader3.1 ], [ %23, %24 ]
+  br label %12
+
+; <label>:12                                      ; preds = %12, %.preheader.1
+  %indvars.iv.1 = phi i64 [ 0, %.preheader.1 ], [ %13, %12 ]
+  %sum.12.1 = phi i32 [ %sum.05.1, %.preheader.1 ], [ %23, %12 ]
+  %13 = add nsw i64 %indvars.iv.1, 1
+  %14 = getelementptr inbounds i32** %in, i64 %13
+  %15 = load i32** %14, align 8
+  %16 = getelementptr inbounds i32* %15, i64 %indvars.iv7.1
+  %17 = load i32* %16, align 4
+  %18 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.1
+  %19 = load i32** %18, align 8
+  %20 = getelementptr inbounds i32* %19, i64 %indvars.iv7.1
+  %21 = load i32* %20, align 4
+  %22 = mul nsw i32 %21, %17
+  %23 = add nsw i32 %22, %sum.12.1
+  %lftr.wideiv.1 = trunc i64 %13 to i32
+  %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 1024
+  br i1 %exitcond.1, label %24, label %12
+
+; <label>:24                                      ; preds = %12
+  %indvars.iv.next8.1 = add i64 %indvars.iv7.1, 1
+  %lftr.wideiv9.1 = trunc i64 %indvars.iv.next8.1 to i32
+  %exitcond10.1 = icmp eq i32 %lftr.wideiv9.1, 32
+  br i1 %exitcond10.1, label %.preheader3.2, label %.preheader.1
+
+.preheader3.2:                                    ; preds = %24
+  %25 = getelementptr inbounds i32* %out, i64 1
+  store i32 %23, i32* %25, align 4
+  br label %.preheader.2
+
+.preheader.2:                                     ; preds = %38, %.preheader3.2
+  %indvars.iv7.2 = phi i64 [ 0, %.preheader3.2 ], [ %indvars.iv.next8.2, %38 ]
+  %sum.05.2 = phi i32 [ 0, %.preheader3.2 ], [ %37, %38 ]
+  br label %26
+
+; <label>:26                                      ; preds = %26, %.preheader.2
+  %indvars.iv.2 = phi i64 [ 0, %.preheader.2 ], [ %indvars.iv.next.2, %26 ]
+  %sum.12.2 = phi i32 [ %sum.05.2, %.preheader.2 ], [ %37, %26 ]
+  %27 = add nsw i64 %indvars.iv.2, 2
+  %28 = getelementptr inbounds i32** %in, i64 %27
+  %29 = load i32** %28, align 8
+  %30 = getelementptr inbounds i32* %29, i64 %indvars.iv7.2
+  %31 = load i32* %30, align 4
+  %32 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.2
+  %33 = load i32** %32, align 8
+  %34 = getelementptr inbounds i32* %33, i64 %indvars.iv7.2
+  %35 = load i32* %34, align 4
+  %36 = mul nsw i32 %35, %31
+  %37 = add nsw i32 %36, %sum.12.2
+  %indvars.iv.next.2 = add i64 %indvars.iv.2, 1
+  %lftr.wideiv.2 = trunc i64 %indvars.iv.next.2 to i32
+  %exitcond.2 = icmp eq i32 %lftr.wideiv.2, 1024
+  br i1 %exitcond.2, label %38, label %26
+
+; <label>:38                                      ; preds = %26
+  %indvars.iv.next8.2 = add i64 %indvars.iv7.2, 1
+  %lftr.wideiv9.2 = trunc i64 %indvars.iv.next8.2 to i32
+  %exitcond10.2 = icmp eq i32 %lftr.wideiv9.2, 32
+  br i1 %exitcond10.2, label %.preheader3.3, label %.preheader.2
+
+.preheader3.3:                                    ; preds = %38
+  %39 = getelementptr inbounds i32* %out, i64 2
+  store i32 %37, i32* %39, align 4
+  br label %.preheader.3
+
+.preheader.3:                                     ; preds = %52, %.preheader3.3
+  %indvars.iv7.3 = phi i64 [ 0, %.preheader3.3 ], [ %indvars.iv.next8.3, %52 ]
+  %sum.05.3 = phi i32 [ 0, %.preheader3.3 ], [ %51, %52 ]
+  br label %40
+
+; <label>:40                                      ; preds = %40, %.preheader.3
+  %indvars.iv.3 = phi i64 [ 0, %.preheader.3 ], [ %indvars.iv.next.3, %40 ]
+  %sum.12.3 = phi i32 [ %sum.05.3, %.preheader.3 ], [ %51, %40 ]
+  %41 = add nsw i64 %indvars.iv.3, 3
+  %42 = getelementptr inbounds i32** %in, i64 %41
+  %43 = load i32** %42, align 8
+  %44 = getelementptr inbounds i32* %43, i64 %indvars.iv7.3
+  %45 = load i32* %44, align 4
+  %46 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.3
+  %47 = load i32** %46, align 8
+  %48 = getelementptr inbounds i32* %47, i64 %indvars.iv7.3
+  %49 = load i32* %48, align 4
+  %50 = mul nsw i32 %49, %45
+  %51 = add nsw i32 %50, %sum.12.3
+  %indvars.iv.next.3 = add i64 %indvars.iv.3, 1
+  %lftr.wideiv.3 = trunc i64 %indvars.iv.next.3 to i32
+  %exitcond.3 = icmp eq i32 %lftr.wideiv.3, 1024
+  br i1 %exitcond.3, label %52, label %40
+
+; <label>:52                                      ; preds = %40
+  %indvars.iv.next8.3 = add i64 %indvars.iv7.3, 1
+  %lftr.wideiv9.3 = trunc i64 %indvars.iv.next8.3 to i32
+  %exitcond10.3 = icmp eq i32 %lftr.wideiv9.3, 32
+  br i1 %exitcond10.3, label %53, label %.preheader.3
+
+; <label>:53                                      ; preds = %52
+  %54 = getelementptr inbounds i32* %out, i64 3
+  store i32 %51, i32* %54, align 4
+  ret void
+}
+
+; Not expected to vectorize: the loop reads %b backwards (index step -1) into a running sum.
+;CHECK: @example21
+;CHECK-NOT: <4 x i32>
+;CHECK: ret i32
+define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = sext i32 %n to i64
+  br label %3
+
+; <label>:3                                       ; preds = %.lr.ph, %3
+  %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ]
+  %a.02 = phi i32 [ 0, %.lr.ph ], [ %6, %3 ]
+  %indvars.iv.next = add i64 %indvars.iv, -1
+  %4 = getelementptr inbounds i32* %b, i64 %indvars.iv.next
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %a.02
+  %7 = trunc i64 %indvars.iv.next to i32
+  %8 = icmp sgt i32 %7, 0
+  br i1 %8, label %3, label %._crit_edge
+
+._crit_edge:                                      ; preds = %3, %0
+  %a.0.lcssa = phi i32 [ 0, %0 ], [ %6, %3 ]
+  ret i32 %a.0.lcssa
+}
+
+; Can't vectorize because there are multiple PHIs.
+;CHECK: @example23
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example23(i16* nocapture %src, i32* nocapture %dst) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16* %.04, i64 1
+  %3 = load i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %7, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+;CHECK: @example24
+;CHECK: shufflevector <4 x i16>
+;CHECK: ret void
+define void @example24(i16 signext %x, i16 signext %y) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [1024 x float]* @fa, i64 0, i64 %indvars.iv
+  %3 = load float* %2, align 4
+  %4 = getelementptr inbounds [1024 x float]* @fb, i64 0, i64 %indvars.iv
+  %5 = load float* %4, align 4
+  %6 = fcmp olt float %3, %5
+  %x.y = select i1 %6, i16 %x, i16 %y
+  %7 = sext i16 %x.y to i32
+  %8 = getelementptr inbounds [1024 x i32]* @ic, i64 0, i64 %indvars.iv
+  store i32 %7, i32* %8, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %9, label %1
+
+; <label>:9                                       ; preds = %1
+  ret void
+}
+
+;CHECK: @example25
+;CHECK: and <4 x i1>
+;CHECK: zext <4 x i1>
+;CHECK: ret void
+define void @example25() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [1024 x float]* @da, i64 0, i64 %indvars.iv
+  %3 = load float* %2, align 4
+  %4 = getelementptr inbounds [1024 x float]* @db, i64 0, i64 %indvars.iv
+  %5 = load float* %4, align 4
+  %6 = fcmp olt float %3, %5
+  %7 = getelementptr inbounds [1024 x float]* @dc, i64 0, i64 %indvars.iv
+  %8 = load float* %7, align 4
+  %9 = getelementptr inbounds [1024 x float]* @dd, i64 0, i64 %indvars.iv
+  %10 = load float* %9, align 4
+  %11 = fcmp olt float %8, %10
+  %12 = and i1 %6, %11
+  %13 = zext i1 %12 to i32
+  %14 = getelementptr inbounds [1024 x i32]* @dj, i64 0, i64 %indvars.iv
+  store i32 %13, i32* %14, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %15, label %1
+
+; <label>:15                                      ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll
new file mode 100644
index 0000000000000..71ea7689fc049
--- /dev/null
+++ b/test/Transforms/LoopVectorize/increment.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+;  for (i=0; i<n; i++){
+;    a[i] += i;
+;  }
+;CHECK: @inc
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = trunc i64 %indvars.iv to i32
+  %5 = add nsw i32 %3, %4
+  store i32 %5, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+; Can't vectorize this loop because the access to A[B[i]] is non-linear.
+;
+;  for (i = 0; i < n; ++i) {
+;    A[B[i]]++;
+;  }
+;CHECK: @histogram
+;CHECK-NOT: <4 x i32>
+;CHECK: ret i32
+define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1
+  %1 = load i32* %arrayidx2, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 0
+}
diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll
new file mode 100644
index 0000000000000..b31bceb50df6b
--- /dev/null
+++ b/test/Transforms/LoopVectorize/induction_plus.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@array = common global [1024 x i32] zeroinitializer, align 16
+
+;CHECK: @array_at_plus_one
+;CHECK: add <4 x i64>
+;CHECK: trunc <4 x i64>
+;CHECK: add i64 %index, 12
+;CHECK: ret i32
+define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = add nsw i64 %indvars.iv, 12
+  %3 = getelementptr inbounds [1024 x i32]* @array, i64 0, i64 %2
+  %4 = trunc i64 %indvars.iv to i32
+  store i32 %4, i32* %3, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/lit.local.cfg b/test/Transforms/LoopVectorize/lit.local.cfg
new file mode 100644
index 0000000000000..19eebc0ac7ac3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll
new file mode 100644
index 0000000000000..1a6c15ed96c42
--- /dev/null
+++ b/test/Transforms/LoopVectorize/non-const-n.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK: @example1
+;CHECK: shl i32
+;CHECK: zext i32
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i32 %n) nounwind uwtable ssp {
+  %n4 = shl i32 %n, 2
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n4
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll
new file mode 100644
index 0000000000000..b4d1bac132f06
--- /dev/null
+++ b/test/Transforms/LoopVectorize/read-only.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @read_only_func
+;CHECK: load <4 x i32>
+;CHECK: ret i32
+define i32 @read_only_func(i32* nocapture %A, i32* nocapture %B, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = add nsw i64 %indvars.iv, 13
+  %5 = getelementptr inbounds i32* %B, i64 %4
+  %6 = load i32* %5, align 4
+  %7 = shl i32 %6, 1
+  %8 = add i32 %3, %sum.02
+  %9 = add i32 %8, %7
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll
new file mode 100644
index 0000000000000..c1848b35fc6e1
--- /dev/null
+++ b/test/Transforms/LoopVectorize/reduction.ll
@@ -0,0 +1,232 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @reduction_sum
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = add i32 %sum.02, %6
+  %8 = add i32 %7, %3
+  %9 = add i32 %8, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+;CHECK: @reduction_prod
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = mul i32 %prod.02, %6
+  %8 = mul i32 %7, %3
+  %9 = mul i32 %8, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
+  ret i32 %prod.0.lcssa
+}
+
+;CHECK: @reduction_mix
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: mul nsw <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = mul nsw i32 %5, %3
+  %7 = trunc i64 %indvars.iv to i32
+  %8 = add i32 %sum.02, %7
+  %9 = add i32 %8, %6
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+;CHECK: @reduction_mul
+;CHECK: mul <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = add i32 %3, %6
+  %8 = add i32 %7, %5
+  %9 = mul i32 %8, %sum.02
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]  ; NOTE(review): zero-trip value is 0 but the loop seed above is 19 - confirm intended
+  ret i32 %sum.0.lcssa
+}
+
+;CHECK: @start_at_non_zero
+;CHECK: phi <4 x i32>
+;CHECK: <i32 120, i32 0, i32 0, i32 0>
+;CHECK: ret i32
+define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
+  %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv
+  %1 = load i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %sum.09
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+;CHECK: @reduction_and
+;CHECK: and <4 x i32>
+;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
+;CHECK: ret i32
+define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %1 = load i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %and = and i32 %add, %result.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
+  ret i32 %result.0.lcssa
+}
+
+;CHECK: @reduction_or
+;CHECK: or <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %1 = load i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %or = or i32 %add, %result.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
+  ret i32 %result.0.lcssa
+}
+
+;CHECK: @reduction_xor
+;CHECK: xor <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %1 = load i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %xor = xor i32 %add, %result.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
+  ret i32 %result.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
new file mode 100644
index 0000000000000..23933cf7c7dba
--- /dev/null
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Make sure we vectorize this loop:
+; int foo(float *a, float *b, int n) {
+;   for (int i=0; i<n; ++i)
+;     a[i] = b[i] * 3;
+; }
+
+;CHECK: load <4 x float>
+define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4, !tbaa !0
+  %mul = fmul float %0, 3.000000e+00
+  %arrayidx2 = getelementptr inbounds float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 undef
+}
+
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll
new file mode 100644
index 0000000000000..e537bde31bb0f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/scalar-select.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK: @example1
+;CHECK: load <4 x i32>
+; make sure that we have a scalar condition and a vector operand.
+;CHECK: select i1 %cond, <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i1 %cond) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %sel = select i1 %cond, i32 %6, i32 zeroinitializer
+  store i32 %sel, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/small-loop.ll b/test/Transforms/LoopVectorize/small-loop.ll
new file mode 100644
index 0000000000000..4a6e4b231dfe5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/small-loop.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK: @example1
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 8  ;   <-----  A really small trip count.
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/start-non-zero.ll b/test/Transforms/LoopVectorize/start-non-zero.ll
new file mode 100644
index 0000000000000..5aa3bc034d0b0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/start-non-zero.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @start_at_nonzero
+;CHECK: mul nuw <4 x i32>
+;CHECK: ret i32
+define i32 @start_at_nonzero(i32* nocapture %a, i32 %start, i32 %end) nounwind uwtable ssp {
+entry:
+  %cmp3 = icmp slt i32 %start, %end
+  br i1 %cmp3, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = sext i32 %start to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %1 = load i32* %arrayidx, align 4, !tbaa !0
+  %mul = mul nuw i32 %1, 333
+  store i32 %mul, i32* %arrayidx, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %2 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %2, %end
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 4
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/write-only.ll b/test/Transforms/LoopVectorize/write-only.ll
new file mode 100644
index 0000000000000..eb027604134f3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/write-only.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @read_mod_write_single_ptr
+;CHECK: load <4 x float>
+;CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds float* %a, i64 %indvars.iv
+  %3 = load float* %2, align 4
+  %4 = fmul float %3, 3.000000e+00
+  store float %4, float* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
index 61ba3c7e6cc55..597b69dee3d4a 100644
--- a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
+++ b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
@@ -9,11 +9,11 @@ declare void @g(%a*)
 define float @f() {
 entry:
   %a_var = alloca %a
-  %b_var = alloca %b
+  %b_var = alloca %b, align 1
   call void @g(%a* %a_var)
   %a_i8 = bitcast %a* %a_var to i8*
   %b_i8 = bitcast %b* %b_var to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_i8, i8* %a_i8, i32 4, i32 4, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_i8, i8* %a_i8, i32 4, i32 1, i1 false)
   %tmp1 = getelementptr %b* %b_var, i32 0, i32 0
   %tmp2 = load float* %tmp1
   ret float %tmp2
diff --git a/test/Transforms/MemCpyOpt/align.ll b/test/Transforms/MemCpyOpt/align.ll
index b1f900d9da4c6..1b98f6ad383f9 100644
--- a/test/Transforms/MemCpyOpt/align.ll
+++ b/test/Transforms/MemCpyOpt/align.ll
@@ -1,12 +1,15 @@
-; RUN: opt < %s -S -memcpyopt | FileCheck %s
+; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
 ; The resulting memset is only 4-byte aligned, despite containing
 ; a 16-byte aligned store in the middle.
 
-; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false)
-
 define void @foo(i32* %p) {
+; CHECK: @foo
+; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false)
   %a0 = getelementptr i32* %p, i64 0
   store i32 0, i32* %a0, align 4
   %a1 = getelementptr i32* %p, i64 1
@@ -17,3 +20,18 @@ define void @foo(i32* %p) {
   store i32 0, i32* %a3, align 4
   ret void
 }
+
+; Replacing %a8 with %a4 in the memset requires boosting the alignment of %a4.
+
+define void @bar() {
+; CHECK: @bar
+; CHECK: %a4 = alloca i32, align 8
+; CHECK-NOT: memcpy
+  %a4 = alloca i32, align 4
+  %a8 = alloca i32, align 8
+  %a8.cast = bitcast i32* %a8 to i8*
+  %a4.cast = bitcast i32* %a4 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %a8.cast, i8 0, i64 4, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a4.cast, i8* %a8.cast, i64 4, i32 4, i1 false)
+  ret void
+}
diff --git a/test/Transforms/MemCpyOpt/form-memset.ll b/test/Transforms/MemCpyOpt/form-memset.ll
index 8832f897b089f..f63b1dcfdd5f6 100644
--- a/test/Transforms/MemCpyOpt/form-memset.ll
+++ b/test/Transforms/MemCpyOpt/form-memset.ll
@@ -248,3 +248,27 @@ entry:
 ; CHECK: @test8
 ; CHECK: store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %0, align 16
 }
+
+@test9buf = internal unnamed_addr global [16 x i64] zeroinitializer, align 16
+
+define void @test9() nounwind {
+  store i8 -1, i8* bitcast ([16 x i64]* @test9buf to i8*), align 16
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 1), align 1
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 2), align 2
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 3), align 1
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 4), align 4
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 5), align 1
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 6), align 2
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 7), align 1
+  store i8 -1, i8* bitcast (i64* getelementptr inbounds ([16 x i64]* @test9buf, i64 0, i64 1) to i8*), align 8
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 9), align 1
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 10), align 2
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 11), align 1
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 12), align 4
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 13), align 1
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 14), align 2
+  store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 15), align 1
+  ret void
+; CHECK: @test9(
+; CHECK: call void @llvm.memset.p0i8.i64(i8* bitcast ([16 x i64]* @test9buf to i8*), i8 -1, i64 16, i32 16, i1 false)
+}
diff --git a/test/Transforms/MetaRenamer/lit.local.cfg b/test/Transforms/MetaRenamer/lit.local.cfg
new file mode 100644
index 0000000000000..c6106e4746f2d
--- /dev/null
+++ b/test/Transforms/MetaRenamer/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll']
diff --git a/test/Transforms/MetaRenamer/metarenamer.ll b/test/Transforms/MetaRenamer/metarenamer.ll
new file mode 100644
index 0000000000000..ad41bcf50f194
--- /dev/null
+++ b/test/Transforms/MetaRenamer/metarenamer.ll
@@ -0,0 +1,96 @@
+; RUN: opt %s -metarenamer -S | FileCheck %s
+
+; CHECK: target triple {{.*}}
+; CHECK-NOT: {{^x*}}xxx{{^x*}}
+; CHECK: ret i32 6
+
+target triple = "x86_64-pc-linux-gnu"
+
+%struct.bar_xxx = type { i32, double }
+%struct.foo_xxx = type { i32, float, %struct.bar_xxx }
+
+@func_5_xxx.static_local_3_xxx = internal global i32 3, align 4
+@global_3_xxx = common global i32 0, align 4
+
+@func_7_xxx = alias weak i32 (...)* @aliased_func_7_xxx
+
+declare i32 @aliased_func_7_xxx(...)
+
+define i32 @func_3_xxx() nounwind uwtable ssp {
+  ret i32 3
+}
+
+define void @func_4_xxx(%struct.foo_xxx* sret %agg.result) nounwind uwtable ssp {
+  %1 = alloca %struct.foo_xxx, align 8
+  %2 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 0
+  store i32 1, i32* %2, align 4
+  %3 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 1
+  store float 2.000000e+00, float* %3, align 4
+  %4 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 2
+  %5 = getelementptr inbounds %struct.bar_xxx* %4, i32 0, i32 0
+  store i32 3, i32* %5, align 4
+  %6 = getelementptr inbounds %struct.bar_xxx* %4, i32 0, i32 1
+  store double 4.000000e+00, double* %6, align 8
+  %7 = bitcast %struct.foo_xxx* %agg.result to i8*
+  %8 = bitcast %struct.foo_xxx* %1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %8, i64 24, i32 8, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define i32 @func_5_xxx(i32 %arg_1_xxx, i32 %arg_2_xxx, i32 %arg_3_xxx, i32 %arg_4_xxx) nounwind uwtable ssp {
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  %local_1_xxx = alloca i32, align 4
+  %local_2_xxx = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %arg_1_xxx, i32* %1, align 4
+  store i32 %arg_2_xxx, i32* %2, align 4
+  store i32 %arg_3_xxx, i32* %3, align 4
+  store i32 %arg_4_xxx, i32* %4, align 4
+  store i32 1, i32* %local_1_xxx, align 4
+  store i32 2, i32* %local_2_xxx, align 4
+  store i32 0, i32* %i, align 4
+  br label %5
+
+; <label>:5                                       ; preds = %9, %0
+  %6 = load i32* %i, align 4
+  %7 = icmp slt i32 %6, 10
+  br i1 %7, label %8, label %12
+
+; <label>:8                                       ; preds = %5
+  br label %9
+
+; <label>:9                                       ; preds = %8
+  %10 = load i32* %i, align 4
+  %11 = add nsw i32 %10, 1
+  store i32 %11, i32* %i, align 4
+  br label %5
+
+; <label>:12                                      ; preds = %5
+  %13 = load i32* %local_1_xxx, align 4
+  %14 = load i32* %1, align 4
+  %15 = add nsw i32 %13, %14
+  %16 = load i32* %local_2_xxx, align 4
+  %17 = add nsw i32 %15, %16
+  %18 = load i32* %2, align 4
+  %19 = add nsw i32 %17, %18
+  %20 = load i32* @func_5_xxx.static_local_3_xxx, align 4
+  %21 = add nsw i32 %19, %20
+  %22 = load i32* %3, align 4
+  %23 = add nsw i32 %21, %22
+  %24 = load i32* %4, align 4
+  %25 = add nsw i32 %23, %24
+  ret i32 %25
+}
+
+define i32 @varargs_func_6_xxx(i32 %arg_1_xxx, i32 %arg_2_xxx, ...) nounwind uwtable ssp {
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  store i32 %arg_1_xxx, i32* %1, align 4
+  store i32 %arg_2_xxx, i32* %2, align 4
+  ret i32 6
+}
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index 0a7ba5de71bc7..7b64b1be7c622 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -1,4 +1,4 @@
-; RUN: opt -objc-arc -S < %s | FileCheck %s
+; RUN: opt -basicaa -objc-arc -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64"
 
@@ -1498,7 +1498,7 @@ define i8* @test49(i8* %p) nounwind {
 }
 
 ; Do delete retain+release with intervening stores of the
-; address value;
+; address value.
 
 ; CHECK: define void @test50(
 ; CHECK-NOT: @objc_
diff --git a/test/Transforms/ObjCARC/nested.ll b/test/Transforms/ObjCARC/nested.ll
index a618a21d8bb39..32be03ec6ae05 100644
--- a/test/Transforms/ObjCARC/nested.ll
+++ b/test/Transforms/ObjCARC/nested.ll
@@ -16,6 +16,10 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
 declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
 declare void @use(i8*)
 declare void @objc_release(i8*)
+declare i8* @def()
+declare void @__crasher_block_invoke(i8* nocapture)
+declare i8* @objc_retainBlock(i8*)
+declare void @__crasher_block_invoke1(i8* nocapture)
 
 !0 = metadata !{}
 
@@ -279,11 +283,13 @@ forcoll.empty:
   ret void
 }
 
-; Delete a nested retain+release pair.
+; TODO: Delete a nested retain+release pair.
+; The optimizer currently can't do this, because it isn't sophisticated enough in
+; reasoning about nesting.
 
 ; CHECK: define void @test6(
 ; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
 ; CHECK: }
 define void @test6() nounwind {
 entry:
@@ -345,11 +351,13 @@ forcoll.empty:
   ret void
 }
 
-; Delete a nested retain+release pair.
+; TODO: Delete a nested retain+release pair.
+; The optimizer currently can't do this, because it isn't sophisticated enough in
+; reasoning about nesting.
 
 ; CHECK: define void @test7(
 ; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
 ; CHECK: }
 define void @test7() nounwind {
 entry:
@@ -553,12 +561,12 @@ forcoll.empty:
   ret void
 }
 
-; Like test9, but without a split backedge. This we can optimize.
+; Like test9, but without a split backedge. TODO: optimize this.
 
 ; CHECK: define void @test9b(
 ; CHECK: call i8* @objc_retain
 ; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
 ; CHECK: }
 define void @test9b() nounwind {
 entry:
@@ -687,12 +695,12 @@ forcoll.empty:
   ret void
 }
 
-; Like test10, but without a split backedge. This we can optimize.
+; Like test10, but without a split backedge. TODO: optimize this.
 
 ; CHECK: define void @test10b(
 ; CHECK: call i8* @objc_retain
 ; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
 ; CHECK: }
 define void @test10b() nounwind {
 entry:
@@ -751,3 +759,64 @@ forcoll.empty:
   call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
   ret void
 }
+
+; Pointers to strong pointers can obscure provenance relationships. Be conservative
+; in the face of escaping pointers. rdar://12150909.
+
+%struct.__block_d = type { i64, i64 }
+
+@_NSConcreteStackBlock = external global i8*
+@__block_d_tmp = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
+@__block_d_tmp5 = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
+
+; CHECK: define void @test11(
+; CHECK: tail call i8* @objc_retain(i8* %call) nounwind
+; CHECK: tail call i8* @objc_retain(i8* %call) nounwind
+; CHECK: call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+; CHECK: }
+define void @test11() {
+entry:
+  %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, align 8
+  %block9 = alloca <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, align 8
+  %call = call i8* @def(), !clang.arc.no_objc_arc_exceptions !0
+  %foo = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 5
+  %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 0
+  store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
+  %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 1
+  store i32 1107296256, i32* %block.flags, align 8
+  %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 2
+  store i32 0, i32* %block.reserved, align 4
+  %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 3
+  store i8* bitcast (void (i8*)* @__crasher_block_invoke to i8*), i8** %block.invoke, align 8
+  %block.d = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 4
+  store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp to %struct.__block_d*), %struct.__block_d** %block.d, align 8
+  %foo2 = tail call i8* @objc_retain(i8* %call) nounwind
+  store i8* %foo2, i8** %foo, align 8
+  %foo4 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block to i8*
+  %foo5 = call i8* @objc_retainBlock(i8* %foo4) nounwind
+  call void @use(i8* %foo5), !clang.arc.no_objc_arc_exceptions !0
+  call void @objc_release(i8* %foo5) nounwind
+  %strongdestroy = load i8** %foo, align 8
+  call void @objc_release(i8* %strongdestroy) nounwind, !clang.imprecise_release !0
+  %foo10 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 5
+  %block.isa11 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 0
+  store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa11, align 8
+  %block.flags12 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 1
+  store i32 1107296256, i32* %block.flags12, align 8
+  %block.reserved13 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 2
+  store i32 0, i32* %block.reserved13, align 4
+  %block.invoke14 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 3
+  store i8* bitcast (void (i8*)* @__crasher_block_invoke1 to i8*), i8** %block.invoke14, align 8
+  %block.d15 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 4
+  store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp5 to %struct.__block_d*), %struct.__block_d** %block.d15, align 8
+  %foo18 = call i8* @objc_retain(i8* %call) nounwind
+  store i8* %call, i8** %foo10, align 8
+  %foo20 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9 to i8*
+  %foo21 = call i8* @objc_retainBlock(i8* %foo20) nounwind
+  call void @use(i8* %foo21), !clang.arc.no_objc_arc_exceptions !0
+  call void @objc_release(i8* %foo21) nounwind
+  %strongdestroy25 = load i8** %foo10, align 8
+  call void @objc_release(i8* %strongdestroy25) nounwind, !clang.imprecise_release !0
+  call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+  ret void
+}
diff --git a/test/Transforms/ObjCARC/path-overflow.ll b/test/Transforms/ObjCARC/path-overflow.ll
new file mode 100644
index 0000000000000..e7866ed1b4421
--- /dev/null
+++ b/test/Transforms/ObjCARC/path-overflow.ll
@@ -0,0 +1,329 @@
+; RUN: opt -objc-arc -S < %s
+; rdar://12277446
+
+; The total number of paths grows exponentially with the number of branches, and a
+; computation of this number can overflow any reasonable fixed-sized integer.
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.0.0"
+
+%struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768 = type { i32*, i32, i8*, i32 }
+
+@_unnamed_cfstring_591 = external constant %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768, section "__DATA,__cfstring"
+
+declare i8* @objc_retain(i8*) nonlazybind
+
+declare void @objc_release(i8*) nonlazybind
+
+define hidden void @foo() {
+entry:
+  br i1 undef, label %msgSend.nullinit, label %msgSend.call
+
+msgSend.call:                                     ; preds = %entry
+  br label %msgSend.cont
+
+msgSend.nullinit:                                 ; preds = %entry
+  br label %msgSend.cont
+
+msgSend.cont:                                     ; preds = %msgSend.nullinit, %msgSend.call
+  %0 = bitcast %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768* @_unnamed_cfstring_591 to i8*
+  %1 = call i8* @objc_retain(i8* %0) nounwind
+  br i1 undef, label %msgSend.nullinit33, label %msgSend.call32
+
+msgSend.call32:                                   ; preds = %msgSend.cont
+  br label %msgSend.cont34
+
+msgSend.nullinit33:                               ; preds = %msgSend.cont
+  br label %msgSend.cont34
+
+msgSend.cont34:                                   ; preds = %msgSend.nullinit33, %msgSend.call32
+  br i1 undef, label %msgSend.nullinit38, label %msgSend.call37
+
+msgSend.call37:                                   ; preds = %msgSend.cont34
+  br label %msgSend.cont39
+
+msgSend.nullinit38:                               ; preds = %msgSend.cont34
+  br label %msgSend.cont39
+
+msgSend.cont39:                                   ; preds = %msgSend.nullinit38, %msgSend.call37
+  br i1 undef, label %msgSend.nullinit49, label %msgSend.call48
+
+msgSend.call48:                                   ; preds = %msgSend.cont39
+  br label %msgSend.cont50
+
+msgSend.nullinit49:                               ; preds = %msgSend.cont39
+  br label %msgSend.cont50
+
+msgSend.cont50:                                   ; preds = %msgSend.nullinit49, %msgSend.call48
+  br i1 undef, label %msgSend.nullinit61, label %msgSend.call60
+
+msgSend.call60:                                   ; preds = %msgSend.cont50
+  br label %msgSend.cont62
+
+msgSend.nullinit61:                               ; preds = %msgSend.cont50
+  br label %msgSend.cont62
+
+msgSend.cont62:                                   ; preds = %msgSend.nullinit61, %msgSend.call60
+  br i1 undef, label %msgSend.nullinit67, label %msgSend.call66
+
+msgSend.call66:                                   ; preds = %msgSend.cont62
+  br label %msgSend.cont68
+
+msgSend.nullinit67:                               ; preds = %msgSend.cont62
+  br label %msgSend.cont68
+
+msgSend.cont68:                                   ; preds = %msgSend.nullinit67, %msgSend.call66
+  br i1 undef, label %msgSend.nullinit84, label %msgSend.call83
+
+msgSend.call83:                                   ; preds = %msgSend.cont68
+  br label %msgSend.cont85
+
+msgSend.nullinit84:                               ; preds = %msgSend.cont68
+  br label %msgSend.cont85
+
+msgSend.cont85:                                   ; preds = %msgSend.nullinit84, %msgSend.call83
+  br i1 undef, label %msgSend.nullinit90, label %msgSend.call89
+
+msgSend.call89:                                   ; preds = %msgSend.cont85
+  br label %msgSend.cont91
+
+msgSend.nullinit90:                               ; preds = %msgSend.cont85
+  br label %msgSend.cont91
+
+msgSend.cont91:                                   ; preds = %msgSend.nullinit90, %msgSend.call89
+  br i1 undef, label %msgSend.nullinit104, label %msgSend.call103
+
+msgSend.call103:                                  ; preds = %msgSend.cont91
+  br label %msgSend.cont105
+
+msgSend.nullinit104:                              ; preds = %msgSend.cont91
+  br label %msgSend.cont105
+
+msgSend.cont105:                                  ; preds = %msgSend.nullinit104, %msgSend.call103
+  br i1 undef, label %land.lhs.true, label %if.end123
+
+land.lhs.true:                                    ; preds = %msgSend.cont105
+  br i1 undef, label %if.then117, label %if.end123
+
+if.then117:                                       ; preds = %land.lhs.true
+  br label %if.end123
+
+if.end123:                                        ; preds = %if.then117, %land.lhs.true, %msgSend.cont105
+  br i1 undef, label %msgSend.nullinit132, label %msgSend.call131
+
+msgSend.call131:                                  ; preds = %if.end123
+  br label %msgSend.cont133
+
+msgSend.nullinit132:                              ; preds = %if.end123
+  br label %msgSend.cont133
+
+msgSend.cont133:                                  ; preds = %msgSend.nullinit132, %msgSend.call131
+  br i1 undef, label %msgSend.nullinit139, label %msgSend.call138
+
+msgSend.call138:                                  ; preds = %msgSend.cont133
+  br label %msgSend.cont140
+
+msgSend.nullinit139:                              ; preds = %msgSend.cont133
+  br label %msgSend.cont140
+
+msgSend.cont140:                                  ; preds = %msgSend.nullinit139, %msgSend.call138
+  br i1 undef, label %if.then151, label %if.end157
+
+if.then151:                                       ; preds = %msgSend.cont140
+  br label %if.end157
+
+if.end157:                                        ; preds = %if.then151, %msgSend.cont140
+  br i1 undef, label %msgSend.nullinit164, label %msgSend.call163
+
+msgSend.call163:                                  ; preds = %if.end157
+  br label %msgSend.cont165
+
+msgSend.nullinit164:                              ; preds = %if.end157
+  br label %msgSend.cont165
+
+msgSend.cont165:                                  ; preds = %msgSend.nullinit164, %msgSend.call163
+  br i1 undef, label %msgSend.nullinit176, label %msgSend.call175
+
+msgSend.call175:                                  ; preds = %msgSend.cont165
+  br label %msgSend.cont177
+
+msgSend.nullinit176:                              ; preds = %msgSend.cont165
+  br label %msgSend.cont177
+
+msgSend.cont177:                                  ; preds = %msgSend.nullinit176, %msgSend.call175
+  br i1 undef, label %land.lhs.true181, label %if.end202
+
+land.lhs.true181:                                 ; preds = %msgSend.cont177
+  br i1 undef, label %if.then187, label %if.end202
+
+if.then187:                                       ; preds = %land.lhs.true181
+  br i1 undef, label %msgSend.nullinit199, label %msgSend.call198
+
+msgSend.call198:                                  ; preds = %if.then187
+  br label %msgSend.cont200
+
+msgSend.nullinit199:                              ; preds = %if.then187
+  br label %msgSend.cont200
+
+msgSend.cont200:                                  ; preds = %msgSend.nullinit199, %msgSend.call198
+  br label %if.end202
+
+if.end202:                                        ; preds = %msgSend.cont200, %land.lhs.true181, %msgSend.cont177
+  br i1 undef, label %msgSend.nullinit236, label %msgSend.call235
+
+msgSend.call235:                                  ; preds = %if.end202
+  br label %msgSend.cont237
+
+msgSend.nullinit236:                              ; preds = %if.end202
+  br label %msgSend.cont237
+
+msgSend.cont237:                                  ; preds = %msgSend.nullinit236, %msgSend.call235
+  br i1 undef, label %msgSend.nullinit254, label %msgSend.call253
+
+msgSend.call253:                                  ; preds = %msgSend.cont237
+  br label %msgSend.cont255
+
+msgSend.nullinit254:                              ; preds = %msgSend.cont237
+  br label %msgSend.cont255
+
+msgSend.cont255:                                  ; preds = %msgSend.nullinit254, %msgSend.call253
+  br i1 undef, label %msgSend.nullinit269, label %msgSend.call268
+
+msgSend.call268:                                  ; preds = %msgSend.cont255
+  br label %msgSend.cont270
+
+msgSend.nullinit269:                              ; preds = %msgSend.cont255
+  br label %msgSend.cont270
+
+msgSend.cont270:                                  ; preds = %msgSend.nullinit269, %msgSend.call268
+  br i1 undef, label %msgSend.nullinit281, label %msgSend.call280
+
+msgSend.call280:                                  ; preds = %msgSend.cont270
+  br label %msgSend.cont282
+
+msgSend.nullinit281:                              ; preds = %msgSend.cont270
+  br label %msgSend.cont282
+
+msgSend.cont282:                                  ; preds = %msgSend.nullinit281, %msgSend.call280
+  br i1 undef, label %msgSend.nullinit287, label %msgSend.call286
+
+msgSend.call286:                                  ; preds = %msgSend.cont282
+  br label %msgSend.cont288
+
+msgSend.nullinit287:                              ; preds = %msgSend.cont282
+  br label %msgSend.cont288
+
+msgSend.cont288:                                  ; preds = %msgSend.nullinit287, %msgSend.call286
+  br i1 undef, label %msgSend.nullinit303, label %msgSend.call302
+
+msgSend.call302:                                  ; preds = %msgSend.cont288
+  br label %msgSend.cont304
+
+msgSend.nullinit303:                              ; preds = %msgSend.cont288
+  br label %msgSend.cont304
+
+msgSend.cont304:                                  ; preds = %msgSend.nullinit303, %msgSend.call302
+  br i1 undef, label %msgSend.nullinit344, label %msgSend.call343
+
+msgSend.call343:                                  ; preds = %msgSend.cont304
+  br label %msgSend.cont345
+
+msgSend.nullinit344:                              ; preds = %msgSend.cont304
+  br label %msgSend.cont345
+
+msgSend.cont345:                                  ; preds = %msgSend.nullinit344, %msgSend.call343
+  br i1 undef, label %msgSend.nullinit350, label %msgSend.call349
+
+msgSend.call349:                                  ; preds = %msgSend.cont345
+  br label %msgSend.cont351
+
+msgSend.nullinit350:                              ; preds = %msgSend.cont345
+  br label %msgSend.cont351
+
+msgSend.cont351:                                  ; preds = %msgSend.nullinit350, %msgSend.call349
+  br i1 undef, label %msgSend.nullinit366, label %msgSend.call365
+
+msgSend.call365:                                  ; preds = %msgSend.cont351
+  br label %msgSend.cont367
+
+msgSend.nullinit366:                              ; preds = %msgSend.cont351
+  br label %msgSend.cont367
+
+msgSend.cont367:                                  ; preds = %msgSend.nullinit366, %msgSend.call365
+  br i1 undef, label %msgSend.nullinit376, label %msgSend.call375
+
+msgSend.call375:                                  ; preds = %msgSend.cont367
+  br label %msgSend.cont377
+
+msgSend.nullinit376:                              ; preds = %msgSend.cont367
+  br label %msgSend.cont377
+
+msgSend.cont377:                                  ; preds = %msgSend.nullinit376, %msgSend.call375
+  br i1 undef, label %if.then384, label %if.else401
+
+if.then384:                                       ; preds = %msgSend.cont377
+  br i1 undef, label %msgSend.nullinit392, label %msgSend.call391
+
+msgSend.call391:                                  ; preds = %if.then384
+  br label %msgSend.cont393
+
+msgSend.nullinit392:                              ; preds = %if.then384
+  br label %msgSend.cont393
+
+msgSend.cont393:                                  ; preds = %msgSend.nullinit392, %msgSend.call391
+  br label %if.end418
+
+if.else401:                                       ; preds = %msgSend.cont377
+  br i1 undef, label %msgSend.nullinit409, label %msgSend.call408
+
+msgSend.call408:                                  ; preds = %if.else401
+  br label %msgSend.cont410
+
+msgSend.nullinit409:                              ; preds = %if.else401
+  br label %msgSend.cont410
+
+msgSend.cont410:                                  ; preds = %msgSend.nullinit409, %msgSend.call408
+  br label %if.end418
+
+if.end418:                                        ; preds = %msgSend.cont410, %msgSend.cont393
+  br i1 undef, label %msgSend.nullinit470, label %msgSend.call469
+
+msgSend.call469:                                  ; preds = %if.end418
+  br label %msgSend.cont471
+
+msgSend.nullinit470:                              ; preds = %if.end418
+  br label %msgSend.cont471
+
+msgSend.cont471:                                  ; preds = %msgSend.nullinit470, %msgSend.call469
+  br i1 undef, label %msgSend.nullinit484, label %msgSend.call483
+
+msgSend.call483:                                  ; preds = %msgSend.cont471
+  br label %msgSend.cont485
+
+msgSend.nullinit484:                              ; preds = %msgSend.cont471
+  br label %msgSend.cont485
+
+msgSend.cont485:                                  ; preds = %msgSend.nullinit484, %msgSend.call483
+  br i1 undef, label %msgSend.nullinit500, label %msgSend.call499
+
+msgSend.call499:                                  ; preds = %msgSend.cont485
+  br label %msgSend.cont501
+
+msgSend.nullinit500:                              ; preds = %msgSend.cont485
+  br label %msgSend.cont501
+
+msgSend.cont501:                                  ; preds = %msgSend.nullinit500, %msgSend.call499
+  br i1 undef, label %msgSend.nullinit506, label %msgSend.call505
+
+msgSend.call505:                                  ; preds = %msgSend.cont501
+  br label %msgSend.cont507
+
+msgSend.nullinit506:                              ; preds = %msgSend.cont501
+  br label %msgSend.cont507
+
+msgSend.cont507:                                  ; preds = %msgSend.nullinit506, %msgSend.call505
+  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+!0 = metadata !{}
diff --git a/test/Transforms/PhaseOrdering/gdce.ll b/test/Transforms/PhaseOrdering/gdce.ll
new file mode 100644
index 0000000000000..273e47e97cb4b
--- /dev/null
+++ b/test/Transforms/PhaseOrdering/gdce.ll
@@ -0,0 +1,106 @@
+; RUN: opt -O2 -S %s | FileCheck %s
+
+; Run global DCE to eliminate unused ctor and dtor.
+; rdar://9142819
+
+; CHECK: main
+; CHECK-NOT: _ZN4BaseC1Ev
+; CHECK-NOT: _ZN4BaseD1Ev
+; CHECK-NOT: _ZN4BaseD2Ev
+; CHECK-NOT: _ZN4BaseC2Ev
+; CHECK-NOT: _ZN4BaseD0Ev
+
+%class.Base = type { i32 (...)** }
+
+@_ZTV4Base = linkonce_odr unnamed_addr constant [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI4Base to i8*), i8* bitcast (void (%class.Base*)* @_ZN4BaseD1Ev to i8*), i8* bitcast (void (%class.Base*)* @_ZN4BaseD0Ev to i8*)]
+@_ZTVN10__cxxabiv117__class_type_infoE = external global i8*
+@_ZTS4Base = linkonce_odr constant [6 x i8] c"4Base\00"
+@_ZTI4Base = linkonce_odr unnamed_addr constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([6 x i8]* @_ZTS4Base, i32 0, i32 0) }
+
+define i32 @main() uwtable ssp {
+entry:
+  %retval = alloca i32, align 4
+  %b = alloca %class.Base, align 8
+  %cleanup.dest.slot = alloca i32
+  store i32 0, i32* %retval
+  call void @_ZN4BaseC1Ev(%class.Base* %b)
+  store i32 0, i32* %retval
+  store i32 1, i32* %cleanup.dest.slot
+  call void @_ZN4BaseD1Ev(%class.Base* %b)
+  %0 = load i32* %retval
+  ret i32 %0
+}
+
+define linkonce_odr void @_ZN4BaseC1Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base** %this.addr
+  call void @_ZN4BaseC2Ev(%class.Base* %this1)
+  ret void
+}
+
+define linkonce_odr void @_ZN4BaseD1Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base** %this.addr
+  call void @_ZN4BaseD2Ev(%class.Base* %this1)
+  ret void
+}
+
+define linkonce_odr void @_ZN4BaseD2Ev(%class.Base* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base** %this.addr
+  ret void
+}
+
+define linkonce_odr void @_ZN4BaseC2Ev(%class.Base* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base** %this.addr
+  %0 = bitcast %class.Base* %this1 to i8***
+  store i8** getelementptr inbounds ([4 x i8*]* @_ZTV4Base, i64 0, i64 2), i8*** %0
+  ret void
+}
+
+define linkonce_odr void @_ZN4BaseD0Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base** %this.addr
+  invoke void @_ZN4BaseD1Ev(%class.Base* %this1)
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %0 = bitcast %class.Base* %this1 to i8*
+  call void @_ZdlPv(i8* %0) nounwind
+  ret void
+
+lpad:                                             ; preds = %entry
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  %2 = extractvalue { i8*, i32 } %1, 0
+  store i8* %2, i8** %exn.slot
+  %3 = extractvalue { i8*, i32 } %1, 1
+  store i32 %3, i32* %ehselector.slot
+  %4 = bitcast %class.Base* %this1 to i8*
+  call void @_ZdlPv(i8* %4) nounwind
+  br label %eh.resume
+
+eh.resume:                                        ; preds = %lpad
+  %exn = load i8** %exn.slot
+  %sel = load i32* %ehselector.slot
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0
+  %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1
+  resume { i8*, i32 } %lpad.val2
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZdlPv(i8*) nounwind
diff --git a/test/Transforms/Reassociate/crash.ll b/test/Transforms/Reassociate/crash.ll
index ce586e15fbcf0..e29b5dc9c0ce8 100644
--- a/test/Transforms/Reassociate/crash.ll
+++ b/test/Transforms/Reassociate/crash.ll
@@ -144,3 +144,31 @@ define i32 @sozefx_(i32 %x, i32 %y) {
   %t6 = add i32 %t4, %t5
   ret i32 %t6
 }
+
+define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) {
+  %tmp1 = mul i32 %arg1, 2
+  %tmp2 = mul i32 %tmp1, 3
+  %tmp3 = mul i32 %arg2, 2
+  %tmp4 = add i32 %tmp1, 1 ; dead code
+  %ret = add i32 %tmp2, %tmp3
+  ret i32 %ret
+}
+
+; PR14060
+define i8 @hang(i8 %p, i8 %p0, i8 %p1, i8 %p2, i8 %p3, i8 %p4, i8 %p5, i8 %p6, i8 %p7, i8 %p8, i8 %p9) {
+  %tmp = zext i1 false to i8
+  %tmp16 = or i8 %tmp, 1
+  %tmp22 = or i8 %p7, %p0
+  %tmp23 = or i8 %tmp16, %tmp22
+  %tmp28 = or i8 %p9, %p1
+  %tmp31 = or i8 %tmp23, %p2
+  %tmp32 = or i8 %tmp31, %tmp28
+  %tmp38 = or i8 %p8, %p3
+  %tmp39 = or i8 %tmp16, %tmp38
+  %tmp43 = or i8 %tmp39, %p4
+  %tmp44 = or i8 %tmp43, 1
+  %tmp47 = or i8 %tmp32, %p5
+  %tmp50 = or i8 %tmp47, %p6
+  %tmp51 = or i8 %tmp44, %tmp50
+  ret i8 %tmp51
+}
diff --git a/test/Transforms/SCCP/loadtest.ll b/test/Transforms/SCCP/loadtest.ll
index add2af483f560..dd1dba69143c7 100644
--- a/test/Transforms/SCCP/loadtest.ll
+++ b/test/Transforms/SCCP/loadtest.ll
@@ -1,8 +1,9 @@
 ; This test makes sure that these instructions are properly constant propagated.
 
-target datalayout = "e-p:32:32"
+; RUN: opt < %s -default-data-layout="e-p:32:32" -sccp -S | FileCheck %s
+; RUN: opt < %s -default-data-layout="E-p:32:32" -sccp -S | FileCheck %s
 
-; RUN: opt < %s -sccp -S | not grep load
+; CHECK-NOT: load
 
 
 @X = constant i32 42		; <i32*> [#uses=1]
diff --git a/test/Transforms/SROA/alignment.ll b/test/Transforms/SROA/alignment.ll
new file mode 100644
index 0000000000000..ad5fb6c4a5d8c
--- /dev/null
+++ b/test/Transforms/SROA/alignment.ll
@@ -0,0 +1,171 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
+
+define void @test1({ i8, i8 }* %a, { i8, i8 }* %b) {
+; CHECK: @test1
+; CHECK: %[[gep_a0:.*]] = getelementptr inbounds { i8, i8 }* %a, i64 0, i32 0
+; CHECK: %[[a0:.*]] = load i8* %[[gep_a0]], align 16
+; CHECK: %[[gep_a1:.*]] = getelementptr inbounds { i8, i8 }* %a, i64 0, i32 1
+; CHECK: %[[a1:.*]] = load i8* %[[gep_a1]], align 1
+; CHECK: %[[gep_b0:.*]] = getelementptr inbounds { i8, i8 }* %b, i64 0, i32 0
+; CHECK: store i8 %[[a0]], i8* %[[gep_b0]], align 16
+; CHECK: %[[gep_b1:.*]] = getelementptr inbounds { i8, i8 }* %b, i64 0, i32 1
+; CHECK: store i8 %[[a1]], i8* %[[gep_b1]], align 1
+; CHECK: ret void
+
+entry:
+  %alloca = alloca { i8, i8 }, align 16
+  %gep_a = getelementptr { i8, i8 }* %a, i32 0, i32 0
+  %gep_alloca = getelementptr { i8, i8 }* %alloca, i32 0, i32 0
+  %gep_b = getelementptr { i8, i8 }* %b, i32 0, i32 0
+
+  store i8 420, i8* %gep_alloca, align 16
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %gep_alloca, i8* %gep_a, i32 2, i32 16, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %gep_b, i8* %gep_alloca, i32 2, i32 16, i1 false)
+  ret void
+}
+
+define void @test2() {
+; CHECK: @test2
+; CHECK: alloca i16
+; CHECK: load i8* %{{.*}}
+; CHECK: store i8 42, i8* %{{.*}}
+; CHECK: ret void
+
+entry:
+  %a = alloca { i8, i8, i8, i8 }, align 2
+  %gep1 = getelementptr { i8, i8, i8, i8 }* %a, i32 0, i32 1
+  %cast1 = bitcast i8* %gep1 to i16*
+  store volatile i16 0, i16* %cast1
+  %gep2 = getelementptr { i8, i8, i8, i8 }* %a, i32 0, i32 2
+  %result = load i8* %gep2
+  store i8 42, i8* %gep2
+  ret void
+}
+
+define void @PR13920(<2 x i64>* %a, i16* %b) {
+; Test that alignments on memcpy intrinsics get propagated to loads and stores.
+; CHECK: @PR13920
+; CHECK: load <2 x i64>* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
+; CHECK: ret void
+
+entry:
+  %aa = alloca <2 x i64>, align 16
+  %aptr = bitcast <2 x i64>* %a to i8*
+  %aaptr = bitcast <2 x i64>* %aa to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %aaptr, i8* %aptr, i32 16, i32 2, i1 false)
+  %bptr = bitcast i16* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %bptr, i8* %aaptr, i32 16, i32 2, i1 false)
+  ret void
+}
+
+define void @test3(i8* %x) {
+; Test that when we promote an alloca to a type with lower ABI alignment, we
+; provide the needed explicit alignment that code using the alloca may be
+; expecting. However, also check that any offset within an alloca can in turn
+; reduce the alignment.
+; CHECK: @test3
+; CHECK: alloca [22 x i8], align 8
+; CHECK: alloca [18 x i8], align 2
+; CHECK: ret void
+
+entry:
+  %a = alloca { i8*, i8*, i8* }
+  %b = alloca { i8*, i8*, i8* }
+  %a_raw = bitcast { i8*, i8*, i8* }* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a_raw, i8* %x, i32 22, i32 8, i1 false)
+  %b_raw = bitcast { i8*, i8*, i8* }* %b to i8*
+  %b_gep = getelementptr i8* %b_raw, i32 6
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_gep, i8* %x, i32 18, i32 2, i1 false)
+  ret void
+}
+
+define void @test5() {
+; Test that we preserve underaligned loads and stores when splitting.
+; CHECK: @test5
+; CHECK: alloca [9 x i8]
+; CHECK: alloca [9 x i8]
+; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
+; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load double* %{{.*}}, align 1
+; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
+; CHECK: load i16* %{{.*}}, align 1
+; CHECK: ret void
+
+entry:
+  %a = alloca [18 x i8]
+  %raw1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 0
+  %ptr1 = bitcast i8* %raw1 to double*
+  store volatile double 0.0, double* %ptr1, align 1
+  %weird_gep1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 7
+  %weird_cast1 = bitcast i8* %weird_gep1 to i16*
+  %weird_load1 = load i16* %weird_cast1, align 1
+
+  %raw2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 9
+  %ptr2 = bitcast i8* %raw2 to double*
+  %d1 = load double* %ptr1, align 1
+  store volatile double %d1, double* %ptr2, align 1
+  %weird_gep2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 16
+  %weird_cast2 = bitcast i8* %weird_gep2 to i16*
+  %weird_load2 = load i16* %weird_cast2, align 1
+
+  ret void
+}
+
+define void @test6() {
+; Test that we promote alignment when the underlying alloca switches to one
+; that innately provides it.
+; CHECK: @test6
+; CHECK: alloca double
+; CHECK: alloca double
+; CHECK-NOT: align
+; CHECK: ret void
+
+entry:
+  %a = alloca [16 x i8]
+  %raw1 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 0
+  %ptr1 = bitcast i8* %raw1 to double*
+  store volatile double 0.0, double* %ptr1, align 1
+
+  %raw2 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 8
+  %ptr2 = bitcast i8* %raw2 to double*
+  %val = load double* %ptr1, align 1
+  store volatile double %val, double* %ptr2, align 1
+
+  ret void
+}
+
+define void @test7(i8* %out) {
+; Test that we properly compute the destination alignment when rewriting
+; memcpys as direct loads or stores.
+; CHECK: @test7
+; CHECK-NOT: alloca
+
+entry:
+  %a = alloca [16 x i8]
+  %raw1 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 0
+  %ptr1 = bitcast i8* %raw1 to double*
+  %raw2 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 8
+  %ptr2 = bitcast i8* %raw2 to double*
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %raw1, i8* %out, i32 16, i32 0, i1 false)
+; CHECK: %[[val2:.*]] = load double* %{{.*}}, align 1
+; CHECK: %[[val1:.*]] = load double* %{{.*}}, align 1
+
+  %val1 = load double* %ptr2, align 1
+  %val2 = load double* %ptr1, align 1
+
+  store double %val1, double* %ptr1, align 1
+  store double %val2, double* %ptr2, align 1
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %raw1, i32 16, i32 0, i1 false)
+; CHECK: store double %[[val1]], double* %{{.*}}, align 1
+; CHECK: store double %[[val2]], double* %{{.*}}, align 1
+
+  ret void
+; CHECK: ret void
+}
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
new file mode 100644
index 0000000000000..b363eefb3f9d9
--- /dev/null
+++ b/test/Transforms/SROA/basictest.ll
@@ -0,0 +1,1136 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+define i32 @test0() {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: ret i32
+
+entry:
+  %a1 = alloca i32
+  %a2 = alloca float
+
+  %a1.i8 = bitcast i32* %a1 to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %a1.i8)
+
+  store i32 0, i32* %a1
+  %v1 = load i32* %a1
+
+  call void @llvm.lifetime.end(i64 4, i8* %a1.i8)
+
+  %a2.i8 = bitcast float* %a2 to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %a2.i8)
+
+  store float 0.0, float* %a2
+  %v2 = load float * %a2
+  %v2.int = bitcast float %v2 to i32
+  %sum1 = add i32 %v1, %v2.int
+
+  call void @llvm.lifetime.end(i64 4, i8* %a2.i8)
+
+  ret i32 %sum1
+}
+
+define i32 @test1() {
+; CHECK: @test1
+; CHECK-NOT: alloca
+; CHECK: ret i32 0
+
+entry:
+  %X = alloca { i32, float }
+  %Y = getelementptr { i32, float }* %X, i64 0, i32 0
+  store i32 0, i32* %Y
+  %Z = load i32* %Y
+  ret i32 %Z
+}
+
+define i64 @test2(i64 %X) {
+; CHECK: @test2
+; CHECK-NOT: alloca
+; CHECK: ret i64 %X
+
+entry:
+  %A = alloca [8 x i8]
+  %B = bitcast [8 x i8]* %A to i64*
+  store i64 %X, i64* %B
+  br label %L2
+
+L2:
+  %Z = load i64* %B
+  ret i64 %Z
+}
+
+define void @test3(i8* %dst, i8* %src) {
+; CHECK: @test3
+
+entry:
+  %a = alloca [300 x i8]
+; CHECK-NOT:  alloca
+; CHECK:      %[[test3_a1:.*]] = alloca [42 x i8]
+; CHECK-NEXT: %[[test3_a2:.*]] = alloca [99 x i8]
+; CHECK-NEXT: %[[test3_a3:.*]] = alloca [16 x i8]
+; CHECK-NEXT: %[[test3_a4:.*]] = alloca [42 x i8]
+; CHECK-NEXT: %[[test3_a5:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test3_a6:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test3_a7:.*]] = alloca [85 x i8]
+
+  %b = getelementptr [300 x i8]* %a, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b, i8* %src, i32 300, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %src, i32 42
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 42
+; CHECK-NEXT: %[[test3_r1:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 43
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [99 x i8]* %[[test3_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 99
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 142
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 16
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 158
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 42
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 200
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 207
+; CHECK-NEXT: %[[test3_r2:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 208
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 215
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 85
+
+  ; Clobber a single element of the array, this should be promotable.
+  %c = getelementptr [300 x i8]* %a, i64 0, i64 42
+  store i8 0, i8* %c
+
+  ; Make a sequence of overlapping stores to the array. These overlap both in
+  ; forward strides and in shrinking accesses.
+  %overlap.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 142
+  %overlap.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 143
+  %overlap.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 144
+  %overlap.4.i8 = getelementptr [300 x i8]* %a, i64 0, i64 145
+  %overlap.5.i8 = getelementptr [300 x i8]* %a, i64 0, i64 146
+  %overlap.6.i8 = getelementptr [300 x i8]* %a, i64 0, i64 147
+  %overlap.7.i8 = getelementptr [300 x i8]* %a, i64 0, i64 148
+  %overlap.8.i8 = getelementptr [300 x i8]* %a, i64 0, i64 149
+  %overlap.9.i8 = getelementptr [300 x i8]* %a, i64 0, i64 150
+  %overlap.1.i16 = bitcast i8* %overlap.1.i8 to i16*
+  %overlap.1.i32 = bitcast i8* %overlap.1.i8 to i32*
+  %overlap.1.i64 = bitcast i8* %overlap.1.i8 to i64*
+  %overlap.2.i64 = bitcast i8* %overlap.2.i8 to i64*
+  %overlap.3.i64 = bitcast i8* %overlap.3.i8 to i64*
+  %overlap.4.i64 = bitcast i8* %overlap.4.i8 to i64*
+  %overlap.5.i64 = bitcast i8* %overlap.5.i8 to i64*
+  %overlap.6.i64 = bitcast i8* %overlap.6.i8 to i64*
+  %overlap.7.i64 = bitcast i8* %overlap.7.i8 to i64*
+  %overlap.8.i64 = bitcast i8* %overlap.8.i8 to i64*
+  %overlap.9.i64 = bitcast i8* %overlap.9.i8 to i64*
+  store i8 1, i8* %overlap.1.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+  store i16 1, i16* %overlap.1.i16
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+  store i32 1, i32* %overlap.1.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+  store i64 1, i64* %overlap.1.i64
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i64*
+; CHECK-NEXT: store i64 1, i64* %[[bitcast]]
+  store i64 2, i64* %overlap.2.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 2, i64* %[[bitcast]]
+  store i64 3, i64* %overlap.3.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 3, i64* %[[bitcast]]
+  store i64 4, i64* %overlap.4.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 4, i64* %[[bitcast]]
+  store i64 5, i64* %overlap.5.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 4
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 5, i64* %[[bitcast]]
+  store i64 6, i64* %overlap.6.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 5
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 6, i64* %[[bitcast]]
+  store i64 7, i64* %overlap.7.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 6
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 7, i64* %[[bitcast]]
+  store i64 8, i64* %overlap.8.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 7
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 8, i64* %[[bitcast]]
+  store i64 9, i64* %overlap.9.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 8
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 9, i64* %[[bitcast]]
+
+  ; Make two sequences of overlapping stores with more gaps and irregularities.
+  %overlap2.1.0.i8 = getelementptr [300 x i8]* %a, i64 0, i64 200
+  %overlap2.1.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 201
+  %overlap2.1.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 202
+  %overlap2.1.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 203
+
+  %overlap2.2.0.i8 = getelementptr [300 x i8]* %a, i64 0, i64 208
+  %overlap2.2.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 209
+  %overlap2.2.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 210
+  %overlap2.2.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 211
+
+  %overlap2.1.0.i16 = bitcast i8* %overlap2.1.0.i8 to i16*
+  %overlap2.1.0.i32 = bitcast i8* %overlap2.1.0.i8 to i32*
+  %overlap2.1.1.i32 = bitcast i8* %overlap2.1.1.i8 to i32*
+  %overlap2.1.2.i32 = bitcast i8* %overlap2.1.2.i8 to i32*
+  %overlap2.1.3.i32 = bitcast i8* %overlap2.1.3.i8 to i32*
+  store i8 1,  i8*  %overlap2.1.0.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+  store i16 1, i16* %overlap2.1.0.i16
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a5]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+  store i32 1, i32* %overlap2.1.0.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a5]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+  store i32 2, i32* %overlap2.1.1.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 2, i32* %[[bitcast]]
+  store i32 3, i32* %overlap2.1.2.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 3, i32* %[[bitcast]]
+  store i32 4, i32* %overlap2.1.3.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 4, i32* %[[bitcast]]
+
+  %overlap2.2.0.i32 = bitcast i8* %overlap2.2.0.i8 to i32*
+  %overlap2.2.1.i16 = bitcast i8* %overlap2.2.1.i8 to i16*
+  %overlap2.2.1.i32 = bitcast i8* %overlap2.2.1.i8 to i32*
+  %overlap2.2.2.i32 = bitcast i8* %overlap2.2.2.i8 to i32*
+  %overlap2.2.3.i32 = bitcast i8* %overlap2.2.3.i8 to i32*
+  store i32 1, i32* %overlap2.2.0.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a6]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+  store i8 1,  i8*  %overlap2.2.1.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+  store i16 1, i16* %overlap2.2.1.i16
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+  store i32 1, i32* %overlap2.2.1.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+  store i32 3, i32* %overlap2.2.2.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 3, i32* %[[bitcast]]
+  store i32 4, i32* %overlap2.2.3.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 4, i32* %[[bitcast]]
+
+  %overlap2.prefix = getelementptr i8* %overlap2.1.1.i8, i64 -4
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.prefix, i8* %src, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 39
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %src, i32 3
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 3
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 5
+
+  ; Bridge between the overlapping areas
+  call void @llvm.memset.p0i8.i32(i8* %overlap2.1.2.i8, i8 42, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 2
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[gep]], i8 42, i32 5
+; ...promoted i8 store...
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[gep]], i8 42, i32 2
+
+  ; Entirely within the second overlap.
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.1.i8, i8* %src, i32 5, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 5
+
+  ; Trailing past the second overlap.
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.2.i8, i8* %src, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 2
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 5
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 5
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 3
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 300, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[gep]], i32 42
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 42
+; CHECK-NEXT: store i8 0, i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 43
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [99 x i8]* %[[test3_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 99
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 142
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 16
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 158
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 42
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 200
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 207
+; CHECK-NEXT: store i8 42, i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 208
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 215
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 85
+
+  ret void
+}
+
+define void @test4(i8* %dst, i8* %src) {
+; CHECK: @test4
+
+entry:
+  %a = alloca [100 x i8]
+; CHECK-NOT:  alloca
+; CHECK:      %[[test4_a1:.*]] = alloca [20 x i8]
+; CHECK-NEXT: %[[test4_a2:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a3:.*]] = alloca [10 x i8]
+; CHECK-NEXT: %[[test4_a4:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a5:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a6:.*]] = alloca [40 x i8]
+
+  %b = getelementptr [100 x i8]* %a, i64 0, i64 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b, i8* %src, i32 100, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8]* %[[test4_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 20
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 20
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r1:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 22
+; CHECK-NEXT: %[[test4_r2:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 23
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 30
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [10 x i8]* %[[test4_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 10
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 40
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r3:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 42
+; CHECK-NEXT: %[[test4_r4:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 43
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 50
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r5:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 52
+; CHECK-NEXT: %[[test4_r6:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 53
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 60
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [40 x i8]* %[[test4_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 40
+
+  %a.src.1 = getelementptr [100 x i8]* %a, i64 0, i64 20
+  %a.dst.1 = getelementptr [100 x i8]* %a, i64 0, i64 40
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.1, i32 10, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+
+  ; Clobber a single element of the array, this should be promotable, and be deleted.
+  %c = getelementptr [100 x i8]* %a, i64 0, i64 42
+  store i8 0, i8* %c
+
+  %a.src.2 = getelementptr [100 x i8]* %a, i64 0, i64 50
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.2, i32 10, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 100, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8]* %[[test4_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[gep]], i32 20
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 20
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r1]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 22
+; CHECK-NEXT: store i8 %[[test4_r2]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 23
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 30
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [10 x i8]* %[[test4_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 10
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 40
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 42
+; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 43
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 50
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 52
+; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 53
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 60
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [40 x i8]* %[[test4_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 40
+
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define i16 @test5() {
+; CHECK: @test5
+; CHECK-NOT: alloca float
+; CHECK:      %[[cast:.*]] = bitcast float 0.0{{.*}} to i32
+; CHECK-NEXT: %[[shr:.*]] = lshr i32 %[[cast]], 16
+; CHECK-NEXT: %[[trunc:.*]] = trunc i32 %[[shr]] to i16
+; CHECK-NEXT: ret i16 %[[trunc]]
+
+entry:
+  %a = alloca [4 x i8]
+  %fptr = bitcast [4 x i8]* %a to float*
+  store float 0.0, float* %fptr
+  %ptr = getelementptr [4 x i8]* %a, i32 0, i32 2
+  %iptr = bitcast i8* %ptr to i16*
+  %val = load i16* %iptr
+  ret i16 %val
+}
+
+define i32 @test6() {
+; CHECK: @test6
+; CHECK: alloca i32
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: load i32*
+; CHECK-NEXT: ret i32
+
+entry:
+  %a = alloca [4 x i8]
+  %ptr = getelementptr [4 x i8]* %a, i32 0, i32 0
+  call void @llvm.memset.p0i8.i32(i8* %ptr, i8 42, i32 4, i32 1, i1 true)
+  %iptr = bitcast i8* %ptr to i32*
+  %val = load i32* %iptr
+  ret i32 %val
+}
+
+define void @test7(i8* %src, i8* %dst) {
+; CHECK: @test7
+; CHECK: alloca i32
+; CHECK-NEXT: bitcast i8* %src to i32*
+; CHECK-NEXT: load volatile i32*
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: bitcast i8* %dst to i32*
+; CHECK-NEXT: load volatile i32*
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: ret
+
+entry:
+  %a = alloca [4 x i8]
+  %ptr = getelementptr [4 x i8]* %a, i32 0, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 true)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 true)
+  ret void
+}
+
+
+%S1 = type { i32, i32, [16 x i8] }
+%S2 = type { %S1*, %S2* }
+
+define %S2 @test8(%S2* %s2) {
+; CHECK: @test8
+entry:
+  %new = alloca %S2
+; CHECK-NOT: alloca
+
+  %s2.next.ptr = getelementptr %S2* %s2, i64 0, i32 1
+  %s2.next = load %S2** %s2.next.ptr
+; CHECK:      %[[gep:.*]] = getelementptr %S2* %s2, i64 0, i32 1
+; CHECK-NEXT: %[[next:.*]] = load %S2** %[[gep]]
+
+  %s2.next.s1.ptr = getelementptr %S2* %s2.next, i64 0, i32 0
+  %s2.next.s1 = load %S1** %s2.next.s1.ptr
+  %new.s1.ptr = getelementptr %S2* %new, i64 0, i32 0
+  store %S1* %s2.next.s1, %S1** %new.s1.ptr
+  %s2.next.next.ptr = getelementptr %S2* %s2.next, i64 0, i32 1
+  %s2.next.next = load %S2** %s2.next.next.ptr
+  %new.next.ptr = getelementptr %S2* %new, i64 0, i32 1
+  store %S2* %s2.next.next, %S2** %new.next.ptr
+; CHECK-NEXT: %[[gep:.*]] = getelementptr %S2* %[[next]], i64 0, i32 0
+; CHECK-NEXT: %[[next_s1:.*]] = load %S1** %[[gep]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr %S2* %[[next]], i64 0, i32 1
+; CHECK-NEXT: %[[next_next:.*]] = load %S2** %[[gep]]
+
+  %new.s1 = load %S1** %new.s1.ptr
+  %result1 = insertvalue %S2 undef, %S1* %new.s1, 0
+; CHECK-NEXT: %[[result1:.*]] = insertvalue %S2 undef, %S1* %[[next_s1]], 0
+  %new.next = load %S2** %new.next.ptr
+  %result2 = insertvalue %S2 %result1, %S2* %new.next, 1
+; CHECK-NEXT: %[[result2:.*]] = insertvalue %S2 %[[result1]], %S2* %[[next_next]], 1
+  ret %S2 %result2
+; CHECK-NEXT: ret %S2 %[[result2]]
+}
+
+define i64 @test9() {
+; Ensure we can handle loads off the end of an alloca even when wrapped in
+; weird bit casts and types. The result is undef, but this shouldn't crash
+; anything.
+; CHECK: @test9
+; CHECK-NOT: alloca
+; CHECK: ret i64 undef
+
+entry:
+  %a = alloca { [3 x i8] }
+  %gep1 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 0
+  store i8 0, i8* %gep1, align 1
+  %gep2 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 1
+  store i8 0, i8* %gep2, align 1
+  %gep3 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 2
+  store i8 26, i8* %gep3, align 1
+  %cast = bitcast { [3 x i8] }* %a to { i64 }*
+  %elt = getelementptr inbounds { i64 }* %cast, i32 0, i32 0
+  %result = load i64* %elt
+  ret i64 %result
+}
+
+define %S2* @test10() {
+; CHECK: @test10
+; CHECK-NOT: alloca %S2*
+; CHECK: ret %S2* null
+
+entry:
+  %a = alloca [8 x i8]
+  %ptr = getelementptr [8 x i8]* %a, i32 0, i32 0
+  call void @llvm.memset.p0i8.i32(i8* %ptr, i8 0, i32 8, i32 1, i1 false)
+  %s2ptrptr = bitcast i8* %ptr to %S2**
+  %s2ptr = load %S2** %s2ptrptr
+  ret %S2* %s2ptr
+}
+
+define i32 @test11() {
+; CHECK: @test11
+; CHECK-NOT: alloca
+; CHECK: ret i32 0
+
+entry:
+  %X = alloca i32
+  br i1 undef, label %good, label %bad
+
+good:
+  %Y = getelementptr i32* %X, i64 0
+  store i32 0, i32* %Y
+  %Z = load i32* %Y
+  ret i32 %Z
+
+bad:
+  %Y2 = getelementptr i32* %X, i64 1
+  store i32 0, i32* %Y2
+  %Z2 = load i32* %Y2
+  ret i32 %Z2
+}
+
+define i8 @test12() {
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca.
+; No store or load is expected around the matched instruction sequences.
+; CHECK: @test12
+
+entry:
+  %a = alloca [3 x i8]
+  %b = alloca [3 x i8]
+; CHECK-NOT: alloca
+
+  %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+  store i8 0, i8* %a0ptr
+  %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+  store i8 0, i8* %a1ptr
+  %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+  store i8 0, i8* %a2ptr
+  %aiptr = bitcast [3 x i8]* %a to i24*
+  %ai = load i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK:      %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]]
+
+  %biptr = bitcast [3 x i8]* %b to i24*
+  store i24 %ai, i24* %biptr
+  %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
+  %b0 = load i8* %b0ptr
+  %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
+  %b1 = load i8* %b1ptr
+  %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
+  %b2 = load i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK:      %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
+
+  %bsum0 = add i8 %b0, %b1
+  %bsum1 = add i8 %bsum0, %b2
+  ret i8 %bsum1
+; CHECK:      %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
+}
+
+define i32 @test13() {
+; Ensure we don't crash and handle undefined loads that straddle the end of the
+; allocation.
+; CHECK: @test13
+; CHECK: %[[ret:.*]] = zext i16 undef to i32
+; CHECK: ret i32 %[[ret]]
+
+entry:
+  %a = alloca [3 x i8]
+  %b0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+  store i8 0, i8* %b0ptr
+  %b1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+  store i8 0, i8* %b1ptr
+  %b2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+  store i8 0, i8* %b2ptr
+  %iptrcast = bitcast [3 x i8]* %a to i16*
+  %iptrgep = getelementptr i16* %iptrcast, i64 1
+  %i = load i16* %iptrgep
+  %ret = zext i16 %i to i32
+  ret i32 %ret
+}
+
+%test14.struct = type { [3 x i32] }
+
+define void @test14(...) nounwind uwtable {
+; This is a strange case where we split allocas into promotable partitions, but
+; also gain enough data to prove they must be dead allocas due to GEPs that walk
+; across two adjacent allocas. Test that we don't try to promote or otherwise
+; do bad things to these dead allocas, they should just be removed.
+; CHECK: @test14
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+
+entry:
+  %a = alloca %test14.struct
+  %p = alloca %test14.struct*
+  %0 = bitcast %test14.struct* %a to i8*
+  %1 = getelementptr i8* %0, i64 12
+  %2 = bitcast i8* %1 to %test14.struct*
+  %3 = getelementptr inbounds %test14.struct* %2, i32 0, i32 0
+  %4 = getelementptr inbounds %test14.struct* %a, i32 0, i32 0
+  %5 = bitcast [3 x i32]* %3 to i32*
+  %6 = bitcast [3 x i32]* %4 to i32*
+  %7 = load i32* %6, align 4
+  store i32 %7, i32* %5, align 4
+  %8 = getelementptr inbounds i32* %5, i32 1
+  %9 = getelementptr inbounds i32* %6, i32 1
+  %10 = load i32* %9, align 4
+  store i32 %10, i32* %8, align 4
+  %11 = getelementptr inbounds i32* %5, i32 2
+  %12 = getelementptr inbounds i32* %6, i32 2
+  %13 = load i32* %12, align 4
+  store i32 %13, i32* %11, align 4
+  ret void
+}
+
+define i32 @test15(i1 %flag) nounwind uwtable {
+; Ensure that when there are dead instructions using an alloca that are not
+; loads or stores we still delete them during partitioning and rewriting.
+; Otherwise we'll go to promote them while they still have unpromotable uses.
+; CHECK: @test15
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   br label %loop
+; CHECK:      loop:
+; CHECK-NEXT:   br label %loop
+
+entry:
+  %l0 = alloca i64
+  %l1 = alloca i64
+  %l2 = alloca i64
+  %l3 = alloca i64
+  br label %loop
+
+loop:
+  %dead3 = phi i8* [ %gep3, %loop ], [ null, %entry ]
+
+  store i64 1879048192, i64* %l0, align 8
+  %bc0 = bitcast i64* %l0 to i8*
+  %gep0 = getelementptr i8* %bc0, i64 3
+  %dead0 = bitcast i8* %gep0 to i64*
+
+  store i64 1879048192, i64* %l1, align 8
+  %bc1 = bitcast i64* %l1 to i8*
+  %gep1 = getelementptr i8* %bc1, i64 3
+  %dead1 = getelementptr i8* %gep1, i64 1
+
+  store i64 1879048192, i64* %l2, align 8
+  %bc2 = bitcast i64* %l2 to i8*
+  %gep2.1 = getelementptr i8* %bc2, i64 1
+  %gep2.2 = getelementptr i8* %bc2, i64 3
+  ; Note that this select should get visited multiple times due to using two
+  ; different GEPs off the same alloca. We should only delete it once.
+  %dead2 = select i1 %flag, i8* %gep2.1, i8* %gep2.2
+
+  store i64 1879048192, i64* %l3, align 8
+  %bc3 = bitcast i64* %l3 to i8*
+  %gep3 = getelementptr i8* %bc3, i64 3
+
+  br label %loop
+}
+
+define void @test16(i8* %src, i8* %dst) {
+; Ensure that we can promote an alloca of [3 x i8] to an i24 SSA value.
+; CHECK: @test16
+; CHECK-NOT: alloca
+; CHECK:      %[[srccast:.*]] = bitcast i8* %src to i24*
+; CHECK-NEXT: load i24* %[[srccast]]
+; CHECK-NEXT: %[[dstcast:.*]] = bitcast i8* %dst to i24*
+; CHECK-NEXT: store i24 0, i24* %[[dstcast]]
+; CHECK-NEXT: ret void
+
+entry:
+  %a = alloca [3 x i8]
+  %ptr = getelementptr [3 x i8]* %a, i32 0, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 false)
+  %cast = bitcast i8* %ptr to i24*
+  store i24 0, i24* %cast
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 false)
+  ret void
+}
+
+define void @test17(i8* %src, i8* %dst) {
+; Ensure that we can rewrite unpromotable memcpys which extend past the end of
+; the alloca.
+; CHECK: @test17
+; CHECK:      %[[a:.*]] = alloca [3 x i8]
+; CHECK-NEXT: %[[ptr:.*]] = getelementptr [3 x i8]* %[[a]], i32 0, i32 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[ptr]], i8* %src,
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[ptr]],
+; CHECK-NEXT: ret void
+
+entry:
+  %a = alloca [3 x i8]
+  %ptr = getelementptr [3 x i8]* %a, i32 0, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 true)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 true)
+  ret void
+}
+
+define void @test18(i8* %src, i8* %dst, i32 %size) {
+; Preserve transfer intrinsics with a variable size, even if they overlap with
+; fixed size operations. Further, continue to split and promote allocas preceding
+; the variable sized intrinsic.
+; CHECK: @test18
+; CHECK:      %[[a:.*]] = alloca [34 x i8]
+; CHECK:      %[[srcgep1:.*]] = getelementptr inbounds i8* %src, i64 4
+; CHECK-NEXT: %[[srccast1:.*]] = bitcast i8* %[[srcgep1]] to i32*
+; CHECK-NEXT: %[[srcload:.*]] = load i32* %[[srccast1]]
+; CHECK-NEXT: %[[agep1:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[agep1]], i8* %src, i32 %size,
+; CHECK-NEXT: %[[agep2:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[agep2]], i8 42, i32 %size,
+; CHECK-NEXT: %[[dstcast1:.*]] = bitcast i8* %dst to i32*
+; CHECK-NEXT: store i32 42, i32* %[[dstcast1]]
+; CHECK-NEXT: %[[dstgep1:.*]] = getelementptr inbounds i8* %dst, i64 4
+; CHECK-NEXT: %[[dstcast2:.*]] = bitcast i8* %[[dstgep1]] to i32*
+; CHECK-NEXT: store i32 %[[srcload]], i32* %[[dstcast2]]
+; CHECK-NEXT: %[[agep3:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[agep3]], i32 %size,
+; CHECK-NEXT: ret void
+
+entry:
+  %a = alloca [42 x i8]
+  %ptr = getelementptr [42 x i8]* %a, i32 0, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 8, i32 1, i1 false)
+  %ptr2 = getelementptr [42 x i8]* %a, i32 0, i32 8
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr2, i8* %src, i32 %size, i32 1, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %ptr2, i8 42, i32 %size, i32 1, i1 false)
+  %cast = bitcast i8* %ptr to i32*
+  store i32 42, i32* %cast
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 8, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr2, i32 %size, i32 1, i1 false)
+  ret void
+}
+
+%opaque = type opaque
+
+define i32 @test19(%opaque* %x) {
+; This input will cause us to try to compute a natural GEP when rewriting
+; pointers in such a way that we try to GEP through the opaque type. Previously,
+; a check for an unsized type was missing and this crashed. Ensure it behaves
+; reasonably now.
+; CHECK: @test19
+; CHECK-NOT: alloca
+; CHECK: ret i32 undef
+
+entry:
+  %a = alloca { i64, i8* }
+  %cast1 = bitcast %opaque* %x to i8*
+  %cast2 = bitcast { i64, i8* }* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast2, i8* %cast1, i32 16, i32 1, i1 false)
+  %gep = getelementptr inbounds { i64, i8* }* %a, i32 0, i32 0
+  %val = load i64* %gep
+  ret i32 undef
+}
+
+define i32 @test20() {
+; Ensure we can track negative offsets (before the beginning of the alloca) and
+; negative relative offsets from offsets starting past the end of the alloca.
+; CHECK: @test20
+; CHECK-NOT: alloca
+; CHECK: %[[sum1:.*]] = add i32 1, 2
+; CHECK: %[[sum2:.*]] = add i32 %[[sum1]], 3
+; CHECK: ret i32 %[[sum2]]
+
+entry:
+  %a = alloca [3 x i32]
+  %gep1 = getelementptr [3 x i32]* %a, i32 0, i32 0
+  store i32 1, i32* %gep1
+  %gep2.1 = getelementptr [3 x i32]* %a, i32 0, i32 -2
+  %gep2.2 = getelementptr i32* %gep2.1, i32 3
+  store i32 2, i32* %gep2.2
+  %gep3.1 = getelementptr [3 x i32]* %a, i32 0, i32 14
+  %gep3.2 = getelementptr i32* %gep3.1, i32 -12
+  store i32 3, i32* %gep3.2
+
+  %load1 = load i32* %gep1
+  %load2 = load i32* %gep2.2
+  %load3 = load i32* %gep3.2
+  %sum1 = add i32 %load1, %load2
+  %sum2 = add i32 %sum1, %load3
+  ret i32 %sum2
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+define i8 @test21() {
+; Test allocations and offsets which border on overflow of the int64_t used
+; internally. This is really awkward to really test as LLVM doesn't really
+; support such extreme constructs cleanly.
+; CHECK: @test21
+; CHECK-NOT: alloca
+; CHECK: or i8 -1, -1
+
+entry:
+  %a = alloca [2305843009213693951 x i8]
+  %gep0 = getelementptr [2305843009213693951 x i8]* %a, i64 0, i64 2305843009213693949
+  store i8 255, i8* %gep0
+  %gep1 = getelementptr [2305843009213693951 x i8]* %a, i64 0, i64 -9223372036854775807
+  %gep2 = getelementptr i8* %gep1, i64 -1
+  call void @llvm.memset.p0i8.i64(i8* %gep2, i8 0, i64 18446744073709551615, i32 1, i1 false)
+  %gep3 = getelementptr i8* %gep1, i64 9223372036854775807
+  %gep4 = getelementptr i8* %gep3, i64 9223372036854775807
+  %gep5 = getelementptr i8* %gep4, i64 -6917529027641081857
+  store i8 255, i8* %gep5
+  %cast1 = bitcast i8* %gep4 to i32*
+  store i32 0, i32* %cast1
+  %load = load i8* %gep0
+  %gep6 = getelementptr i8* %gep0, i32 1
+  %load2 = load i8* %gep6
+  %result = or i8 %load, %load2
+  ret i8 %result
+}
+
+%PR13916.struct = type { i8 }
+
+define void @PR13916.1() {
+; Ensure that we handle overlapping memcpy intrinsics correctly, especially in
+; the case where there is a directly identical value for both source and dest.
+; CHECK: @PR13916.1
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+entry:
+  %a = alloca i8
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %a, i32 1, i32 1, i1 false)
+  %tmp2 = load i8* %a
+  ret void
+}
+
+define void @PR13916.2() {
+; Check whether we continue to handle them correctly when they start off with
+; different pointer value chains, but during rewriting we coalesce them into the
+; same value.
+; CHECK: @PR13916.2
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+entry:
+  %a = alloca %PR13916.struct, align 1
+  br i1 undef, label %if.then, label %if.end
+
+if.then:
+  %tmp0 = bitcast %PR13916.struct* %a to i8*
+  %tmp1 = bitcast %PR13916.struct* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp0, i8* %tmp1, i32 1, i32 1, i1 false)
+  br label %if.end
+
+if.end:
+  %gep = getelementptr %PR13916.struct* %a, i32 0, i32 0
+  %tmp2 = load i8* %gep
+  ret void
+}
+
+define void @PR13990() {
+; Ensure we can handle cases where processing one alloca causes the other
+; alloca to become dead and get deleted. This might crash or fail under
+; Valgrind if we regress.
+; CHECK: @PR13990
+; CHECK-NOT: alloca
+; CHECK: unreachable
+; CHECK: unreachable
+
+entry:
+  %tmp1 = alloca i8*
+  %tmp2 = alloca i8*
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  store i8* undef, i8** %tmp2
+  br i1 undef, label %bb2, label %bb3
+
+bb2:
+  %tmp50 = select i1 undef, i8** %tmp2, i8** %tmp1
+  br i1 undef, label %bb3, label %bb4
+
+bb3:
+  unreachable
+
+bb4:
+  unreachable
+}
+
+define double @PR13969(double %x) {
+; Check that we detect when promotion will un-escape an alloca and iterate to
+; re-try running SROA over that alloca. Without that, the two allocas that are
+; stored into a dead alloca don't get rewritten and promoted.
+; CHECK: @PR13969
+
+entry:
+  %a = alloca double
+  %b = alloca double*
+  %c = alloca double
+; CHECK-NOT: alloca
+
+  store double %x, double* %a
+  store double* %c, double** %b
+  store double* %a, double** %b
+  store double %x, double* %c
+  %ret = load double* %a
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+  ret double %ret
+; CHECK: ret double %x
+}
+
+%PR14034.struct = type { { {} }, i32, %PR14034.list }
+%PR14034.list = type { %PR14034.list*, %PR14034.list* }
+
+define void @PR14034() {
+; This test case tries to form GEPs into the empty leading struct members, and
+; subsequently crashed (under valgrind) before we fixed the PR. The important
+; thing is to handle empty structs gracefully.
+; CHECK: @PR14034
+
+entry:
+  %a = alloca %PR14034.struct
+  %list = getelementptr %PR14034.struct* %a, i32 0, i32 2
+  %prev = getelementptr %PR14034.list* %list, i32 0, i32 1
+  store %PR14034.list* undef, %PR14034.list** %prev
+  %cast0 = bitcast %PR14034.struct* undef to i8*
+  %cast1 = bitcast %PR14034.struct* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast0, i8* %cast1, i32 12, i32 0, i1 false)
+  ret void
+}
+
+define i32 @test22(i32 %x) {
+; Test that SROA and promotion are not confused by a grab bag mixture of pointer
+; types involving wrapper aggregates and zero-length aggregate members.
+; CHECK: @test22
+
+entry:
+  %a1 = alloca { { [1 x { i32 }] } }
+  %a2 = alloca { {}, { float }, [0 x i8] }
+  %a3 = alloca { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } }
+; CHECK-NOT: alloca
+
+  %wrap1 = insertvalue [1 x { i32 }] undef, i32 %x, 0, 0
+  %gep1 = getelementptr { { [1 x { i32 }] } }* %a1, i32 0, i32 0, i32 0
+  store [1 x { i32 }] %wrap1, [1 x { i32 }]* %gep1
+
+  %gep2 = getelementptr { { [1 x { i32 }] } }* %a1, i32 0, i32 0
+  %ptrcast1 = bitcast { [1 x { i32 }] }* %gep2 to { [1 x { float }] }*
+  %load1 = load { [1 x { float }] }* %ptrcast1
+  %unwrap1 = extractvalue { [1 x { float }] } %load1, 0, 0
+
+  %wrap2 = insertvalue { {}, { float }, [0 x i8] } undef, { float } %unwrap1, 1
+  store { {}, { float }, [0 x i8] } %wrap2, { {}, { float }, [0 x i8] }* %a2
+
+  %gep3 = getelementptr { {}, { float }, [0 x i8] }* %a2, i32 0, i32 1, i32 0
+  %ptrcast2 = bitcast float* %gep3 to <4 x i8>*
+  %load3 = load <4 x i8>* %ptrcast2
+  %valcast1 = bitcast <4 x i8> %load3 to i32
+
+  %wrap3 = insertvalue [1 x [1 x i32]] undef, i32 %valcast1, 0, 0
+  %wrap4 = insertvalue { [1 x [1 x i32]], {} } undef, [1 x [1 x i32]] %wrap3, 0
+  %gep4 = getelementptr { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } }* %a3, i32 0, i32 1
+  %ptrcast3 = bitcast { [0 x double], [1 x [1 x <4 x i8>]], {} }* %gep4 to { [1 x [1 x i32]], {} }*
+  store { [1 x [1 x i32]], {} } %wrap4, { [1 x [1 x i32]], {} }* %ptrcast3
+
+  %gep5 = getelementptr { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } }* %a3, i32 0, i32 1, i32 1, i32 0
+  %ptrcast4 = bitcast [1 x <4 x i8>]* %gep5 to { {}, float, {} }*
+  %load4 = load { {}, float, {} }* %ptrcast4
+  %unwrap2 = extractvalue { {}, float, {} } %load4, 1
+  %valcast2 = bitcast float %unwrap2 to i32
+
+  ret i32 %valcast2
+; CHECK: ret i32
+}
+
+define void @PR14059.1(double* %d) {
+; In PR14059 a peculiar construct was identified as something that is used
+; pervasively in ARM's ABI-calling-convention lowering: the passing of a struct
+; of doubles via an array of i32 in order to place the data into integer
+; registers. This in turn was missed as an optimization by SROA due to the
+; partial loads and stores of integers to the double alloca we were trying to
+; form and promote. The solution is to widen the integer operations to be
+; whole-alloca operations, and perform the appropriate bitcasting on the
+; *values* rather than the pointers. When this works, partial reads and writes
+; via integers can be promoted away.
+; CHECK: @PR14059.1
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+entry:
+  %X.sroa.0.i = alloca double, align 8
+  %0 = bitcast double* %X.sroa.0.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0)
+
+  ; Store to the low 32-bits...
+  %X.sroa.0.0.cast2.i = bitcast double* %X.sroa.0.i to i32*
+  store i32 0, i32* %X.sroa.0.0.cast2.i, align 8
+
+  ; Also use a memset to the middle 32-bits for fun.
+  %X.sroa.0.2.raw_idx2.i = getelementptr inbounds i8* %0, i32 2
+  call void @llvm.memset.p0i8.i64(i8* %X.sroa.0.2.raw_idx2.i, i8 0, i64 4, i32 1, i1 false)
+
+  ; Or a memset of the whole thing.
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 8, i32 1, i1 false)
+
+  ; Write to the high 32-bits with a memcpy.
+  %X.sroa.0.4.raw_idx4.i = getelementptr inbounds i8* %0, i32 4
+  %d.raw = bitcast double* %d to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %X.sroa.0.4.raw_idx4.i, i8* %d.raw, i32 4, i32 1, i1 false)
+
+  ; Store to the high 32-bits...
+  %X.sroa.0.4.cast5.i = bitcast i8* %X.sroa.0.4.raw_idx4.i to i32*
+  store i32 1072693248, i32* %X.sroa.0.4.cast5.i, align 4
+
+  ; Do the actual math...
+  %X.sroa.0.0.load1.i = load double* %X.sroa.0.i, align 8
+  %accum.real.i = load double* %d, align 8
+  %add.r.i = fadd double %accum.real.i, %X.sroa.0.0.load1.i
+  store double %add.r.i, double* %d, align 8
+  call void @llvm.lifetime.end(i64 -1, i8* %0)
+  ret void
+}
+
+define i64 @PR14059.2({ float, float }* %phi) {
+; Check that SROA can split up alloca-wide integer loads and stores where the
+; underlying alloca has smaller components that are accessed independently. This
+; shows up particularly with ABI lowering patterns coming out of Clang that rely
+; on the particular register placement of a single large integer return value.
+; CHECK: @PR14059.2
+
+entry:
+  %retval = alloca { float, float }, align 4
+  ; CHECK-NOT: alloca
+
+  %0 = bitcast { float, float }* %retval to i64*
+  store i64 0, i64* %0
+  ; CHECK-NOT: store
+
+  %phi.realp = getelementptr inbounds { float, float }* %phi, i32 0, i32 0
+  %phi.real = load float* %phi.realp
+  %phi.imagp = getelementptr inbounds { float, float }* %phi, i32 0, i32 1
+  %phi.imag = load float* %phi.imagp
+  ; CHECK:      %[[realp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 0
+  ; CHECK-NEXT: %[[real:.*]] = load float* %[[realp]]
+  ; CHECK-NEXT: %[[imagp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 1
+  ; CHECK-NEXT: %[[imag:.*]] = load float* %[[imagp]]
+
+  %real = getelementptr inbounds { float, float }* %retval, i32 0, i32 0
+  %imag = getelementptr inbounds { float, float }* %retval, i32 0, i32 1
+  store float %phi.real, float* %real
+  store float %phi.imag, float* %imag
+  ; CHECK-NEXT: %[[real_convert:.*]] = bitcast float %[[real]] to i32
+  ; CHECK-NEXT: %[[imag_convert:.*]] = bitcast float %[[imag]] to i32
+  ; CHECK-NEXT: %[[imag_ext:.*]] = zext i32 %[[imag_convert]] to i64
+  ; CHECK-NEXT: %[[imag_shift:.*]] = shl i64 %[[imag_ext]], 32
+  ; CHECK-NEXT: %[[imag_mask:.*]] = and i64 undef, 4294967295
+  ; CHECK-NEXT: %[[imag_insert:.*]] = or i64 %[[imag_mask]], %[[imag_shift]]
+  ; CHECK-NEXT: %[[real_ext:.*]] = zext i32 %[[real_convert]] to i64
+  ; CHECK-NEXT: %[[real_mask:.*]] = and i64 %[[imag_insert]], -4294967296
+  ; CHECK-NEXT: %[[real_insert:.*]] = or i64 %[[real_mask]], %[[real_ext]]
+
+  %1 = load i64* %0, align 1
+  ret i64 %1
+  ; CHECK-NEXT: ret i64 %[[real_insert]]
+}
+
+define void @PR14105({ [16 x i8] }* %ptr) {
+; Ensure that when rewriting the GEP index '-1' for this alloca we preserve its
+; sign as negative. We use a volatile memcpy to ensure promotion never actually
+; occurs.
+; CHECK: @PR14105
+
+entry:
+  %a = alloca { [16 x i8] }, align 8
+; CHECK: alloca [16 x i8], align 8
+
+  %gep = getelementptr inbounds { [16 x i8] }* %ptr, i64 -1
+; CHECK-NEXT: getelementptr inbounds { [16 x i8] }* %ptr, i64 -1, i32 0, i64 0
+
+  %cast1 = bitcast { [16 x i8 ] }* %gep to i8*
+  %cast2 = bitcast { [16 x i8 ] }* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast1, i8* %cast2, i32 16, i32 8, i1 true)
+  ret void
+; CHECK: ret
+}
diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll
new file mode 100644
index 0000000000000..ce82d1f30b57c
--- /dev/null
+++ b/test/Transforms/SROA/big-endian.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define i8 @test1() {
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca. Note this is
+; the same as test12 in basictest.ll, but here we assert big-endian byte
+; ordering.
+;
+; CHECK: @test1
+
+entry:
+  %a = alloca [3 x i8]
+  %b = alloca [3 x i8]
+; CHECK-NOT: alloca
+
+  %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+  store i8 0, i8* %a0ptr
+  %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+  store i8 0, i8* %a1ptr
+  %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+  store i8 0, i8* %a2ptr
+  %aiptr = bitcast [3 x i8]* %a to i24*
+  %ai = load i24* %aiptr
+; CHCEK-NOT: store
+; CHCEK-NOT: load
+; CHECK:      %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
+
+  %biptr = bitcast [3 x i8]* %b to i24*
+  store i24 %ai, i24* %biptr
+  %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
+  %b0 = load i8* %b0ptr
+  %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
+  %b1 = load i8* %b1ptr
+  %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
+  %b2 = load i8* %b2ptr
+; CHCEK-NOT: store
+; CHCEK-NOT: load
+; CHECK:      %[[shift0:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
+
+  %bsum0 = add i8 %b0, %b1
+  %bsum1 = add i8 %bsum0, %b2
+  ret i8 %bsum1
+; CHECK:      %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
+}
+
+define i64 @test2() {
+; Test for various mixed sizes of integer loads and stores all getting
+; promoted.
+;
+; CHECK: @test2
+
+entry:
+  %a = alloca [7 x i8]
+; CHECK-NOT: alloca
+
+  %a0ptr = getelementptr [7 x i8]* %a, i64 0, i32 0
+  %a1ptr = getelementptr [7 x i8]* %a, i64 0, i32 1
+  %a2ptr = getelementptr [7 x i8]* %a, i64 0, i32 2
+  %a3ptr = getelementptr [7 x i8]* %a, i64 0, i32 3
+
+; CHCEK-NOT: store
+; CHCEK-NOT: load
+
+  %a0i16ptr = bitcast i8* %a0ptr to i16*
+  store i16 1, i16* %a0i16ptr
+; CHECK:      %[[mask0:.*]] = and i16 1, -16
+
+  %a1i4ptr = bitcast i8* %a1ptr to i4*
+  store i4 1, i4* %a1i4ptr
+; CHECK-NEXT: %[[insert0:.*]] = or i16 %[[mask0]], 1
+
+  store i8 1, i8* %a2ptr
+; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, 4294967295
+; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], 4294967296
+
+  %a3i24ptr = bitcast i8* %a3ptr to i24*
+  store i24 1, i24* %a3i24ptr
+; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041
+; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], 256
+
+  %a2i40ptr = bitcast i8* %a2ptr to i40*
+  store i40 1, i40* %a2i40ptr
+; CHECK-NEXT: %[[ext3:.*]] = zext i40 1 to i56
+; CHECK-NEXT: %[[mask3:.*]] = and i56 undef, -1099511627776
+; CHECK-NEXT: %[[insert3:.*]] = or i56 %[[mask3]], %[[ext3]]
+
+; CHCEK-NOT: store
+; CHCEK-NOT: load
+
+  %aiptr = bitcast [7 x i8]* %a to i56*
+  %ai = load i56* %aiptr
+  %ret = zext i56 %ai to i64
+  ret i64 %ret
+; CHECK-NEXT: %[[ext4:.*]] = zext i16 %[[insert0]] to i56
+; CHECK-NEXT: %[[shift4:.*]] = shl i56 %[[ext4]], 40
+; CHECK-NEXT: %[[mask4:.*]] = and i56 %[[insert3]], 1099511627775
+; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[shift4]]
+; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert4]] to i64
+; CHECK-NEXT: ret i64 %[[ret]]
+}
diff --git a/test/Transforms/SROA/fca.ll b/test/Transforms/SROA/fca.ll
new file mode 100644
index 0000000000000..c30a5cc974fc7
--- /dev/null
+++ b/test/Transforms/SROA/fca.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define { i32, i32 } @test0(i32 %x, i32 %y) {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: insertvalue { i32, i32 }
+; CHECK: insertvalue { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+  %a = alloca { i32, i32 }
+
+  store { i32, i32 } undef, { i32, i32 }* %a
+
+  %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+  store i32 %x, i32* %gep1
+  %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+  store i32 %y, i32* %gep2
+
+  %result = load { i32, i32 }* %a
+  ret { i32, i32 } %result
+}
+
+define { i32, i32 } @test1(i32 %x, i32 %y) {
+; FIXME: This may be too conservative. Duncan argues that we are allowed to
+; split the volatile load and store here but must produce volatile scalar loads
+; and stores from them.
+; CHECK: @test1
+; CHECK: alloca
+; CHECK: alloca
+; CHECK: load volatile { i32, i32 }*
+; CHECK: store volatile { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+  %a = alloca { i32, i32 }
+  %b = alloca { i32, i32 }
+
+  %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+  store i32 %x, i32* %gep1
+  %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+  store i32 %y, i32* %gep2
+
+  %result = load volatile { i32, i32 }* %a
+  store volatile { i32, i32 } %result, { i32, i32 }* %b
+  ret { i32, i32 } %result
+}
diff --git a/test/Transforms/SROA/lit.local.cfg b/test/Transforms/SROA/lit.local.cfg
new file mode 100644
index 0000000000000..c6106e4746f2d
--- /dev/null
+++ b/test/Transforms/SROA/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll']
diff --git a/test/Transforms/SROA/phi-and-select.ll b/test/Transforms/SROA/phi-and-select.ll
new file mode 100644
index 0000000000000..921016a9c24b3
--- /dev/null
+++ b/test/Transforms/SROA/phi-and-select.ll
@@ -0,0 +1,427 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define i32 @test1() {
+; CHECK: @test1
+entry:
+	%a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+  %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+  %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+	store i32 0, i32* %a0
+	store i32 1, i32* %a1
+	%v0 = load i32* %a0
+	%v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+	%cond = icmp sle i32 %v0, %v1
+	br i1 %cond, label %then, label %exit
+
+then:
+	br label %exit
+
+exit:
+	%phi = phi i32* [ %a1, %then ], [ %a0, %entry ]
+; CHECK: phi i32 [ 1, %{{.*}} ], [ 0, %{{.*}} ]
+
+	%result = load i32* %phi
+	ret i32 %result
+}
+
+define i32 @test2() {
+; CHECK: @test2
+entry:
+	%a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+  %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+  %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+	store i32 0, i32* %a0
+	store i32 1, i32* %a1
+	%v0 = load i32* %a0
+	%v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+	%cond = icmp sle i32 %v0, %v1
+	%select = select i1 %cond, i32* %a1, i32* %a0
+; CHECK: select i1 %{{.*}}, i32 1, i32 0
+
+	%result = load i32* %select
+	ret i32 %result
+}
+
+define i32 @test3(i32 %x) {
+; CHECK: @test3
+entry:
+	%a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+  ; Note that we build redundant GEPs here to ensure that having different GEPs
+  ; into the same alloca partition continues to work with PHI speculation. This
+  ; was the underlying cause of PR13926.
+  %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+  %a0b = getelementptr [2 x i32]* %a, i64 0, i32 0
+  %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+  %a1b = getelementptr [2 x i32]* %a, i64 0, i32 1
+	store i32 0, i32* %a0
+	store i32 1, i32* %a1
+; CHECK-NOT: store
+
+  switch i32 %x, label %bb0 [ i32 1, label %bb1
+                              i32 2, label %bb2
+                              i32 3, label %bb3
+                              i32 4, label %bb4
+                              i32 5, label %bb5
+                              i32 6, label %bb6
+                              i32 7, label %bb7 ]
+
+bb0:
+	br label %exit
+bb1:
+	br label %exit
+bb2:
+	br label %exit
+bb3:
+	br label %exit
+bb4:
+	br label %exit
+bb5:
+	br label %exit
+bb6:
+	br label %exit
+bb7:
+	br label %exit
+
+exit:
+	%phi = phi i32* [ %a1, %bb0 ], [ %a0, %bb1 ], [ %a0, %bb2 ], [ %a1, %bb3 ],
+                  [ %a1b, %bb4 ], [ %a0b, %bb5 ], [ %a0b, %bb6 ], [ %a1b, %bb7 ]
+; CHECK: phi i32 [ 1, %{{.*}} ], [ 0, %{{.*}} ], [ 0, %{{.*}} ], [ 1, %{{.*}} ], [ 1, %{{.*}} ], [ 0, %{{.*}} ], [ 0, %{{.*}} ], [ 1, %{{.*}} ]
+
+	%result = load i32* %phi
+	ret i32 %result
+}
+
+define i32 @test4() {
+; CHECK: @test4
+entry:
+	%a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+  %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+  %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+	store i32 0, i32* %a0
+	store i32 1, i32* %a1
+	%v0 = load i32* %a0
+	%v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+	%cond = icmp sle i32 %v0, %v1
+	%select = select i1 %cond, i32* %a0, i32* %a0
+; CHECK-NOT: select
+
+	%result = load i32* %select
+	ret i32 %result
+; CHECK: ret i32 0
+}
+
+define i32 @test5(i32* %b) {
+; CHECK: @test5
+entry:
+	%a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+  %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+	store i32 1, i32* %a1
+; CHECK-NOT: store
+
+	%select = select i1 true, i32* %a1, i32* %b
+; CHECK-NOT: select
+
+	%result = load i32* %select
+; CHECK-NOT: load
+
+	ret i32 %result
+; CHECK: ret i32 1
+}
+
+declare void @f(i32*, i32*)
+
+define i32 @test6(i32* %b) {
+; CHECK: @test6
+entry:
+	%a = alloca [2 x i32]
+  %c = alloca i32
+; CHECK-NOT: alloca
+
+  %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+	store i32 1, i32* %a1
+
+	%select = select i1 true, i32* %a1, i32* %b
+	%select2 = select i1 false, i32* %a1, i32* %b
+  %select3 = select i1 false, i32* %c, i32* %b
+; CHECK: %[[select2:.*]] = select i1 false, i32* undef, i32* %b
+; CHECK: %[[select3:.*]] = select i1 false, i32* undef, i32* %b
+
+  ; Note, this would potentially escape the alloca pointer except for the
+  ; constant folding of the select.
+  call void @f(i32* %select2, i32* %select3)
+; CHECK: call void @f(i32* %[[select2]], i32* %[[select3]])
+
+
+	%result = load i32* %select
+; CHECK-NOT: load
+
+  %dead = load i32* %c
+
+	ret i32 %result
+; CHECK: ret i32 1
+}
+
+define i32 @test7() {
+; CHECK: @test7
+; CHECK-NOT: alloca
+
+entry:
+  %X = alloca i32
+  br i1 undef, label %good, label %bad
+
+good:
+  %Y1 = getelementptr i32* %X, i64 0
+  store i32 0, i32* %Y1
+  br label %exit
+
+bad:
+  %Y2 = getelementptr i32* %X, i64 1
+  store i32 0, i32* %Y2
+  br label %exit
+
+exit:
+	%P = phi i32* [ %Y1, %good ], [ %Y2, %bad ]
+; CHECK: %[[phi:.*]] = phi i32 [ 0, %good ],
+  %Z2 = load i32* %P
+  ret i32 %Z2
+; CHECK: ret i32 %[[phi]]
+}
+
+define i32 @test8(i32 %b, i32* %ptr) {
+; Ensure that we rewrite allocas to the used type when that use is hidden by
+; a PHI that can be speculated.
+; CHECK: @test8
+; CHECK-NOT: alloca
+; CHECK-NOT: load
+; CHECK: %[[value:.*]] = load i32* %ptr
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = phi i32 [ undef, %else ], [ %[[value]], %then ]
+; CHECK-NEXT: ret i32 %[[result]]
+
+entry:
+  %f = alloca float
+  %test = icmp ne i32 %b, 0
+  br i1 %test, label %then, label %else
+
+then:
+  br label %exit
+
+else:
+  %bitcast = bitcast float* %f to i32*
+  br label %exit
+
+exit:
+  %phi = phi i32* [ %bitcast, %else ], [ %ptr, %then ]
+  %loaded = load i32* %phi, align 4
+  ret i32 %loaded
+}
+
+define i32 @test9(i32 %b, i32* %ptr) {
+; Same as @test8 but for a select rather than a PHI node.
+; CHECK: @test9
+; CHECK-NOT: alloca
+; CHECK-NOT: load
+; CHECK: %[[value:.*]] = load i32* %ptr
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = select i1 %{{.*}}, i32 undef, i32 %[[value]]
+; CHECK-NEXT: ret i32 %[[result]]
+
+entry:
+  %f = alloca float
+  store i32 0, i32* %ptr
+  %test = icmp ne i32 %b, 0
+  %bitcast = bitcast float* %f to i32*
+  %select = select i1 %test, i32* %bitcast, i32* %ptr
+  %loaded = load i32* %select, align 4
+  ret i32 %loaded
+}
+
+define float @test10(i32 %b, float* %ptr) {
+; Don't try to promote allocas which are not eligible for it even after
+; rewriting due to the necessity of inserting bitcasts when speculating a PHI
+; node.
+; CHECK: @test10
+; CHECK: %[[alloca:.*]] = alloca
+; CHECK: %[[argvalue:.*]] = load float* %ptr
+; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float*
+; CHECK: %[[allocavalue:.*]] = load float* %[[cast]]
+; CHECK: %[[result:.*]] = phi float [ %[[allocavalue]], %else ], [ %[[argvalue]], %then ]
+; CHECK-NEXT: ret float %[[result]]
+
+entry:
+  %f = alloca double
+  store double 0.0, double* %f
+  %test = icmp ne i32 %b, 0
+  br i1 %test, label %then, label %else
+
+then:
+  br label %exit
+
+else:
+  %bitcast = bitcast double* %f to float*
+  br label %exit
+
+exit:
+  %phi = phi float* [ %bitcast, %else ], [ %ptr, %then ]
+  %loaded = load float* %phi, align 4
+  ret float %loaded
+}
+
+define float @test11(i32 %b, float* %ptr) {
+; Same as @test10 but for a select rather than a PHI node.
+; CHECK: @test11
+; CHECK: %[[alloca:.*]] = alloca
+; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float*
+; CHECK: %[[allocavalue:.*]] = load float* %[[cast]]
+; CHECK: %[[argvalue:.*]] = load float* %ptr
+; CHECK: %[[result:.*]] = select i1 %{{.*}}, float %[[allocavalue]], float %[[argvalue]]
+; CHECK-NEXT: ret float %[[result]]
+
+entry:
+  %f = alloca double
+  store double 0.0, double* %f
+  store float 0.0, float* %ptr
+  %test = icmp ne i32 %b, 0
+  %bitcast = bitcast double* %f to float*
+  %select = select i1 %test, float* %bitcast, float* %ptr
+  %loaded = load float* %select, align 4
+  ret float %loaded
+}
+
+define i32 @test12(i32 %x, i32* %p) {
+; Ensure we don't crash or fail to nuke dead selects of allocas if no load is
+; ever found.
+; CHECK: @test12
+; CHECK-NOT: alloca
+; CHECK-NOT: select
+; CHECK: ret i32 %x
+
+entry:
+  %a = alloca i32
+  store i32 %x, i32* %a
+  %dead = select i1 undef, i32* %a, i32* %p
+  %load = load i32* %a
+  ret i32 %load
+}
+
+define i32 @test13(i32 %x, i32* %p) {
+; Ensure we don't crash or fail to nuke dead phis of allocas if no load is ever
+; found.
+; CHECK: @test13
+; CHECK-NOT: alloca
+; CHECK-NOT: phi
+; CHECK: ret i32 %x
+
+entry:
+  %a = alloca i32
+  store i32 %x, i32* %a
+  br label %loop
+
+loop:
+  %phi = phi i32* [ %p, %entry ], [ %a, %loop ]
+  br i1 undef, label %loop, label %exit
+
+exit:
+  %load = load i32* %a
+  ret i32 %load
+}
+
+define i32 @PR13905() {
+; Check a pattern where we have a chain of dead phi nodes to ensure they are
+; deleted and promotion can proceed.
+; CHECK: @PR13905
+; CHECK-NOT: alloca i32
+; CHECK: ret i32 undef
+
+entry:
+  %h = alloca i32
+  store i32 0, i32* %h
+  br i1 undef, label %loop1, label %exit
+
+loop1:
+  %phi1 = phi i32* [ null, %entry ], [ %h, %loop1 ], [ %h, %loop2 ]
+  br i1 undef, label %loop1, label %loop2
+
+loop2:
+  br i1 undef, label %loop1, label %exit
+
+exit:
+  %phi2 = phi i32* [ %phi1, %loop2 ], [ null, %entry ]
+  ret i32 undef
+}
+
+define i32 @PR13906() {
+; Another pattern which can lead to crashes due to failing to clear out dead
+; PHI nodes or select nodes. This triggers subtly differently from the above
+; cases because the PHI node is (recursively) alive, but the select is dead.
+; CHECK: @PR13906
+; CHECK-NOT: alloca
+
+entry:
+  %c = alloca i32
+  store i32 0, i32* %c
+  br label %for.cond
+
+for.cond:
+  %d.0 = phi i32* [ undef, %entry ], [ %c, %if.then ], [ %d.0, %for.cond ]
+  br i1 undef, label %if.then, label %for.cond
+
+if.then:
+  %tmpcast.d.0 = select i1 undef, i32* %c, i32* %d.0
+  br label %for.cond
+}
+
+define i64 @PR14132(i1 %flag) {
+; CHECK: @PR14132
+; Here we form a PHI-node by promoting the pointer alloca first, and then in
+; order to promote the other two allocas, we speculate the load of the
+; now-phi-node-pointer. In doing so we end up loading a 64-bit value from an i8
+; alloca, which is completely bogus. However, we were asserting on trying to
+; rewrite it. Now it is replaced with undef. Eventually we may replace it with
+; unreachable and even the CFG will go away here.
+entry:
+  %a = alloca i64
+  %b = alloca i8
+  %ptr = alloca i64*
+; CHECK-NOT: alloca
+
+  %ptr.cast = bitcast i64** %ptr to i8**
+  store i64 0, i64* %a
+  store i8 1, i8* %b
+  store i64* %a, i64** %ptr
+  br i1 %flag, label %if.then, label %if.end
+
+if.then:
+  store i8* %b, i8** %ptr.cast
+  br label %if.end
+
+if.end:
+  %tmp = load i64** %ptr
+  %result = load i64* %tmp
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = phi i64 [ undef, %if.then ], [ 0, %entry ]
+
+  ret i64 %result
+; CHECK-NEXT: ret i64 %[[result]]
+}
diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll
new file mode 100644
index 0000000000000..ea28f5d1a647a
--- /dev/null
+++ b/test/Transforms/SROA/vector-promotion.ll
@@ -0,0 +1,267 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+%S1 = type { i64, [42 x float] }
+
+define i32 @test1(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test1
+entry:
+	%a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+  %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+  store <4 x i32> %x, <4 x i32>* %a.x
+  %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+  store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+  %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+  %tmp1 = load i32* %a.tmp1
+  %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+  %tmp2 = load i32* %a.tmp2
+  %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+  %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: load
+; CHECK:      extractelement <4 x i32> %x, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 0
+
+  %tmp4 = add i32 %tmp1, %tmp2
+  %tmp5 = add i32 %tmp3, %tmp4
+  ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test2
+; FIXME: This should be handled!
+entry:
+	%a = alloca [2 x <4 x i32>]
+; CHECK: alloca <4 x i32>
+
+  %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+  store <4 x i32> %x, <4 x i32>* %a.x
+  %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+  store <4 x i32> %y, <4 x i32>* %a.y
+
+  %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+  %tmp1 = load i32* %a.tmp1
+  %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+  %tmp2 = load i32* %a.tmp2
+  %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+  %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>*
+  %tmp3.vec = load <2 x i32>* %a.tmp3.cast
+  %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0
+
+  %tmp4 = add i32 %tmp1, %tmp2
+  %tmp5 = add i32 %tmp3, %tmp4
+  ret i32 %tmp5
+}
+
+define i32 @test3(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test3
+entry:
+	%a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+  %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+  store <4 x i32> %x, <4 x i32>* %a.x
+  %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+  store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+  %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+  call void @llvm.memset.p0i8.i32(i8* %a.y.cast, i8 0, i32 16, i32 1, i1 false)
+; CHECK-NOT: memset
+
+  %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+  %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+  call void @llvm.memset.p0i8.i32(i8* %a.tmp1.cast, i8 -1, i32 4, i32 1, i1 false)
+  %tmp1 = load i32* %a.tmp1
+  %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+  %tmp2 = load i32* %a.tmp2
+  %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+  %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: load
+; CHECK:      %[[insert:.*]] = insertelement <4 x i32> %x, i32 -1, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 3
+; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 0
+
+  %tmp4 = add i32 %tmp1, %tmp2
+  %tmp5 = add i32 %tmp3, %tmp4
+  ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test4(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
+; CHECK: @test4
+entry:
+	%a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+  %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+  store <4 x i32> %x, <4 x i32>* %a.x
+  %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+  store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+  %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+  %z.cast = bitcast <4 x i32>* %z to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.y.cast, i8* %z.cast, i32 16, i32 1, i1 false)
+; CHECK-NOT: memcpy
+
+  %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+  %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+  %z.tmp1 = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+  %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.tmp1.cast, i8* %z.tmp1.cast, i32 4, i32 1, i1 false)
+  %tmp1 = load i32* %a.tmp1
+  %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+  %tmp2 = load i32* %a.tmp2
+  %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+  %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: memcpy
+; CHECK:      %[[load:.*]] = load <4 x i32>* %z
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+; CHECK-NEXT: %[[element_load:.*]] = load i32* %[[gep]]
+; CHECK-NEXT: %[[insert:.*]] = insertelement <4 x i32> %x, i32 %[[element_load]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 3
+; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 0
+
+  %tmp4 = add i32 %tmp1, %tmp2
+  %tmp5 = add i32 %tmp3, %tmp4
+  ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test5(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
+; CHECK: @test5
+; The same as the above, but with reversed source and destination for the
+; element memcpy, and a self copy.
+entry:
+	%a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+  %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+  store <4 x i32> %x, <4 x i32>* %a.x
+  %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+  store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+  %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+  %a.x.cast = bitcast <4 x i32>* %a.x to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.x.cast, i8* %a.y.cast, i32 16, i32 1, i1 false)
+; CHECK-NOT: memcpy
+
+  %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+  %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+  %z.tmp1 = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+  %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %z.tmp1.cast, i8* %a.tmp1.cast, i32 4, i32 1, i1 false)
+  %tmp1 = load i32* %a.tmp1
+  %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+  %tmp2 = load i32* %a.tmp2
+  %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+  %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: memcpy
+; CHECK:      %[[gep:.*]] = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+; CHECK-NEXT: %[[extract:.*]] = extractelement <4 x i32> %y, i32 2
+; CHECK-NEXT: store i32 %[[extract]], i32* %[[gep]]
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 0
+
+  %tmp4 = add i32 %tmp1, %tmp2
+  %tmp5 = add i32 %tmp3, %tmp4
+  ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) {
+; CHECK: @test6
+; The old scalarrepl pass would wrongly drop the store to the second alloca.
+; PR13254
+  %tmp = alloca { <4 x i64>, <4 x i64> }
+  %p0 = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0
+  store <4 x i64> %x, <4 x i64>* %p0
+; CHECK: store <4 x i64> %x,
+  %p1 = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 1
+  store <4 x i64> %y, <4 x i64>* %p1
+; CHECK: store <4 x i64> %y,
+  %addr = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0, i64 %n
+  %res = load i64* %addr, align 4
+  ret i64 %res
+}
+
+define i32 @PR14212() {
+; CHECK: @PR14212
+; This caused a crash when "splitting" the load of the i32 in order to promote
+; the store of <3 x i8> properly. Heavily reduced from an OpenCL test case.
+entry:
+  %retval = alloca <3 x i8>, align 4
+; CHECK-NOT: alloca
+
+  store <3 x i8> undef, <3 x i8>* %retval, align 4
+  %cast = bitcast <3 x i8>* %retval to i32*
+  %load = load i32* %cast, align 4
+  ret i32 %load
+; CHECK: ret i32
+}
+
+define <2 x i8> @PR14349.1(i32 %x) {
+; CHECK: @PR14349.1
+; The first testcase for broken SROA rewriting of split integer loads and
+; stores due to smaller vector loads and stores. This particular test ensures
+; that we can rewrite a split store of an integer to a store of a vector.
+entry:
+  %a = alloca i32
+; CHECK-NOT: alloca
+
+  store i32 %x, i32* %a
+; CHECK-NOT: store
+
+  %cast = bitcast i32* %a to <2 x i8>*
+  %vec = load <2 x i8>* %cast
+; CHECK-NOT: load
+
+  ret <2 x i8> %vec
+; CHECK: %[[trunc:.*]] = trunc i32 %x to i16
+; CHECK: %[[cast:.*]] = bitcast i16 %[[trunc]] to <2 x i8>
+; CHECK: ret <2 x i8> %[[cast]]
+}
+
+define i32 @PR14349.2(<2 x i8> %x) {
+; CHECK: @PR14349.2
+; The second testcase for broken SROA rewriting of split integer loads and
+; stores due to smaller vector loads and stores. This particular test ensures
+; that we can rewrite a split load of an integer to a load of a vector.
+entry:
+  %a = alloca i32
+; CHECK-NOT: alloca
+
+  %cast = bitcast i32* %a to <2 x i8>*
+  store <2 x i8> %x, <2 x i8>* %cast
+; CHECK-NOT: store
+
+  %int = load i32* %a
+; CHECK-NOT: load
+
+  ret i32 %int
+; CHECK: %[[cast:.*]] = bitcast <2 x i8> %x to i16
+; CHECK: %[[trunc:.*]] = zext i16 %[[cast]] to i32
+; CHECK: %[[insert:.*]] = or i32 %{{.*}}, %[[trunc]]
+; CHECK: ret i32 %[[insert]]
+}
diff --git a/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
new file mode 100644
index 0000000000000..786fee9e6610b
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'Sparc' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll
new file mode 100644
index 0000000000000..9d1568557f30a
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -simplifycfg -S -mtriple=sparc-unknown-unknown | FileCheck %s
+
+; Check that switches are not turned into lookup tables, as this is not
+; considered profitable on the target.
+
+define i32 @f(i32 %c) nounwind uwtable readnone {
+entry:
+  switch i32 %c, label %sw.default [
+    i32 42, label %return
+    i32 43, label %sw.bb1
+    i32 44, label %sw.bb2
+    i32 45, label %sw.bb3
+    i32 46, label %sw.bb4
+    i32 47, label %sw.bb5
+    i32 48, label %sw.bb6
+  ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.bb4: br label %return
+sw.bb5: br label %return
+sw.bb6: br label %return
+sw.default: br label %return
+return:
+  %retval.0 = phi i32 [ 15, %sw.default ], [ 1, %sw.bb6 ], [ 62, %sw.bb5 ], [ 27, %sw.bb4 ], [ -1, %sw.bb3 ], [ 0, %sw.bb2 ], [ 123, %sw.bb1 ], [ 55, %entry ]
+  ret i32 %retval.0
+
+; CHECK: @f
+; CHECK-NOT: getelementptr
+; CHECK: switch i32 %c
+}
diff --git a/test/Transforms/SimplifyCFG/X86/lit.local.cfg b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
new file mode 100644
index 0000000000000..a8ad0f1a28b23
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
new file mode 100644
index 0000000000000..8a59992f5e64e
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -0,0 +1,779 @@
+; RUN: opt < %s -simplifycfg -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The table for @f
+; CHECK: @switch.table = private unnamed_addr constant [7 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 27, i32 62, i32 1]
+
+; The float table for @h
+; CHECK: @switch.table1 = private unnamed_addr constant [4 x float] [float 0x40091EB860000000, float 0x3FF3BE76C0000000, float 0x4012449BA0000000, float 0x4001AE1480000000]
+
+; The table for @foostring
+; CHECK: @switch.table2 = private unnamed_addr constant [4 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0)]
+
+; The table for @earlyreturncrash
+; CHECK: @switch.table3 = private unnamed_addr constant [4 x i32] [i32 42, i32 9, i32 88, i32 5]
+
+; The table for @large.
+; CHECK: @switch.table4 = private unnamed_addr constant [199 x i32] [i32 1, i32 4, i32 9,
+
+; The table for @cprop
+; CHECK: @switch.table5 = private unnamed_addr constant [7 x i32] [i32 5, i32 42, i32 126, i32 -452, i32 128, i32 6, i32 7]
+
+; The table for @unreachable
+; CHECK: @switch.table6 = private unnamed_addr constant [5 x i32] [i32 0, i32 0, i32 0, i32 1, i32 -1]
+
+; A simple int-to-int selection switch.
+; It is dense enough to be replaced by table lookup.
+; The result is used directly by a ret from an otherwise empty bb,
+; so we return early, directly from the lookup bb.
+
+define i32 @f(i32 %c) {
+entry:
+  switch i32 %c, label %sw.default [
+    i32 42, label %return
+    i32 43, label %sw.bb1
+    i32 44, label %sw.bb2
+    i32 45, label %sw.bb3
+    i32 46, label %sw.bb4
+    i32 47, label %sw.bb5
+    i32 48, label %sw.bb6
+  ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.bb4: br label %return
+sw.bb5: br label %return
+sw.bb6: br label %return
+sw.default: br label %return
+return:
+  %retval.0 = phi i32 [ 15, %sw.default ], [ 1, %sw.bb6 ], [ 62, %sw.bb5 ], [ 27, %sw.bb4 ], [ -1, %sw.bb3 ], [ 0, %sw.bb2 ], [ 123, %sw.bb1 ], [ 55, %entry ]
+  ret i32 %retval.0
+
+; CHECK: @f
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %c, 42
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 7
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %return
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [7 x i32]* @switch.table, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i32* %switch.gep
+; CHECK-NEXT: ret i32 %switch.load
+; CHECK: return:
+; CHECK-NEXT: ret i32 15
+}
+
+; A switch used to initialize two variables, an i8 and a float.
+
+declare void @dummy(i8 signext, float)
+define void @h(i32 %x) {
+entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.epilog
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+
+sw.epilog:
+  %a.0 = phi i8 [ 7, %sw.default ], [ 5, %sw.bb3 ], [ 88, %sw.bb2 ], [ 9, %sw.bb1 ], [ 42, %entry ]
+  %b.0 = phi float [ 0x4023FAE140000000, %sw.default ], [ 0x4001AE1480000000, %sw.bb3 ], [ 0x4012449BA0000000, %sw.bb2 ], [ 0x3FF3BE76C0000000, %sw.bb1 ], [ 0x40091EB860000000, %entry ]
+  call void @dummy(i8 signext %a.0, float %b.0)
+  ret void
+
+; CHECK: @h
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 4
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %sw.epilog
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.shiftamt = mul i32 %switch.tableidx, 8
+; CHECK-NEXT: %switch.downshift = lshr i32 89655594, %switch.shiftamt
+; CHECK-NEXT: %switch.masked = trunc i32 %switch.downshift to i8
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x float]* @switch.table1, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load float* %switch.gep
+; CHECK-NEXT: br label %sw.epilog
+; CHECK: sw.epilog:
+; CHECK-NEXT: %a.0 = phi i8 [ %switch.masked, %switch.lookup ], [ 7, %entry ]
+; CHECK-NEXT: %b.0 = phi float [ %switch.load, %switch.lookup ], [ 0x4023FAE140000000, %entry ]
+; CHECK-NEXT: call void @dummy(i8 signext %a.0, float %b.0)
+; CHECK-NEXT: ret void
+}
+
+
+; Switch used to return a string.
+
+@.str = private unnamed_addr constant [4 x i8] c"foo\00", align 1
+@.str1 = private unnamed_addr constant [4 x i8] c"bar\00", align 1
+@.str2 = private unnamed_addr constant [4 x i8] c"baz\00", align 1
+@.str3 = private unnamed_addr constant [4 x i8] c"qux\00", align 1
+@.str4 = private unnamed_addr constant [6 x i8] c"error\00", align 1
+
+define i8* @foostring(i32 %x)  {
+entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %return
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: br label %return
+
+return:
+  %retval.0 = phi i8* [ getelementptr inbounds ([6 x i8]* @.str4, i64 0, i64 0), %sw.default ],
+                      [ getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0), %sw.bb3 ],
+                      [ getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), %sw.bb2 ],
+                      [ getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), %sw.bb1 ],
+                      [ getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), %entry ]
+  ret i8* %retval.0
+
+; CHECK: @foostring
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 4
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %return
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i8*]* @switch.table2, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i8** %switch.gep
+; CHECK-NEXT: ret i8* %switch.load
+}
+
+; Switch used to initialize two values. The first value is returned, the second
+; value is not used. This used to make the transformation generate illegal code.
+
+define i32 @earlyreturncrash(i32 %x)  {
+entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.epilog
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+
+sw.epilog:
+  %a.0 = phi i32 [ 7, %sw.default ], [ 5, %sw.bb3 ], [ 88, %sw.bb2 ], [ 9, %sw.bb1 ], [ 42, %entry ]
+  %b.0 = phi i32 [ 10, %sw.default ], [ 5, %sw.bb3 ], [ 1, %sw.bb2 ], [ 4, %sw.bb1 ], [ 3, %entry ]
+  ret i32 %a.0
+
+; CHECK: @earlyreturncrash
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i32]* @switch.table3, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i32* %switch.gep
+; CHECK-NEXT: ret i32 %switch.load
+; CHECK: sw.epilog:
+; CHECK-NEXT: ret i32 7
+}
+
+
+; Example 7 from http://blog.regehr.org/archives/320
+; It is not dense enough for a regular table, but the results
+; can be packed into a bitmap.
+
+define i32 @crud(i8 zeroext %c)  {
+entry:
+  %cmp = icmp ult i8 %c, 33
+  br i1 %cmp, label %lor.end, label %switch.early.test
+
+switch.early.test:
+  switch i8 %c, label %lor.rhs [
+    i8 92, label %lor.end
+    i8 62, label %lor.end
+    i8 60, label %lor.end
+    i8 59, label %lor.end
+    i8 58, label %lor.end
+    i8 46, label %lor.end
+    i8 44, label %lor.end
+    i8 34, label %lor.end
+    i8 39, label %switch.edge
+  ]
+
+switch.edge: br label %lor.end
+lor.rhs: br label %lor.end
+
+lor.end:
+  %0 = phi i1 [ true, %switch.early.test ],
+              [ false, %lor.rhs ],
+              [ true, %entry ],
+              [ true, %switch.early.test ],
+              [ true, %switch.early.test ],
+              [ true, %switch.early.test ],
+              [ true, %switch.early.test ],
+              [ true, %switch.early.test ],
+              [ true, %switch.early.test ],
+              [ true, %switch.early.test ],
+              [ true, %switch.edge ]
+  %lor.ext = zext i1 %0 to i32
+  ret i32 %lor.ext
+
+; CHECK: @crud
+; CHECK: entry:
+; CHECK-NEXT: %cmp = icmp ult i8 %c, 33
+; CHECK-NEXT: br i1 %cmp, label %lor.end, label %switch.early.test
+; CHECK: switch.early.test:
+; CHECK-NEXT: %switch.tableidx = sub i8 %c, 34
+; CHECK-NEXT: %0 = icmp ult i8 %switch.tableidx, 59
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %lor.end
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.cast = zext i8 %switch.tableidx to i59
+; CHECK-NEXT: %switch.shiftamt = mul i59 %switch.cast, 1
+; CHECK-NEXT: %switch.downshift = lshr i59 -288230375765830623, %switch.shiftamt
+; CHECK-NEXT: %switch.masked = trunc i59 %switch.downshift to i1
+; CHECK-NEXT: br label %lor.end
+; CHECK: lor.end:
+; CHECK-NEXT: %1 = phi i1 [ true, %entry ], [ %switch.masked, %switch.lookup ], [ false, %switch.early.test ]
+; CHECK-NEXT: %lor.ext = zext i1 %1 to i32
+; CHECK-NEXT: ret i32 %lor.ext
+}
+
+; PR13946
+define i32 @overflow(i32 %type) {
+entry:
+  switch i32 %type, label %sw.default [
+    i32 -2147483648, label %sw.bb
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 -2147483645, label %sw.bb3
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb: br label %if.end
+sw.bb1: br label %if.end
+sw.bb2: br label %if.end
+sw.bb3: br label %if.end
+sw.default: br label %if.end
+if.else: br label %if.end
+
+if.end:
+  %dirent_type.0 = phi i32 [ 3, %sw.default ], [ 6, %sw.bb3 ], [ 5, %sw.bb2 ], [ 0, %sw.bb1 ], [ 3, %sw.bb ], [ 0, %if.else ]
+  ret i32 %dirent_type.0
+; CHECK: define i32 @overflow
+; CHECK: switch
+; CHECK: phi
+}
+
+; PR13985
+define i1 @undef(i32 %tmp) {
+bb:
+  switch i32 %tmp, label %bb3 [
+    i32 0, label %bb1
+    i32 1, label %bb1
+    i32 7, label %bb2
+    i32 8, label %bb2
+  ]
+
+bb1: br label %bb3
+bb2: br label %bb3
+
+bb3:
+  %tmp4 = phi i1 [ undef, %bb ], [ false, %bb2 ], [ true, %bb1 ]
+  ret i1 %tmp4
+; CHECK: define i1 @undef
+; CHECK: %switch.cast = trunc i32 %switch.tableidx to i9
+; CHECK: %switch.downshift = lshr i9 3, %switch.shiftamt
+}
+
+; Also handle large switches that would be rejected by
+; isValueEqualityComparison()
+; CHECK: large
+; CHECK-NOT: switch i32
+define i32 @large(i32 %x) {
+entry:
+  %cmp = icmp slt i32 %x, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %mul = mul i32 %x, -10
+  br label %if.end
+
+if.end:
+  %x.addr.0 = phi i32 [ %mul, %if.then ], [ %x, %entry ]
+  switch i32 %x.addr.0, label %return [
+    i32 199, label %sw.bb203
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+    i32 4, label %sw.bb4
+    i32 5, label %sw.bb5
+    i32 6, label %sw.bb6
+    i32 7, label %sw.bb7
+    i32 8, label %sw.bb8
+    i32 9, label %sw.bb9
+    i32 10, label %sw.bb10
+    i32 11, label %sw.bb11
+    i32 12, label %sw.bb12
+    i32 13, label %sw.bb13
+    i32 14, label %sw.bb14
+    i32 15, label %sw.bb15
+    i32 16, label %sw.bb16
+    i32 17, label %sw.bb17
+    i32 18, label %sw.bb18
+    i32 19, label %sw.bb19
+    i32 20, label %sw.bb20
+    i32 21, label %sw.bb21
+    i32 22, label %sw.bb22
+    i32 23, label %sw.bb23
+    i32 24, label %sw.bb24
+    i32 25, label %sw.bb25
+    i32 26, label %sw.bb26
+    i32 27, label %sw.bb27
+    i32 28, label %sw.bb28
+    i32 29, label %sw.bb29
+    i32 30, label %sw.bb30
+    i32 31, label %sw.bb31
+    i32 32, label %sw.bb32
+    i32 33, label %sw.bb33
+    i32 34, label %sw.bb34
+    i32 35, label %sw.bb35
+    i32 36, label %sw.bb37
+    i32 37, label %sw.bb38
+    i32 38, label %sw.bb39
+    i32 39, label %sw.bb40
+    i32 40, label %sw.bb41
+    i32 41, label %sw.bb42
+    i32 42, label %sw.bb43
+    i32 43, label %sw.bb44
+    i32 44, label %sw.bb45
+    i32 45, label %sw.bb47
+    i32 46, label %sw.bb48
+    i32 47, label %sw.bb49
+    i32 48, label %sw.bb50
+    i32 49, label %sw.bb51
+    i32 50, label %sw.bb52
+    i32 51, label %sw.bb53
+    i32 52, label %sw.bb54
+    i32 53, label %sw.bb55
+    i32 54, label %sw.bb56
+    i32 55, label %sw.bb58
+    i32 56, label %sw.bb59
+    i32 57, label %sw.bb60
+    i32 58, label %sw.bb61
+    i32 59, label %sw.bb62
+    i32 60, label %sw.bb63
+    i32 61, label %sw.bb64
+    i32 62, label %sw.bb65
+    i32 63, label %sw.bb66
+    i32 64, label %sw.bb67
+    i32 65, label %sw.bb68
+    i32 66, label %sw.bb69
+    i32 67, label %sw.bb70
+    i32 68, label %sw.bb71
+    i32 69, label %sw.bb72
+    i32 70, label %sw.bb73
+    i32 71, label %sw.bb74
+    i32 72, label %sw.bb76
+    i32 73, label %sw.bb77
+    i32 74, label %sw.bb78
+    i32 75, label %sw.bb79
+    i32 76, label %sw.bb80
+    i32 77, label %sw.bb81
+    i32 78, label %sw.bb82
+    i32 79, label %sw.bb83
+    i32 80, label %sw.bb84
+    i32 81, label %sw.bb85
+    i32 82, label %sw.bb86
+    i32 83, label %sw.bb87
+    i32 84, label %sw.bb88
+    i32 85, label %sw.bb89
+    i32 86, label %sw.bb90
+    i32 87, label %sw.bb91
+    i32 88, label %sw.bb92
+    i32 89, label %sw.bb93
+    i32 90, label %sw.bb94
+    i32 91, label %sw.bb95
+    i32 92, label %sw.bb96
+    i32 93, label %sw.bb97
+    i32 94, label %sw.bb98
+    i32 95, label %sw.bb99
+    i32 96, label %sw.bb100
+    i32 97, label %sw.bb101
+    i32 98, label %sw.bb102
+    i32 99, label %sw.bb103
+    i32 100, label %sw.bb104
+    i32 101, label %sw.bb105
+    i32 102, label %sw.bb106
+    i32 103, label %sw.bb107
+    i32 104, label %sw.bb108
+    i32 105, label %sw.bb109
+    i32 106, label %sw.bb110
+    i32 107, label %sw.bb111
+    i32 108, label %sw.bb112
+    i32 109, label %sw.bb113
+    i32 110, label %sw.bb114
+    i32 111, label %sw.bb115
+    i32 112, label %sw.bb116
+    i32 113, label %sw.bb117
+    i32 114, label %sw.bb118
+    i32 115, label %sw.bb119
+    i32 116, label %sw.bb120
+    i32 117, label %sw.bb121
+    i32 118, label %sw.bb122
+    i32 119, label %sw.bb123
+    i32 120, label %sw.bb124
+    i32 121, label %sw.bb125
+    i32 122, label %sw.bb126
+    i32 123, label %sw.bb127
+    i32 124, label %sw.bb128
+    i32 125, label %sw.bb129
+    i32 126, label %sw.bb130
+    i32 127, label %sw.bb131
+    i32 128, label %sw.bb132
+    i32 129, label %sw.bb133
+    i32 130, label %sw.bb134
+    i32 131, label %sw.bb135
+    i32 132, label %sw.bb136
+    i32 133, label %sw.bb137
+    i32 134, label %sw.bb138
+    i32 135, label %sw.bb139
+    i32 136, label %sw.bb140
+    i32 137, label %sw.bb141
+    i32 138, label %sw.bb142
+    i32 139, label %sw.bb143
+    i32 140, label %sw.bb144
+    i32 141, label %sw.bb145
+    i32 142, label %sw.bb146
+    i32 143, label %sw.bb147
+    i32 144, label %sw.bb148
+    i32 145, label %sw.bb149
+    i32 146, label %sw.bb150
+    i32 147, label %sw.bb151
+    i32 148, label %sw.bb152
+    i32 149, label %sw.bb153
+    i32 150, label %sw.bb154
+    i32 151, label %sw.bb155
+    i32 152, label %sw.bb156
+    i32 153, label %sw.bb157
+    i32 154, label %sw.bb158
+    i32 155, label %sw.bb159
+    i32 156, label %sw.bb160
+    i32 157, label %sw.bb161
+    i32 158, label %sw.bb162
+    i32 159, label %sw.bb163
+    i32 160, label %sw.bb164
+    i32 161, label %sw.bb165
+    i32 162, label %sw.bb166
+    i32 163, label %sw.bb167
+    i32 164, label %sw.bb168
+    i32 165, label %sw.bb169
+    i32 166, label %sw.bb170
+    i32 167, label %sw.bb171
+    i32 168, label %sw.bb172
+    i32 169, label %sw.bb173
+    i32 170, label %sw.bb174
+    i32 171, label %sw.bb175
+    i32 172, label %sw.bb176
+    i32 173, label %sw.bb177
+    i32 174, label %sw.bb178
+    i32 175, label %sw.bb179
+    i32 176, label %sw.bb180
+    i32 177, label %sw.bb181
+    i32 178, label %sw.bb182
+    i32 179, label %sw.bb183
+    i32 180, label %sw.bb184
+    i32 181, label %sw.bb185
+    i32 182, label %sw.bb186
+    i32 183, label %sw.bb187
+    i32 184, label %sw.bb188
+    i32 185, label %sw.bb189
+    i32 186, label %sw.bb190
+    i32 187, label %sw.bb191
+    i32 188, label %sw.bb192
+    i32 189, label %sw.bb193
+    i32 190, label %sw.bb194
+    i32 191, label %sw.bb195
+    i32 192, label %sw.bb196
+    i32 193, label %sw.bb197
+    i32 194, label %sw.bb198
+    i32 195, label %sw.bb199
+    i32 196, label %sw.bb200
+    i32 197, label %sw.bb201
+    i32 198, label %sw.bb202
+  ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.bb4: br label %return
+sw.bb5: br label %return
+sw.bb6: br label %return
+sw.bb7: br label %return
+sw.bb8: br label %return
+sw.bb9: br label %return
+sw.bb10: br label %return
+sw.bb11: br label %return
+sw.bb12: br label %return
+sw.bb13: br label %return
+sw.bb14: br label %return
+sw.bb15: br label %return
+sw.bb16: br label %return
+sw.bb17: br label %return
+sw.bb18: br label %return
+sw.bb19: br label %return
+sw.bb20: br label %return
+sw.bb21: br label %return
+sw.bb22: br label %return
+sw.bb23: br label %return
+sw.bb24: br label %return
+sw.bb25: br label %return
+sw.bb26: br label %return
+sw.bb27: br label %return
+sw.bb28: br label %return
+sw.bb29: br label %return
+sw.bb30: br label %return
+sw.bb31: br label %return
+sw.bb32: br label %return
+sw.bb33: br label %return
+sw.bb34: br label %return
+sw.bb35: br label %return
+sw.bb37: br label %return
+sw.bb38: br label %return
+sw.bb39: br label %return
+sw.bb40: br label %return
+sw.bb41: br label %return
+sw.bb42: br label %return
+sw.bb43: br label %return
+sw.bb44: br label %return
+sw.bb45: br label %return
+sw.bb47: br label %return
+sw.bb48: br label %return
+sw.bb49: br label %return
+sw.bb50: br label %return
+sw.bb51: br label %return
+sw.bb52: br label %return
+sw.bb53: br label %return
+sw.bb54: br label %return
+sw.bb55: br label %return
+sw.bb56: br label %return
+sw.bb58: br label %return
+sw.bb59: br label %return
+sw.bb60: br label %return
+sw.bb61: br label %return
+sw.bb62: br label %return
+sw.bb63: br label %return
+sw.bb64: br label %return
+sw.bb65: br label %return
+sw.bb66: br label %return
+sw.bb67: br label %return
+sw.bb68: br label %return
+sw.bb69: br label %return
+sw.bb70: br label %return
+sw.bb71: br label %return
+sw.bb72: br label %return
+sw.bb73: br label %return
+sw.bb74: br label %return
+sw.bb76: br label %return
+sw.bb77: br label %return
+sw.bb78: br label %return
+sw.bb79: br label %return
+sw.bb80: br label %return
+sw.bb81: br label %return
+sw.bb82: br label %return
+sw.bb83: br label %return
+sw.bb84: br label %return
+sw.bb85: br label %return
+sw.bb86: br label %return
+sw.bb87: br label %return
+sw.bb88: br label %return
+sw.bb89: br label %return
+sw.bb90: br label %return
+sw.bb91: br label %return
+sw.bb92: br label %return
+sw.bb93: br label %return
+sw.bb94: br label %return
+sw.bb95: br label %return
+sw.bb96: br label %return
+sw.bb97: br label %return
+sw.bb98: br label %return
+sw.bb99: br label %return
+sw.bb100: br label %return
+sw.bb101: br label %return
+sw.bb102: br label %return
+sw.bb103: br label %return
+sw.bb104: br label %return
+sw.bb105: br label %return
+sw.bb106: br label %return
+sw.bb107: br label %return
+sw.bb108: br label %return
+sw.bb109: br label %return
+sw.bb110: br label %return
+sw.bb111: br label %return
+sw.bb112: br label %return
+sw.bb113: br label %return
+sw.bb114: br label %return
+sw.bb115: br label %return
+sw.bb116: br label %return
+sw.bb117: br label %return
+sw.bb118: br label %return
+sw.bb119: br label %return
+sw.bb120: br label %return
+sw.bb121: br label %return
+sw.bb122: br label %return
+sw.bb123: br label %return
+sw.bb124: br label %return
+sw.bb125: br label %return
+sw.bb126: br label %return
+sw.bb127: br label %return
+sw.bb128: br label %return
+sw.bb129: br label %return
+sw.bb130: br label %return
+sw.bb131: br label %return
+sw.bb132: br label %return
+sw.bb133: br label %return
+sw.bb134: br label %return
+sw.bb135: br label %return
+sw.bb136: br label %return
+sw.bb137: br label %return
+sw.bb138: br label %return
+sw.bb139: br label %return
+sw.bb140: br label %return
+sw.bb141: br label %return
+sw.bb142: br label %return
+sw.bb143: br label %return
+sw.bb144: br label %return
+sw.bb145: br label %return
+sw.bb146: br label %return
+sw.bb147: br label %return
+sw.bb148: br label %return
+sw.bb149: br label %return
+sw.bb150: br label %return
+sw.bb151: br label %return
+sw.bb152: br label %return
+sw.bb153: br label %return
+sw.bb154: br label %return
+sw.bb155: br label %return
+sw.bb156: br label %return
+sw.bb157: br label %return
+sw.bb158: br label %return
+sw.bb159: br label %return
+sw.bb160: br label %return
+sw.bb161: br label %return
+sw.bb162: br label %return
+sw.bb163: br label %return
+sw.bb164: br label %return
+sw.bb165: br label %return
+sw.bb166: br label %return
+sw.bb167: br label %return
+sw.bb168: br label %return
+sw.bb169: br label %return
+sw.bb170: br label %return
+sw.bb171: br label %return
+sw.bb172: br label %return
+sw.bb173: br label %return
+sw.bb174: br label %return
+sw.bb175: br label %return
+sw.bb176: br label %return
+sw.bb177: br label %return
+sw.bb178: br label %return
+sw.bb179: br label %return
+sw.bb180: br label %return
+sw.bb181: br label %return
+sw.bb182: br label %return
+sw.bb183: br label %return
+sw.bb184: br label %return
+sw.bb185: br label %return
+sw.bb186: br label %return
+sw.bb187: br label %return
+sw.bb188: br label %return
+sw.bb189: br label %return
+sw.bb190: br label %return
+sw.bb191: br label %return
+sw.bb192: br label %return
+sw.bb193: br label %return
+sw.bb194: br label %return
+sw.bb195: br label %return
+sw.bb196: br label %return
+sw.bb197: br label %return
+sw.bb198: br label %return
+sw.bb199: br label %return
+sw.bb200: br label %return
+sw.bb201: br label %return
+sw.bb202: br label %return
+sw.bb203: br label %return
+
+return:
+  %retval.0 = phi i32 [ 39204, %sw.bb202 ], [ 38809, %sw.bb201 ], [ 38416, %sw.bb200 ], [ 38025, %sw.bb199 ], [ 37636, %sw.bb198 ], [ 37249, %sw.bb197 ], [ 36864, %sw.bb196 ], [ 36481, %sw.bb195 ], [ 36100, %sw.bb194 ], [ 35721, %sw.bb193 ], [ 35344, %sw.bb192 ], [ 34969, %sw.bb191 ], [ 34596, %sw.bb190 ], [ 34225, %sw.bb189 ], [ 33856, %sw.bb188 ], [ 33489, %sw.bb187 ], [ 33124, %sw.bb186 ], [ 32761, %sw.bb185 ], [ 32400, %sw.bb184 ], [ 32041, %sw.bb183 ], [ 31684, %sw.bb182 ], [ 31329, %sw.bb181 ], [ 30976, %sw.bb180 ], [ 30625, %sw.bb179 ], [ 30276, %sw.bb178 ], [ 29929, %sw.bb177 ], [ 29584, %sw.bb176 ], [ 29241, %sw.bb175 ], [ 28900, %sw.bb174 ], [ 28561, %sw.bb173 ], [ 28224, %sw.bb172 ], [ 27889, %sw.bb171 ], [ 27556, %sw.bb170 ], [ 27225, %sw.bb169 ], [ 26896, %sw.bb168 ], [ 26569, %sw.bb167 ], [ 26244, %sw.bb166 ], [ 25921, %sw.bb165 ], [ 25600, %sw.bb164 ], [ 25281, %sw.bb163 ], [ 24964, %sw.bb162 ], [ 24649, %sw.bb161 ], [ 24336, %sw.bb160 ], [ 24025, %sw.bb159 ], [ 23716, %sw.bb158 ], [ 23409, %sw.bb157 ], [ 23104, %sw.bb156 ], [ 22801, %sw.bb155 ], [ 22500, %sw.bb154 ], [ 22201, %sw.bb153 ], [ 21904, %sw.bb152 ], [ 21609, %sw.bb151 ], [ 21316, %sw.bb150 ], [ 21025, %sw.bb149 ], [ 20736, %sw.bb148 ], [ 20449, %sw.bb147 ], [ 20164, %sw.bb146 ], [ 19881, %sw.bb145 ], [ 19600, %sw.bb144 ], [ 19321, %sw.bb143 ], [ 19044, %sw.bb142 ], [ 18769, %sw.bb141 ], [ 18496, %sw.bb140 ], [ 18225, %sw.bb139 ], [ 17956, %sw.bb138 ], [ 17689, %sw.bb137 ], [ 17424, %sw.bb136 ], [ 17161, %sw.bb135 ], [ 16900, %sw.bb134 ], [ 16641, %sw.bb133 ], [ 16384, %sw.bb132 ], [ 16129, %sw.bb131 ], [ 15876, %sw.bb130 ], [ 15625, %sw.bb129 ], [ 15376, %sw.bb128 ], [ 15129, %sw.bb127 ], [ 14884, %sw.bb126 ], [ 14641, %sw.bb125 ], [ 14400, %sw.bb124 ], [ 14161, %sw.bb123 ], [ 13924, %sw.bb122 ], [ 13689, %sw.bb121 ], [ 13456, %sw.bb120 ], [ 13225, %sw.bb119 ], [ 12996, %sw.bb118 ], [ 12769, %sw.bb117 ], [ 12544, %sw.bb116 ], [ 12321, %sw.bb115 ], [ 12100, %sw.bb114 ], [ 11881, %sw.bb113 
], [ 11664, %sw.bb112 ], [ 11449, %sw.bb111 ], [ 11236, %sw.bb110 ], [ 11025, %sw.bb109 ], [ 10816, %sw.bb108 ], [ 10609, %sw.bb107 ], [ 10404, %sw.bb106 ], [ 10201, %sw.bb105 ], [ 10000, %sw.bb104 ], [ 9801, %sw.bb103 ], [ 9604, %sw.bb102 ], [ 9409, %sw.bb101 ], [ 9216, %sw.bb100 ], [ 9025, %sw.bb99 ], [ 8836, %sw.bb98 ], [ 8649, %sw.bb97 ], [ 8464, %sw.bb96 ], [ 8281, %sw.bb95 ], [ 8100, %sw.bb94 ], [ 7921, %sw.bb93 ], [ 7744, %sw.bb92 ], [ 7569, %sw.bb91 ], [ 7396, %sw.bb90 ], [ 7225, %sw.bb89 ], [ 7056, %sw.bb88 ], [ 6889, %sw.bb87 ], [ 6724, %sw.bb86 ], [ 6561, %sw.bb85 ], [ 6400, %sw.bb84 ], [ 6241, %sw.bb83 ], [ 6084, %sw.bb82 ], [ 5929, %sw.bb81 ], [ 5776, %sw.bb80 ], [ 5625, %sw.bb79 ], [ 5476, %sw.bb78 ], [ 5329, %sw.bb77 ], [ 5184, %sw.bb76 ], [ 5112, %sw.bb74 ], [ 4900, %sw.bb73 ], [ 4761, %sw.bb72 ], [ 4624, %sw.bb71 ], [ 4489, %sw.bb70 ], [ 4356, %sw.bb69 ], [ 4225, %sw.bb68 ], [ 4096, %sw.bb67 ], [ 3969, %sw.bb66 ], [ 3844, %sw.bb65 ], [ 3721, %sw.bb64 ], [ 3600, %sw.bb63 ], [ 3481, %sw.bb62 ], [ 3364, %sw.bb61 ], [ 3249, %sw.bb60 ], [ 3136, %sw.bb59 ], [ 3025, %sw.bb58 ], [ 2970, %sw.bb56 ], [ 2809, %sw.bb55 ], [ 2704, %sw.bb54 ], [ 2601, %sw.bb53 ], [ 2500, %sw.bb52 ], [ 2401, %sw.bb51 ], [ 2304, %sw.bb50 ], [ 2209, %sw.bb49 ], [ 2116, %sw.bb48 ], [ 2025, %sw.bb47 ], [ 1980, %sw.bb45 ], [ 1849, %sw.bb44 ], [ 1764, %sw.bb43 ], [ 1681, %sw.bb42 ], [ 1600, %sw.bb41 ], [ 1521, %sw.bb40 ], [ 1444, %sw.bb39 ], [ 1369, %sw.bb38 ], [ 1296, %sw.bb37 ], [ 1260, %sw.bb35 ], [ 1156, %sw.bb34 ], [ 1089, %sw.bb33 ], [ 1024, %sw.bb32 ], [ 961, %sw.bb31 ], [ 900, %sw.bb30 ], [ 841, %sw.bb29 ], [ 784, %sw.bb28 ], [ 729, %sw.bb27 ], [ 676, %sw.bb26 ], [ 625, %sw.bb25 ], [ 576, %sw.bb24 ], [ 529, %sw.bb23 ], [ 484, %sw.bb22 ], [ 441, %sw.bb21 ], [ 400, %sw.bb20 ], [ 361, %sw.bb19 ], [ 342, %sw.bb18 ], [ 289, %sw.bb17 ], [ 256, %sw.bb16 ], [ 225, %sw.bb15 ], [ 196, %sw.bb14 ], [ 169, %sw.bb13 ], [ 144, %sw.bb12 ], [ 121, %sw.bb11 ], [ 100, %sw.bb10 ], [ 81, %sw.bb9 ], 
[ 64, %sw.bb8 ], [ 49, %sw.bb7 ], [ 36, %sw.bb6 ], [ 25, %sw.bb5 ], [ 16, %sw.bb4 ], [ 9, %sw.bb3 ], [ 4, %sw.bb2 ], [ 1, %sw.bb1 ], [ 39601, %sw.bb203 ], [ 0, %if.end ]
+  ret i32 %retval.0
+}
+
+define i32 @cprop(i32 %x) {
+entry:
+  switch i32 %x, label %sw.default [
+    i32 1, label %return
+    i32 2, label %sw.bb1
+    i32 3, label %sw.bb2
+    i32 4, label %sw.bb2
+    i32 5, label %sw.bb2
+    i32 6, label %sw.bb3
+    i32 7, label %sw.bb3
+  ]
+
+sw.bb1: br label %return
+
+sw.bb2:
+  %and = and i32 %x, 1
+  %tobool = icmp ne i32 %and, 0
+  %cond = select i1 %tobool, i32 -123, i32 456
+  %sub = sub nsw i32 %x, %cond
+  br label %return
+
+sw.bb3:
+  %trunc = trunc i32 %x to i8
+  %sext = sext i8 %trunc to i32
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 123, %sw.default ], [ %sext, %sw.bb3 ], [ %sub, %sw.bb2 ], [ 42, %sw.bb1 ], [ 5, %entry ]
+  ret i32 %retval.0
+
+; CHECK: @cprop
+; CHECK: switch.lookup:
+; CHECK: %switch.gep = getelementptr inbounds [7 x i32]* @switch.table5, i32 0, i32 %switch.tableidx
+}
+
+define i32 @unreachable(i32 %x)  {
+entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb
+    i32 3, label %sw.bb1
+    i32 4, label %sw.bb2
+    i32 5, label %sw.bb3
+    i32 6, label %sw.bb3
+    i32 7, label %sw.bb3
+    i32 8, label %sw.bb3
+  ]
+
+sw.bb: br label %return
+sw.bb1: unreachable
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: unreachable
+
+return:
+  %retval.0 = phi i32 [ 1, %sw.bb3 ], [ -1, %sw.bb2 ], [ 0, %sw.bb ]
+  ret i32 %retval.0
+
+; CHECK: @unreachable
+; CHECK: switch.lookup:
+; CHECK: getelementptr inbounds [5 x i32]* @switch.table6, i32 0, i32 %switch.tableidx
+}
diff --git a/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll b/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
index 65d888ea01e10..028fb0745631f 100644
--- a/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
+++ b/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
@@ -85,3 +85,31 @@ if.end7:                                          ; preds = %if.else, %if.then4,
 ; CHECK: if.end7:
 ; CHECK: phi i32* [ %a, %if.then ], [ null, %if.then4 ], [ null, %if.else ]
 }
+
+define i32 @test4(i32* %a, i32 %b, i32* %c, i32 %d) nounwind {
+entry:
+  %tobool = icmp eq i32 %b, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void @bar() nounwind
+  br label %if.end7
+
+if.else:                                          ; preds = %entry
+  %tobool3 = icmp eq i32 %d, 0
+  br i1 %tobool3, label %if.end7, label %if.then4
+
+if.then4:                                         ; preds = %if.else
+  tail call void @bar() nounwind
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.else, %if.then4, %if.then
+  %x.0 = phi i32* [ %a, %if.then ], [ null, %if.then4 ], [ null, %if.else ]
+  %gep = getelementptr i32* %x.0, i32 10
+  %tmp9 = load i32* %gep
+  %tmp10 = or i32 %tmp9, 1
+  store i32 %tmp10, i32* %gep
+  ret i32 %tmp9
+; CHECK: @test4
+; CHECK-NOT: phi
+}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
new file mode 100644
index 0000000000000..53d5448372daf
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
@@ -0,0 +1,37 @@
+; RUN: opt -simplifycfg -S -o - < %s | FileCheck %s
+
+; This test case was written to trigger an incorrect assert statement in
+; -simplifycfg.  We don't actually want to check the output, just that
+; -simplifycfg ran successfully, so we only check that the function still
+; exists, and that it still calls foo().
+;
+; NOTE: There are some obviously dead blocks and missing branch weight
+;       metadata.  Both of these features were key to triggering the assert.
+;       Additionally, the not-taken weight of the branch with a weight had to
+;       be 0 to trigger the assert.
+
+declare void @foo() nounwind uwtable
+
+define void @func(i32 %A) nounwind uwtable {
+; CHECK: define void @func
+entry:
+  %cmp11 = icmp eq i32 %A, 1
+  br i1 %cmp11, label %if.then, label %if.else, !prof !0
+
+if.then:
+  call void @foo()
+; CHECK: call void @foo()
+  br label %if.else
+
+if.else:
+  %cmp17 = icmp eq i32 %A, 2
+  br i1 %cmp17, label %if.then2, label %if.end
+
+if.then2:
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 1, i32 0}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
new file mode 100644
index 0000000000000..941f5ad9d5b64
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
@@ -0,0 +1,140 @@
+; RUN: opt -simplifycfg -S -o - < %s | FileCheck %s
+
+declare void @func2(i32)
+declare void @func4(i32)
+declare void @func6(i32)
+declare void @func8(i32)
+
+;; test1 - create a switch with case 2 and case 4 from two branches: N == 2
+;; and N == 4.
+define void @test1(i32 %N) nounwind uwtable {
+entry:
+  %cmp = icmp eq i32 %N, 2
+  br i1 %cmp, label %if.then, label %if.else, !prof !0
+; CHECK: test1
+; CHECK: switch i32 %N
+; CHECK: ], !prof !0
+
+if.then:
+  call void @func2(i32 %N) nounwind
+  br label %if.end9
+
+if.else:
+  %cmp2 = icmp eq i32 %N, 4
+  br i1 %cmp2, label %if.then7, label %if.else8, !prof !1
+
+if.then7:
+  call void @func4(i32 %N) nounwind
+  br label %if.end
+
+if.else8:
+  call void @func8(i32 %N) nounwind
+  br label %if.end
+
+if.end:
+  br label %if.end9
+
+if.end9:
+  ret void
+}
+
+;; test2 - Merge two switches where PredDefault == BB.
+define void @test2(i32 %M, i32 %N) nounwind uwtable {
+entry:
+  %cmp = icmp sgt i32 %M, 2
+  br i1 %cmp, label %sw1, label %sw2
+
+sw1:
+  switch i32 %N, label %sw2 [
+    i32 2, label %sw.bb
+    i32 3, label %sw.bb1
+  ], !prof !2
+; CHECK: test2
+; CHECK: switch i32 %N, label %sw.epilog
+; CHECK: i32 2, label %sw.bb
+; CHECK: i32 3, label %sw.bb1
+; CHECK: i32 4, label %sw.bb5
+; CHECK: ], !prof !1
+
+sw.bb:
+  call void @func2(i32 %N) nounwind
+  br label %sw.epilog
+
+sw.bb1:
+  call void @func4(i32 %N) nounwind
+  br label %sw.epilog
+
+sw2:
+;; Here "case 2" is invalidated if control is transferred through the default
+;; case of the first switch.
+  switch i32 %N, label %sw.epilog [
+    i32 2, label %sw.bb4
+    i32 4, label %sw.bb5
+  ], !prof !3
+
+sw.bb4:
+  call void @func6(i32 %N) nounwind
+  br label %sw.epilog
+
+sw.bb5:
+  call void @func8(i32 %N) nounwind
+  br label %sw.epilog
+
+sw.epilog:
+  ret void
+}
+
+;; test3 - Merge two switches where PredDefault != BB.
+define void @test3(i32 %M, i32 %N) nounwind uwtable {
+entry:
+  %cmp = icmp sgt i32 %M, 2
+  br i1 %cmp, label %sw1, label %sw2
+
+sw1:
+  switch i32 %N, label %sw.bb [
+    i32 2, label %sw2
+    i32 3, label %sw2
+    i32 1, label %sw.bb1
+  ], !prof !4
+; CHECK: test3
+; CHECK: switch i32 %N, label %sw.bb
+; CHECK: i32 1, label %sw.bb1
+; CHECK: i32 3, label %sw.bb4
+; CHECK: i32 2, label %sw.epilog
+; CHECK: ], !prof !3
+
+sw.bb:
+  call void @func2(i32 %N) nounwind
+  br label %sw.epilog
+
+sw.bb1:
+  call void @func4(i32 %N) nounwind
+  br label %sw.epilog
+
+sw2:
+  switch i32 %N, label %sw.epilog [
+    i32 3, label %sw.bb4
+    i32 4, label %sw.bb5
+  ], !prof !5
+
+sw.bb4:
+  call void @func6(i32 %N) nounwind
+  br label %sw.epilog
+
+sw.bb5:
+  call void @func8(i32 %N) nounwind
+  br label %sw.epilog
+
+sw.epilog:
+  ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
+; CHECK: !0 = metadata !{metadata !"branch_weights", i32 256, i32 4352, i32 16}
+!2 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 8}
+!3 = metadata !{metadata !"branch_weights", i32 8, i32 8, i32 4}
+; CHECK: !1 = metadata !{metadata !"branch_weights", i32 32, i32 48, i32 96, i32 16}
+!4 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 3}
+!5 = metadata !{metadata !"branch_weights", i32 17, i32 13, i32 9}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7, i32 3, i32 4, i32 6}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
index c7917857ee605..beef527008204 100644
--- a/test/Transforms/SimplifyCFG/preserve-branchweights.ll
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
@@ -79,10 +79,238 @@ Z:
   ret void
 }
 
+;; test5 - The case where it jumps to the default target will be removed.
+define void @test5(i32 %M, i32 %N) nounwind uwtable {
+entry:
+  switch i32 %N, label %sw2 [
+    i32 1, label %sw2
+    i32 2, label %sw.bb
+    i32 3, label %sw.bb1
+  ], !prof !3
+; CHECK: test5
+; CHECK: switch i32 %N, label %sw2 [
+; CHECK: i32 3, label %sw.bb1
+; CHECK: i32 2, label %sw.bb
+; CHECK: ], !prof !2
+
+sw.bb:
+  call void @helper(i32 0)
+  br label %sw.epilog
+
+sw.bb1:
+  call void @helper(i32 1)
+  br label %sw.epilog
+
+sw2:
+  call void @helper(i32 2)
+  br label %sw.epilog
+
+sw.epilog:
+  ret void
+}
+
+;; test6 - Some cases of the second switch are pruned during optimization.
+;; Then the second switch will be converted to a branch, finally, the first
+;; switch and the branch will be merged into a single switch.
+define void @test6(i32 %M, i32 %N) nounwind uwtable {
+entry:
+  switch i32 %N, label %sw2 [
+    i32 1, label %sw2
+    i32 2, label %sw.bb
+    i32 3, label %sw.bb1
+  ], !prof !4
+; CHECK: test6
+; CHECK: switch i32 %N, label %sw.epilog
+; CHECK: i32 3, label %sw.bb1
+; CHECK: i32 2, label %sw.bb
+; CHECK: i32 4, label %sw.bb5
+; CHECK: ], !prof !3
+
+sw.bb:
+  call void @helper(i32 0)
+  br label %sw.epilog
+
+sw.bb1:
+  call void @helper(i32 1)
+  br label %sw.epilog
+
+sw2:
+;; Here "case 2" is invalidated since the default case of the first switch
+;; does not include "case 2".
+  switch i32 %N, label %sw.epilog [
+    i32 2, label %sw.bb4
+    i32 4, label %sw.bb5
+  ], !prof !5
+
+sw.bb4:
+  call void @helper(i32 2)
+  br label %sw.epilog
+
+sw.bb5:
+  call void @helper(i32 3)
+  br label %sw.epilog
+
+sw.epilog:
+  ret void
+}
+
+;; This test is based on test1 but swaps the targets of the second branch.
+define void @test1_swap(i1 %a, i1 %b) {
+; CHECK: @test1_swap
+entry:
+  br i1 %a, label %Y, label %X, !prof !0
+; CHECK: br i1 %or.cond, label %Y, label %Z, !prof !4
+
+X:
+  %c = or i1 %b, false
+  br i1 %c, label %Y, label %Z, !prof !1
+
+Y:
+  call void @helper(i32 0)
+  ret void
+
+Z:
+  call void @helper(i32 1)
+  ret void
+}
+
+define void @test7(i1 %a, i1 %b) {
+; CHECK: @test7
+entry:
+  %c = or i1 %b, false
+  br i1 %a, label %Y, label %X, !prof !0
+; CHECK: br i1 %brmerge, label %Y, label %Z, !prof !5
+
+X:
+  br i1 %c, label %Y, label %Z, !prof !6
+
+Y:
+  call void @helper(i32 0)
+  ret void
+
+Z:
+  call void @helper(i32 1)
+  ret void
+}
+
+; Test basic folding to a conditional branch.
+define void @test8(i64 %x, i64 %y) nounwind {
+; CHECK: @test8
+entry:
+    %lt = icmp slt i64 %x, %y
+; CHECK: br i1 %lt, label %a, label %b, !prof !6
+    %qux = select i1 %lt, i32 0, i32 2
+    switch i32 %qux, label %bees [
+        i32 0, label %a
+        i32 1, label %b
+        i32 2, label %b
+    ], !prof !7
+a:
+    call void @helper(i32 0) nounwind
+    ret void
+b:
+    call void @helper(i32 1) nounwind
+    ret void
+bees:
+    call void @helper(i32 2) nounwind
+    ret void
+}
+
+; Test edge splitting when the default target has icmp and unconditional
+; branch
+define i1 @test9(i32 %x, i32 %y) nounwind {
+; CHECK: @test9
+entry:
+    switch i32 %x, label %bees [
+        i32 0, label %a
+        i32 1, label %end
+        i32 2, label %end
+    ], !prof !7
+; CHECK: switch i32 %x, label %bees [
+; CHECK: i32 0, label %a
+; CHECK: i32 1, label %end
+; CHECK: i32 2, label %end
+; CHECK: i32 92, label %end
+; CHECK: ], !prof !7
+
+a:
+    call void @helper(i32 0) nounwind
+    %reta = icmp slt i32 %x, %y
+    ret i1 %reta
+
+bees:
+    %tmp = icmp eq i32 %x, 92
+    br label %end
+
+end:
+; CHECK: end:
+; CHECK: %ret = phi i1 [ true, %entry ], [ false, %bees ], [ true, %entry ], [ true, %entry ]
+    %ret = phi i1 [ true, %entry ], [%tmp, %bees], [true, %entry]
+    call void @helper(i32 2) nounwind
+    ret i1 %ret
+}
+
+define void @test10(i32 %x) nounwind readnone ssp noredzone {
+entry:
+ switch i32 %x, label %lor.rhs [
+   i32 2, label %lor.end
+   i32 1, label %lor.end
+   i32 3, label %lor.end
+ ], !prof !7
+
+lor.rhs:
+ call void @helper(i32 1) nounwind
+ ret void
+
+lor.end:
+ call void @helper(i32 0) nounwind
+ ret void
+
+; CHECK: test10
+; CHECK: %x.off = add i32 %x, -1
+; CHECK: %switch = icmp ult i32 %x.off, 3
+; CHECK: br i1 %switch, label %lor.end, label %lor.rhs, !prof !8
+}
+
+; Remove dead cases from the switch.
+define void @test11(i32 %x) nounwind {
+  %i = shl i32 %x, 1
+  switch i32 %i, label %a [
+    i32 21, label %b
+    i32 24, label %c
+  ], !prof !8
+; CHECK: %cond = icmp eq i32 %i, 24
+; CHECK: br i1 %cond, label %c, label %a, !prof !9
+
+a:
+ call void @helper(i32 0) nounwind
+ ret void
+b:
+ call void @helper(i32 1) nounwind
+ ret void
+c:
+ call void @helper(i32 2) nounwind
+ ret void
+}
+
 !0 = metadata !{metadata !"branch_weights", i32 3, i32 5}
 !1 = metadata !{metadata !"branch_weights", i32 1, i32 1}
 !2 = metadata !{metadata !"branch_weights", i32 1, i32 2}
+!3 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1}
+!4 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1}
+!5 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 5}
+!6 = metadata !{metadata !"branch_weights", i32 1, i32 3}
+!7 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8, i32 7}
+!8 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8}
 
 ; CHECK: !0 = metadata !{metadata !"branch_weights", i32 5, i32 11}
 ; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 5}
-; CHECK-NOT: !2
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 7, i32 1, i32 2}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 49, i32 12, i32 24, i32 35}
+; CHECK: !4 = metadata !{metadata !"branch_weights", i32 11, i32 5}
+; CHECK: !5 = metadata !{metadata !"branch_weights", i32 17, i32 15} 
+; CHECK: !6 = metadata !{metadata !"branch_weights", i32 9, i32 7}
+; CHECK: !7 = metadata !{metadata !"branch_weights", i32 17, i32 9, i32 8, i32 7, i32 17}
+; CHECK: !8 = metadata !{metadata !"branch_weights", i32 24, i32 33}
+; CHECK: !9 = metadata !{metadata !"branch_weights", i32 8, i32 33}
+; CHECK-NOT: !9
diff --git a/test/Transforms/SimplifyCFG/sink-common-code.ll b/test/Transforms/SimplifyCFG/sink-common-code.ll
new file mode 100644
index 0000000000000..28d727938288e
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/sink-common-code.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK: test1
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.else:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+define zeroext i1 @test2(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK: test2
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.else:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp uge i32 %blksA, %add
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
diff --git a/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll b/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll
deleted file mode 100644
index 2717228f7ee1d..0000000000000
--- a/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; Test that we add nocapture to the declaration, and to the second call only.
-
-; CHECK: declare float @strtol(i8*, i8** nocapture, i32) nounwind
-declare float @strtol(i8* %s, i8** %endptr, i32 %base)
-
-define void @foo(i8* %x, i8** %endptr) {
-; CHECK:  call float @strtol(i8* %x, i8** %endptr, i32 10)
-  call float @strtol(i8* %x, i8** %endptr, i32 10)
-; CHECK: %2 = call float @strtol(i8* nocapture %x, i8** null, i32 10)
-  call float @strtol(i8* %x, i8** null, i32 10)
-  ret void
-}
diff --git a/test/Transforms/SimplifyLibCalls/FFS.ll b/test/Transforms/SimplifyLibCalls/FFS.ll
index e38d78349d43b..6aecbeacd7e6b 100644
--- a/test/Transforms/SimplifyLibCalls/FFS.ll
+++ b/test/Transforms/SimplifyLibCalls/FFS.ll
@@ -1,6 +1,7 @@
-; Test that the ToAsciiOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN:   not grep "call.*@ffs"
+; Test that FFSOpt works correctly
+; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
+
+; CHECK-NOT: call{{.*}}@ffs
 
 @non_const = external global i32		; <i32*> [#uses=1]
 
@@ -34,3 +35,11 @@ define i32 @a(i64) nounwind {
         %2 = call i32 @ffsll(i64 %0)            ; <i32> [#uses=1]
         ret i32 %2
 }
+
+; PR13028
+define i32 @b() nounwind {
+  %ffs = call i32 @ffsll(i64 0)
+  ret i32 %ffs
+; CHECK: @b
+; CHECK-NEXT: ret i32 0
+}
diff --git a/test/Transforms/SimplifyLibCalls/StpCpy.ll b/test/Transforms/SimplifyLibCalls/StpCpy.ll
deleted file mode 100644
index 914b0955bc90b..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StpCpy.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; Test that the StpCpyOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-
-@hello = constant [6 x i8] c"hello\00"
-
-declare i8* @stpcpy(i8*, i8*)
-
-declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
-
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
-
-define i32 @t1() {
-; CHECK: @t1
-  %target = alloca [1024 x i8]
-  %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
-  %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
-  %rslt1 = call i8* @stpcpy( i8* %arg1, i8* %arg2 )
-; CHECK: @llvm.memcpy.p0i8.p0i8.i32
-  ret i32 0
-}
-
-define i32 @t2() {
-; CHECK: @t2
-  %target = alloca [1024 x i8]
-  %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
-  %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
-  %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false)
-  %rslt1 = call i8* @__stpcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1)
-; CHECK: @__memcpy_chk
-  ret i32 0
-}
-
-define i8* @t3(i8* %arg) {
-; CHECK: @t3
-  %stpcpy = tail call i8* @stpcpy(i8* %arg, i8* %arg)
-; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen(i8* %arg)
-; CHECK-NEXT: getelementptr inbounds i8* %arg, i32 [[LEN]]
-  ret i8* %stpcpy
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrCat.ll b/test/Transforms/SimplifyLibCalls/StrCat.ll
deleted file mode 100644
index 3ea691a3cfbe2..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrCat.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; Test that the StrCatOptimizer works correctly
-; PR3661
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN:   not grep "call.*strcat"
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN:   grep "puts.*%arg1"
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "-p:64:64:64"
-
-@hello = constant [6 x i8] c"hello\00"		; <[6 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer		; <[1 x i8]*> [#uses=1]
-@null_hello = constant [7 x i8] c"\00hello\00"		; <[7 x i8]*> [#uses=1]
-
-declare i8* @strcat(i8*, i8*)
-
-declare i32 @puts(i8*)
-
-define i32 @main() {
-	%target = alloca [1024 x i8]		; <[1024 x i8]*> [#uses=1]
-	%arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0		; <i8*> [#uses=2]
-	store i8 0, i8* %arg1
-	%arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt1 = call i8* @strcat( i8* %arg1, i8* %arg2 )		; <i8*> [#uses=1]
-	%arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt2 = call i8* @strcat( i8* %rslt1, i8* %arg3 )		; <i8*> [#uses=1]
-	%arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt3 = call i8* @strcat( i8* %rslt2, i8* %arg4 )		; <i8*> [#uses=1]
-	call i32 @puts( i8* %rslt3 )		; <i32>:1 [#uses=0]
-	ret i32 0
-}
-
diff --git a/test/Transforms/SimplifyLibCalls/StrChr.ll b/test/Transforms/SimplifyLibCalls/StrChr.ll
deleted file mode 100644
index eaabeb2feb8f9..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrChr.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; Test that the StrChrOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "-p:64:64:64"
-
-@hello = constant [14 x i8] c"hello world\5Cn\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i8* @strchr(i8*, i32)
-
-define i32 @foo(i32 %index) {
-	%hello_p = getelementptr [14 x i8]* @hello, i32 0, i32 0
-	%null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
-	%world = call i8* @strchr(i8* %hello_p, i32 119)
-; CHECK: getelementptr i8* %hello_p, i64 6
-	%ignore = call i8* @strchr(i8* %null_p, i32 119)
-; CHECK-NOT: call i8* strchr
-	%null = call i8* @strchr(i8* %hello_p, i32 0)
-; CHECK: getelementptr i8* %hello_p, i64 13
-	%result = call i8* @strchr(i8* %hello_p, i32 %index)
-; CHECK: call i8* @memchr(i8* %hello_p, i32 %index, i64 14)
-	ret i32 %index
-}
-
diff --git a/test/Transforms/SimplifyLibCalls/StrCmp.ll b/test/Transforms/SimplifyLibCalls/StrCmp.ll
deleted file mode 100644
index 60854d76c97ae..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrCmp.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; Test that the StrCmpOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-
-@hello = constant [6 x i8] c"hello\00"		; <[6 x i8]*> [#uses=1]
-@hell = constant [5 x i8] c"hell\00"		; <[5 x i8]*> [#uses=1]
-@bell = constant [5 x i8] c"bell\00"		; <[5 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer		; <[1 x i8]*> [#uses=1]
-
-declare i32 @strcmp(i8*, i8*)
-
-; strcmp("", x) -> -*x
-define i32 @test1(i8* %str) {
-  %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i8* %str)
-  ret i32 %temp1
-  ; CHECK: @test1
-  ; CHECK: %strcmpload = load i8* %str
-  ; CHECK: %1 = zext i8 %strcmpload to i32
-  ; CHECK: %temp1 = sub i32 0, %1
-  ; CHECK: ret i32 %temp1
-}
-
-; strcmp(x, "") -> *x
-define i32 @test2(i8* %str) {
-  %temp1 = call i32 @strcmp(i8* %str, i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0))
-  ret i32 %temp1
-  ; CHECK: @test2
-  ; CHECK: %strcmpload = load i8* %str
-  ; CHECK: %temp1 = zext i8 %strcmpload to i32
-  ; CHECK: ret i32 %temp1
-}
-
-; strcmp(x, y)  -> cnst
-define i32 @test3() {
-  %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0))
-  ret i32 %temp1
-  ; CHECK: @test3
-  ; CHECK: ret i32 -1
-}
-define i32 @test4() {
-  %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0))
-  ret i32 %temp1
-  ; CHECK: @test4
-  ; CHECK: ret i32 1
-}
-
-; strcmp(x, y)   -> memcmp(x, y, <known length>)
-; (This transform is rather difficult to trigger in a useful manner)
-define i32 @test5(i1 %b) {
-  %sel = select i1 %b, i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([5 x i8]* @bell, i32 0, i32 0)
-  %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %sel)
-  ret i32 %temp1
-  ; CHECK: @test5
-  ; CHECK: %memcmp = call i32 @memcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %sel, i32 5)
-  ; CHECK: ret i32 %memcmp
-}
-
-; strcmp(x,x)  -> 0
-define i32 @test6(i8* %str) {
-  %temp1 = call i32 @strcmp(i8* %str, i8* %str)
-  ret i32 %temp1
-  ; CHECK: @test6
-  ; CHECK: ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrCpy.ll b/test/Transforms/SimplifyLibCalls/StrCpy.ll
deleted file mode 100644
index 83406ff8f868a..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrCpy.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; Test that the StrCpyOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-
-@hello = constant [6 x i8] c"hello\00"
-
-declare i8* @strcpy(i8*, i8*)
-
-declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
-
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
-
-; rdar://6839935
-
-define i32 @t1() {
-; CHECK: @t1
-  %target = alloca [1024 x i8]
-  %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
-  %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
-  %rslt1 = call i8* @strcpy( i8* %arg1, i8* %arg2 )
-; CHECK: @llvm.memcpy.p0i8.p0i8.i32
-  ret i32 0
-}
-
-define i32 @t2() {
-; CHECK: @t2
-  %target = alloca [1024 x i8]
-  %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
-  %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
-  %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false)
-  %rslt1 = call i8* @__strcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1)
-; CHECK: @__memcpy_chk
-  ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrLen.ll b/test/Transforms/SimplifyLibCalls/StrLen.ll
deleted file mode 100644
index 4a20bbd2ce81d..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrLen.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; Test that the StrCatOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN:    not grep "call.*strlen"
-
-target datalayout = "e-p:32:32"
-@hello = constant [6 x i8] c"hello\00"		; <[6 x i8]*> [#uses=3]
-@null = constant [1 x i8] zeroinitializer		; <[1 x i8]*> [#uses=3]
-@null_hello = constant [7 x i8] c"\00hello\00"		; <[7 x i8]*> [#uses=1]
-@nullstring = constant i8 0
-
-declare i32 @strlen(i8*)
-
-define i32 @test1() {
-	%hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%hello_l = call i32 @strlen( i8* %hello_p )		; <i32> [#uses=1]
-	ret i32 %hello_l
-}
-
-define i32 @test2() {
-	%null_p = getelementptr [1 x i8]* @null, i32 0, i32 0		; <i8*> [#uses=1]
-	%null_l = call i32 @strlen( i8* %null_p )		; <i32> [#uses=1]
-	ret i32 %null_l
-}
-
-define i32 @test3() {
-	%null_hello_p = getelementptr [7 x i8]* @null_hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%null_hello_l = call i32 @strlen( i8* %null_hello_p )		; <i32> [#uses=1]
-	ret i32 %null_hello_l
-}
-
-define i1 @test4() {
-	%hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%hello_l = call i32 @strlen( i8* %hello_p )		; <i32> [#uses=1]
-	%eq_hello = icmp eq i32 %hello_l, 0		; <i1> [#uses=1]
-	ret i1 %eq_hello
-}
-
-define i1 @test5() {
-	%null_p = getelementptr [1 x i8]* @null, i32 0, i32 0		; <i8*> [#uses=1]
-	%null_l = call i32 @strlen( i8* %null_p )		; <i32> [#uses=1]
-	%eq_null = icmp eq i32 %null_l, 0		; <i1> [#uses=1]
-	ret i1 %eq_null
-}
-
-define i1 @test6() {
-	%hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%hello_l = call i32 @strlen( i8* %hello_p )		; <i32> [#uses=1]
-	%ne_hello = icmp ne i32 %hello_l, 0		; <i1> [#uses=1]
-	ret i1 %ne_hello
-}
-
-define i1 @test7() {
-	%null_p = getelementptr [1 x i8]* @null, i32 0, i32 0		; <i8*> [#uses=1]
-	%null_l = call i32 @strlen( i8* %null_p )		; <i32> [#uses=1]
-	%ne_null = icmp ne i32 %null_l, 0		; <i1> [#uses=1]
-	ret i1 %ne_null
-}
-
-define i32 @test8() {
-	%len = tail call i32 @strlen(i8* @nullstring) nounwind
-	ret i32 %len
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrNCat.ll b/test/Transforms/SimplifyLibCalls/StrNCat.ll
deleted file mode 100644
index 073792b96a1bb..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrNCat.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; Test that the StrNCatOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN:   not grep "call.*strncat"
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN:   grep "puts.*%arg1"
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "-p:64:64:64"
-
-@hello = constant [6 x i8] c"hello\00"		; <[6 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer		; <[1 x i8]*> [#uses=1]
-@null_hello = constant [7 x i8] c"\00hello\00"		; <[7 x i8]*> [#uses=1]
-
-declare i8* @strncat(i8*, i8*, i32)
-
-declare i32 @puts(i8*)
-
-define i32 @main() {
-	%target = alloca [1024 x i8]		; <[1024 x i8]*> [#uses=1]
-	%arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0		; <i8*> [#uses=2]
-	store i8 0, i8* %arg1
-	%arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt1 = call i8* @strncat( i8* %arg1, i8* %arg2, i32 6 )		; <i8*> [#uses=1]
-	%arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt2 = call i8* @strncat( i8* %rslt1, i8* %arg3, i32 42 )		; <i8*> [#uses=1]
-	%arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt3 = call i8* @strncat( i8* %rslt2, i8* %arg4, i32 42 )		; <i8*> [#uses=1]
-	call i32 @puts( i8* %rslt3 )		; <i32>:1 [#uses=0]
-	ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrNCmp.ll b/test/Transforms/SimplifyLibCalls/StrNCmp.ll
deleted file mode 100644
index 0b2a501a3c8a7..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrNCmp.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; Test that the StrCmpOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-
-@hello = constant [6 x i8] c"hello\00"		; <[6 x i8]*> [#uses=1]
-@hell = constant [5 x i8] c"hell\00"		; <[5 x i8]*> [#uses=1]
-@bell = constant [5 x i8] c"bell\00"		; <[5 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer		; <[1 x i8]*> [#uses=1]
-
-declare i32 @strncmp(i8*, i8*, i32)
-
-; strcmp("", x) -> -*x
-define i32 @test1(i8* %str) {
-  %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i8* %str, i32 10)
-  ret i32 %temp1
-  ; CHECK: @test1
-  ; CHECK: %strcmpload = load i8* %str
-  ; CHECK: %1 = zext i8 %strcmpload to i32
-  ; CHECK: %temp1 = sub i32 0, %1
-  ; CHECK: ret i32 %temp1
-}
-
-; strcmp(x, "") -> *x
-define i32 @test2(i8* %str) {
-  %temp1 = call i32 @strncmp(i8* %str, i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i32 10)
-  ret i32 %temp1
-  ; CHECK: @test2
-  ; CHECK: %strcmpload = load i8* %str
-  ; CHECK: %temp1 = zext i8 %strcmpload to i32
-  ; CHECK: ret i32 %temp1
-}
-
-; strncmp(x, y, n)  -> cnst
-define i32 @test3() {
-  %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i32 10)
-  ret i32 %temp1
-  ; CHECK: @test3
-  ; CHECK: ret i32 -1
-}
-define i32 @test4() {
-  %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i32 10)
-  ret i32 %temp1
-  ; CHECK: @test4
-  ; CHECK: ret i32 1
-}
-define i32 @test5() {
-  %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i32 4)
-  ret i32 %temp1
-  ; CHECK: @test5
-  ; CHECK: ret i32 0
-}
-
-; strncmp(x,y,1) -> memcmp(x,y,1)
-define i32 @test6(i8* %str1, i8* %str2) {
-  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 1)
-  ret i32 %temp1
-  ; CHECK: @test6
-  ; CHECK: load i8*
-  ; CHECK: load i8*
-  ; CHECK: sub i32
-}
-
-; strncmp(x,y,0)   -> 0
-define i32 @test7(i8* %str1, i8* %str2) {
-  %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 0)
-  ret i32 %temp1
-  ; CHECK: @test7
-  ; CHECK: ret i32 0
-}
-
-; strncmp(x,x,n)  -> 0
-define i32 @test8(i8* %str, i32 %n) {
-  %temp1 = call i32 @strncmp(i8* %str, i8* %str, i32 %n)
-  ret i32 %temp1
-  ; CHECK: @test8
-  ; CHECK: ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrNCpy.ll b/test/Transforms/SimplifyLibCalls/StrNCpy.ll
deleted file mode 100644
index 4e47b31a6afa9..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrNCpy.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; Test that the StrNCpyOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN:   not grep "call.*strncpy"
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "-p:64:64:64"
-
-@hello = constant [6 x i8] c"hello\00"		; <[6 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer		; <[1 x i8]*> [#uses=1]
-@null_hello = constant [7 x i8] c"\00hello\00"		; <[7 x i8]*> [#uses=1]
-
-declare i8* @strncpy(i8*, i8*, i32)
-
-declare i32 @puts(i8*)
-
-define i32 @main() {
-	%target = alloca [1024 x i8]		; <[1024 x i8]*> [#uses=1]
-	%arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0		; <i8*> [#uses=2]
-	store i8 0, i8* %arg1
-	%arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt1 = call i8* @strncpy( i8* %arg1, i8* %arg2, i32 6 )		; <i8*> [#uses=1]
-	%arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt2 = call i8* @strncpy( i8* %rslt1, i8* %arg3, i32 42 )		; <i8*> [#uses=1]
-	%arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0		; <i8*> [#uses=1]
-	%rslt3 = call i8* @strncpy( i8* %rslt2, i8* %arg4, i32 42 )		; <i8*> [#uses=1]
-	call i32 @puts( i8* %rslt3 )		; <i32>:1 [#uses=0]
-	ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrPBrk.ll b/test/Transforms/SimplifyLibCalls/StrPBrk.ll
deleted file mode 100644
index 29c3b7477b47f..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrPBrk.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "-p:64:64:64"
-
-@hello = constant [12 x i8] c"hello world\00"
-@w = constant [2 x i8] c"w\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i8* @strpbrk(i8*, i8*)
-
-define void @test(i8* %s1, i8* %s2) {
-	%hello_p = getelementptr [12 x i8]* @hello, i32 0, i32 0
-	%w_p = getelementptr [2 x i8]* @w, i32 0, i32 0
-	%null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
-	%test1 = call i8* @strpbrk(i8* %null_p, i8* %s2)
-	%test2 = call i8* @strpbrk(i8* %s1, i8* %null_p)
-; CHECK-NOT: call i8* @strpbrk
-	%test3 = call i8* @strpbrk(i8* %s1, i8* %w_p)
-; CHECK: call i8* @strchr(i8* %s1, i32 119)
-	%test4 = call i8* @strpbrk(i8* %hello_p, i8* %w_p)
-; CHECK: getelementptr i8* %hello_p, i64 6
-	%test5 = call i8* @strpbrk(i8* %s1, i8* %s2)
-; CHECK: call i8* @strpbrk(i8* %s1, i8* %s2)
-	ret void
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrRChr.ll b/test/Transforms/SimplifyLibCalls/StrRChr.ll
deleted file mode 100644
index 2259fc0289fb8..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrRChr.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; Test that the StrRChrOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "-p:64:64:64"
-
-@hello = constant [14 x i8] c"hello world\5Cn\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i8* @strrchr(i8*, i32)
-
-define void @foo(i8* %bar) {
-	%hello_p = getelementptr [14 x i8]* @hello, i32 0, i32 0
-	%null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
-	%world = call i8* @strrchr(i8* %hello_p, i32 119)
-; CHECK: getelementptr i8* %hello_p, i64 6
-	%ignore = call i8* @strrchr(i8* %null_p, i32 119)
-; CHECK-NOT: call i8* strrchr
-	%null = call i8* @strrchr(i8* %hello_p, i32 0)
-; CHECK: getelementptr i8* %hello_p, i64 13
-	%strchr = call i8* @strrchr(i8* %bar, i32 0)
-; CHECK: call i8* @strchr(i8* %bar, i32 0)
-	ret void
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrSpn.ll b/test/Transforms/SimplifyLibCalls/StrSpn.ll
deleted file mode 100644
index 800c19088337a..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrSpn.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "-p:64:64:64"
-
-@abcba = constant [6 x i8] c"abcba\00"
-@abc = constant [4 x i8] c"abc\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i64 @strspn(i8*, i8*)
-
-define i64 @testspn(i8* %s1, i8* %s2) {
-  	%abcba_p = getelementptr [6 x i8]* @abcba, i32 0, i32 0
-	%abc_p = getelementptr [4 x i8]* @abc, i32 0, i32 0
-	%null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
-	%test1 = call i64 @strspn(i8* %s1, i8* %null_p)
-	%test2 = call i64 @strspn(i8* %null_p, i8* %s2)
-	%test3 = call i64 @strspn(i8* %abcba_p, i8* %abc_p)
-; CHECK-NOT: call i64 @strspn
-	%test4 = call i64 @strspn(i8* %s1, i8* %s2)
-; CHECK: call i64 @strspn(i8* %s1, i8* %s2)
-	ret i64 %test3
-; CHECK: ret i64 5
-}
-
-declare i64 @strcspn(i8*, i8*)
-
-define i64 @testcspn(i8* %s1, i8* %s2) {
-  	%abcba_p = getelementptr [6 x i8]* @abcba, i32 0, i32 0
-	%abc_p = getelementptr [4 x i8]* @abc, i32 0, i32 0
-	%null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
-	%test1 = call i64 @strcspn(i8* %s1, i8* %null_p)
-; CHECK: call i64 @strlen(i8* %s1)
-	%test2 = call i64 @strcspn(i8* %null_p, i8* %s2)
-	%test3 = call i64 @strcspn(i8* %abcba_p, i8* %abc_p)
-; CHECK-NOT: call i64 @strcspn
-	%test4 = call i64 @strcspn(i8* %s1, i8* %s2)
-; CHECK: call i64 @strcspn(i8* %s1, i8* %s2)
-        %add0 = add i64 %test1, %test3
-; CHECK: add i64 %{{.+}}, 0
-	ret i64 %add0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrStr.ll b/test/Transforms/SimplifyLibCalls/StrStr.ll
deleted file mode 100644
index eefd2e8006ab8..0000000000000
--- a/test/Transforms/SimplifyLibCalls/StrStr.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-; PR5783
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-target triple = "i386-apple-darwin9.0"
-
-@.str = private constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
-@.str1 = private constant [2 x i8] c"a\00"        ; <[2 x i8]*> [#uses=1]
-@.str2 = private constant [6 x i8] c"abcde\00"    ; <[6 x i8]*> [#uses=1]
-@.str3 = private constant [4 x i8] c"bcd\00"      ; <[4 x i8]*> [#uses=1]
-
-define i8* @test1(i8* %P) nounwind readonly {
-entry:
-  %call = tail call i8* @strstr(i8* %P, i8* getelementptr inbounds ([1 x i8]* @.str, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
-  ret i8* %call
-; strstr(P, "") -> P
-; CHECK: @test1
-; CHECK: ret i8* %P
-}
-
-declare i8* @strstr(i8*, i8* nocapture) nounwind readonly
-
-define i8* @test2(i8* %P) nounwind readonly {
-entry:
-  %call = tail call i8* @strstr(i8* %P, i8* getelementptr inbounds ([2 x i8]* @.str1, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
-  ret i8* %call
-; strstr(P, "a") -> strchr(P, 'a')
-; CHECK: @test2
-; CHECK: @strchr(i8* %P, i32 97)
-}
-
-define i8* @test3(i8* nocapture %P) nounwind readonly {
-entry:
-  %call = tail call i8* @strstr(i8* getelementptr inbounds ([6 x i8]* @.str2, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
-  ret i8* %call
-; strstr("abcde", "bcd") -> "abcde"+1
-; CHECK: @test3
-; CHECK: getelementptr inbounds ([6 x i8]* @.str2, i32 0, i64 1)
-}
-
-define i8* @test4(i8* %P) nounwind readonly {
-entry:
-  %call = tail call i8* @strstr(i8* %P, i8* %P) nounwind ; <i8*> [#uses=1]
-  ret i8* %call
-; strstr(P, P) -> P
-; CHECK: @test4
-; CHECK: ret i8* %P
-}
-
-define i1 @test5(i8* %P, i8* %Q) nounwind readonly {
-entry:
-  %call = tail call i8* @strstr(i8* %P, i8* %Q) nounwind ; <i8*> [#uses=1]
-  %cmp = icmp eq i8* %call, %P
-  ret i1 %cmp
-; CHECK: @test5
-; CHECK: [[LEN:%[a-z]+]] = call {{i[0-9]+}} @strlen(i8* %Q)
-; CHECK: [[NCMP:%[a-z]+]] = call {{i[0-9]+}} @strncmp(i8* %P, i8* %Q, {{i[0-9]+}} [[LEN]])
-; CHECK: icmp eq {{i[0-9]+}} [[NCMP]], 0
-; CHECK: ret i1
-}
diff --git a/test/Transforms/SimplifyLibCalls/double-float-shrink.ll b/test/Transforms/SimplifyLibCalls/double-float-shrink.ll
new file mode 100644
index 0000000000000..b4ab8b4ceb9d9
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/double-float-shrink.ll
@@ -0,0 +1,333 @@
+; RUN: opt  < %s -simplify-libcalls -enable-double-float-shrink -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define float @acos_test(float %f) nounwind readnone {
+; CHECK: acos_test
+    %conv = fpext float %f to double
+    %call = call double @acos(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @acosf(float %f)
+}
+
+define double @acos_test2(float %f) nounwind readnone {
+; CHECK: acos_test2
+    %conv = fpext float %f to double
+    %call = call double @acos(double %conv)
+    ret double %call
+; CHECK: call double @acos(double %conv)
+}
+
+define float @acosh_test(float %f) nounwind readnone {
+; CHECK: acosh_test
+    %conv = fpext float %f to double
+    %call = call double @acosh(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @acoshf(float %f)
+}
+
+define double @acosh_test2(float %f) nounwind readnone {
+; CHECK: acosh_test2
+    %conv = fpext float %f to double
+    %call = call double @acosh(double %conv)
+    ret double %call
+; CHECK: call double @acosh(double %conv)
+}
+
+define float @asin_test(float %f) nounwind readnone {
+; CHECK: asin_test
+    %conv = fpext float %f to double
+    %call = call double @asin(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @asinf(float %f)
+}
+
+define double @asin_test2(float %f) nounwind readnone {
+; CHECK: asin_test2
+    %conv = fpext float %f to double
+    %call = call double @asin(double %conv)
+    ret double %call
+; CHECK: call double @asin(double %conv)
+}
+
+define float @asinh_test(float %f) nounwind readnone {
+; CHECK: asinh_test
+    %conv = fpext float %f to double
+    %call = call double @asinh(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @asinhf(float %f)
+}
+
+define double @asinh_test2(float %f) nounwind readnone {
+; CHECK: asinh_test2
+    %conv = fpext float %f to double
+    %call = call double @asinh(double %conv)
+    ret double %call
+; CHECK: call double @asinh(double %conv)
+}
+
+define float @atan_test(float %f) nounwind readnone {
+; CHECK: atan_test
+    %conv = fpext float %f to double
+    %call = call double @atan(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @atanf(float %f)
+}
+
+define double @atan_test2(float %f) nounwind readnone {
+; CHECK: atan_test2
+    %conv = fpext float %f to double
+    %call = call double @atan(double %conv)
+    ret double %call
+; CHECK: call double @atan(double %conv)
+}
+define float @atanh_test(float %f) nounwind readnone {
+; CHECK: atanh_test
+    %conv = fpext float %f to double
+    %call = call double @atanh(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @atanhf(float %f)
+}
+
+define double @atanh_test2(float %f) nounwind readnone {
+; CHECK: atanh_test2
+    %conv = fpext float %f to double
+    %call = call double @atanh(double %conv)
+    ret double %call
+; CHECK: call double @atanh(double %conv)
+}
+define float @cbrt_test(float %f) nounwind readnone {
+; CHECK: cbrt_test
+    %conv = fpext float %f to double
+    %call = call double @cbrt(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @cbrtf(float %f)
+}
+
+define double @cbrt_test2(float %f) nounwind readnone {
+; CHECK: cbrt_test2
+    %conv = fpext float %f to double
+    %call = call double @cbrt(double %conv)
+    ret double %call
+; CHECK: call double @cbrt(double %conv)
+}
+define float @exp_test(float %f) nounwind readnone {
+; CHECK: exp_test
+    %conv = fpext float %f to double
+    %call = call double @exp(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @expf(float %f)
+}
+
+define double @exp_test2(float %f) nounwind readnone {
+; CHECK: exp_test2
+    %conv = fpext float %f to double
+    %call = call double @exp(double %conv)
+    ret double %call
+; CHECK: call double @exp(double %conv)
+}
+define float @expm1_test(float %f) nounwind readnone {
+; CHECK: expm1_test
+    %conv = fpext float %f to double
+    %call = call double @expm1(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @expm1f(float %f)
+}
+
+define double @expm1_test2(float %f) nounwind readnone {
+; CHECK: expm1_test2
+    %conv = fpext float %f to double
+    %call = call double @expm1(double %conv)
+    ret double %call
+; CHECK: call double @expm1(double %conv)
+}
+define float @exp10_test(float %f) nounwind readnone {
+; CHECK: exp10_test
+    %conv = fpext float %f to double
+    %call = call double @exp10(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @exp10f(float %f)
+}
+
+define double @exp10_test2(float %f) nounwind readnone {
+; CHECK: exp10_test2
+    %conv = fpext float %f to double
+    %call = call double @exp10(double %conv)
+    ret double %call
+; CHECK: call double @exp10(double %conv)
+}
+define float @log_test(float %f) nounwind readnone {
+; CHECK: log_test
+    %conv = fpext float %f to double
+    %call = call double @log(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @logf(float %f)
+}
+
+define double @log_test2(float %f) nounwind readnone {
+; CHECK: log_test2
+    %conv = fpext float %f to double
+    %call = call double @log(double %conv)
+    ret double %call
+; CHECK: call double @log(double %conv)
+}
+define float @log10_test(float %f) nounwind readnone {
+; CHECK: log10_test
+    %conv = fpext float %f to double
+    %call = call double @log10(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @log10f(float %f)
+}
+
+define double @log10_test2(float %f) nounwind readnone {
+; CHECK: log10_test2
+    %conv = fpext float %f to double
+    %call = call double @log10(double %conv)
+    ret double %call
+; CHECK: call double @log10(double %conv)
+}
+define float @log1p_test(float %f) nounwind readnone {
+; CHECK: log1p_test
+    %conv = fpext float %f to double
+    %call = call double @log1p(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @log1pf(float %f)
+}
+
+define double @log1p_test2(float %f) nounwind readnone {
+; CHECK: log1p_test2
+    %conv = fpext float %f to double
+    %call = call double @log1p(double %conv)
+    ret double %call
+; CHECK: call double @log1p(double %conv)
+}
+define float @log2_test(float %f) nounwind readnone {
+; CHECK: log2_test
+    %conv = fpext float %f to double
+    %call = call double @log2(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @log2f(float %f)
+}
+
+define double @log2_test2(float %f) nounwind readnone {
+; CHECK: log2_test2
+    %conv = fpext float %f to double
+    %call = call double @log2(double %conv)
+    ret double %call
+; CHECK: call double @log2(double %conv)
+}
+define float @logb_test(float %f) nounwind readnone {
+; CHECK: logb_test
+    %conv = fpext float %f to double
+    %call = call double @logb(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @logbf(float %f)
+}
+
+define double @logb_test2(float %f) nounwind readnone {
+; CHECK: logb_test2
+    %conv = fpext float %f to double
+    %call = call double @logb(double %conv)
+    ret double %call
+; CHECK: call double @logb(double %conv)
+}
+define float @sin_test(float %f) nounwind readnone {
+; CHECK: sin_test
+    %conv = fpext float %f to double
+    %call = call double @sin(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @sinf(float %f)
+}
+
+define double @sin_test2(float %f) nounwind readnone {
+; CHECK: sin_test2
+    %conv = fpext float %f to double
+    %call = call double @sin(double %conv)
+    ret double %call
+; CHECK: call double @sin(double %conv)
+}
+define float @sqrt_test(float %f) nounwind readnone {
+; CHECK: sqrt_test
+    %conv = fpext float %f to double
+    %call = call double @sqrt(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @sqrtf(float %f)
+}
+
+define double @sqrt_test2(float %f) nounwind readnone {
+; CHECK: sqrt_test2
+    %conv = fpext float %f to double
+    %call = call double @sqrt(double %conv)
+    ret double %call
+; CHECK: call double @sqrt(double %conv)
+}
+define float @tan_test(float %f) nounwind readnone {
+; CHECK: tan_test
+    %conv = fpext float %f to double
+    %call = call double @tan(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @tanf(float %f)
+}
+
+define double @tan_test2(float %f) nounwind readnone {
+; CHECK: tan_test2
+    %conv = fpext float %f to double
+    %call = call double @tan(double %conv)
+    ret double %call
+; CHECK: call double @tan(double %conv)
+}
+define float @tanh_test(float %f) nounwind readnone {
+; CHECK: tanh_test
+    %conv = fpext float %f to double
+    %call = call double @tanh(double %conv)
+    %conv1 = fptrunc double %call to float
+    ret float %conv1
+; CHECK: call float @tanhf(float %f)
+}
+
+define double @tanh_test2(float %f) nounwind readnone {
+; CHECK: tanh_test2
+    %conv = fpext float %f to double
+    %call = call double @tanh(double %conv)
+    ret double %call
+; CHECK: call double @tanh(double %conv)
+}
+
+declare double @tanh(double) nounwind readnone
+declare double @tan(double) nounwind readnone
+declare double @sqrt(double) nounwind readnone
+declare double @sin(double) nounwind readnone
+declare double @log2(double) nounwind readnone
+declare double @log1p(double) nounwind readnone
+declare double @log10(double) nounwind readnone
+declare double @log(double) nounwind readnone
+declare double @logb(double) nounwind readnone
+declare double @exp10(double) nounwind readnone
+declare double @expm1(double) nounwind readnone
+declare double @exp(double) nounwind readnone
+declare double @cbrt(double) nounwind readnone
+declare double @atanh(double) nounwind readnone
+declare double @atan(double) nounwind readnone
+declare double @acos(double) nounwind readnone
+declare double @acosh(double) nounwind readnone
+declare double @asin(double) nounwind readnone
+declare double @asinh(double) nounwind readnone
diff --git a/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
new file mode 100644
index 0000000000000..aecb887beb3a9
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
@@ -0,0 +1,179 @@
+; RUN: opt -S -simplify-libcalls -instcombine %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @test1(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = call double @ceil(double %1) nounwind readnone
+  %3 = fpext float %y to double
+  %4 = fcmp oeq double %2, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test1
+; CHECK-NEXT: %ceilf = call float @ceilf(float %x)
+; CHECK-NEXT: fcmp oeq float %ceilf, %y
+}
+
+define i32 @test2(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = call double @fabs(double %1) nounwind readnone
+  %3 = fpext float %y to double
+  %4 = fcmp oeq double %2, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test2
+; CHECK-NEXT: %fabsf = call float @fabsf(float %x)
+; CHECK-NEXT: fcmp oeq float %fabsf, %y
+}
+
+define i32 @test3(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = call double @floor(double %1) nounwind readnone
+  %3 = fpext float %y to double
+  %4 = fcmp oeq double %2, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test3
+; CHECK-NEXT: %floorf = call float @floorf(float %x)
+; CHECK-NEXT: fcmp oeq float %floorf, %y
+}
+
+define i32 @test4(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = call double @nearbyint(double %1) nounwind
+  %3 = fpext float %y to double
+  %4 = fcmp oeq double %2, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test4
+; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x)
+; CHECK-NEXT: fcmp oeq float %nearbyintf, %y
+}
+
+define i32 @test5(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = call double @rint(double %1) nounwind
+  %3 = fpext float %y to double
+  %4 = fcmp oeq double %2, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test5
+; CHECK-NEXT: %rintf = call float @rintf(float %x)
+; CHECK-NEXT: fcmp oeq float %rintf, %y
+}
+
+define i32 @test6(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = call double @round(double %1) nounwind readnone
+  %3 = fpext float %y to double
+  %4 = fcmp oeq double %2, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test6
+; CHECK-NEXT: %roundf = call float @roundf(float %x)
+; CHECK-NEXT: fcmp oeq float %roundf, %y
+}
+
+define i32 @test7(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = call double @trunc(double %1) nounwind
+  %3 = fpext float %y to double
+  %4 = fcmp oeq double %2, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test7
+; CHECK-NEXT: %truncf = call float @truncf(float %x)
+; CHECK-NEXT: fcmp oeq float %truncf, %y
+}
+
+define i32 @test8(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @ceil(double %2) nounwind readnone
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test8
+; CHECK-NEXT: %ceilf = call float @ceilf(float %x)
+; CHECK-NEXT: fcmp oeq float %ceilf, %y
+}
+
+define i32 @test9(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @fabs(double %2) nounwind readnone
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test9
+; CHECK-NEXT: %fabsf = call float @fabsf(float %x)
+; CHECK-NEXT: fcmp oeq float %fabsf, %y
+}
+
+define i32 @test10(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @floor(double %2) nounwind readnone
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test10
+; CHECK-NEXT: %floorf = call float @floorf(float %x)
+; CHECK-NEXT: fcmp oeq float %floorf, %y
+}
+
+define i32 @test11(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @nearbyint(double %2) nounwind
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test11
+; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x)
+; CHECK-NEXT: fcmp oeq float %nearbyintf, %y
+}
+
+define i32 @test12(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @rint(double %2) nounwind
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test12
+; CHECK-NEXT: %rintf = call float @rintf(float %x)
+; CHECK-NEXT: fcmp oeq float %rintf, %y
+}
+
+define i32 @test13(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @round(double %2) nounwind readnone
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test13
+; CHECK-NEXT: %roundf = call float @roundf(float %x)
+; CHECK-NEXT: fcmp oeq float %roundf, %y
+}
+
+define i32 @test14(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %y to double
+  %2 = fpext float %x to double
+  %3 = call double @trunc(double %2) nounwind
+  %4 = fcmp oeq double %1, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK: @test14
+; CHECK-NEXT: %truncf = call float @truncf(float %x)
+; CHECK-NEXT: fcmp oeq float %truncf, %y
+}
+
+declare double @fabs(double) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare double @nearbyint(double) nounwind readnone
+declare double @rint(double) nounwind readnone
+declare double @round(double) nounwind readnone
+declare double @trunc(double) nounwind readnone
diff --git a/test/Transforms/SimplifyLibCalls/floor.ll b/test/Transforms/SimplifyLibCalls/floor.ll
index 03dcdf585f9a7..93c62c20023db 100644
--- a/test/Transforms/SimplifyLibCalls/floor.ll
+++ b/test/Transforms/SimplifyLibCalls/floor.ll
@@ -9,6 +9,8 @@
 ; DO-SIMPLIFY: call float @ceilf(
 ; DO-SIMPLIFY: call float @roundf(
 ; DO-SIMPLIFY: call float @nearbyintf(
+; DO-SIMPLIFY: call float @truncf(
+; DO-SIMPLIFY: call float @fabsf(
 
 ; C89-SIMPLIFY: call float @floorf(
 ; C89-SIMPLIFY: call float @ceilf(
@@ -19,6 +21,8 @@
 ; DONT-SIMPLIFY: call double @ceil(
 ; DONT-SIMPLIFY: call double @round(
 ; DONT-SIMPLIFY: call double @nearbyint(
+; DONT-SIMPLIFY: call double @trunc(
+; DONT-SIMPLIFY: call double @fabs(
 
 declare double @floor(double)
 
@@ -28,6 +32,10 @@ declare double @round(double)
 
 declare double @nearbyint(double)
 
+declare double @trunc(double)
+
+declare double @fabs(double)
+
 define float @test_floor(float %C) {
 	%D = fpext float %C to double		; <double> [#uses=1]
         ; --> floorf
@@ -60,3 +68,18 @@ define float @test_nearbyint(float %C) {
 	ret float %F
 }
 
+define float @test_trunc(float %C) {
+	%D = fpext float %C to double
+	; --> truncf
+        %E = call double @trunc(double %D)
+	%F = fptrunc double %E to float
+	ret float %F
+}
+
+define float @test_fabs(float %C) {
+	%D = fpext float %C to double
+	; --> fabsf
+        %E = call double @fabs(double %D)
+	%F = fptrunc double %E to float
+	ret float %F
+}
diff --git a/test/Transforms/SimplifyLibCalls/memcmp.ll b/test/Transforms/SimplifyLibCalls/memcmp.ll
deleted file mode 100644
index 6ca4dc97a194e..0000000000000
--- a/test/Transforms/SimplifyLibCalls/memcmp.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; Test that the memcmpOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-@h = constant [2 x i8] c"h\00"		; <[2 x i8]*> [#uses=0]
-@hel = constant [4 x i8] c"hel\00"		; <[4 x i8]*> [#uses=0]
-@hello_u = constant [8 x i8] c"hello_u\00"		; <[8 x i8]*> [#uses=0]
-
-declare i32 @memcmp(i8*, i8*, i32)
-
-define void @test(i8* %P, i8* %Q, i32 %N, i32* %IP, i1* %BP) {
-	%A = call i32 @memcmp( i8* %P, i8* %P, i32 %N )		; <i32> [#uses=1]
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
-	store volatile i32 %A, i32* %IP
-	%B = call i32 @memcmp( i8* %P, i8* %Q, i32 0 )		; <i32> [#uses=1]
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
-	store volatile i32 %B, i32* %IP
-	%C = call i32 @memcmp( i8* %P, i8* %Q, i32 1 )		; <i32> [#uses=1]
-; CHECK: load
-; CHECK: zext
-; CHECK: load
-; CHECK: zext
-; CHECK: sub
-; CHECK: store volatile
-	store volatile i32 %C, i32* %IP
-  %F = call i32 @memcmp(i8* getelementptr ([4 x i8]* @hel, i32 0, i32 0),
-                        i8* getelementptr ([8 x i8]* @hello_u, i32 0, i32 0),
-                        i32 3)
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
-  store volatile i32 %F, i32* %IP
-	ret void
-}
-
diff --git a/test/Transforms/SimplifyLibCalls/memmove.ll b/test/Transforms/SimplifyLibCalls/memmove.ll
deleted file mode 100644
index 5aaeeeb024f7f..0000000000000
--- a/test/Transforms/SimplifyLibCalls/memmove.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memmove"
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-pc-linux-gnu"
-
-define i8* @test(i8* %a, i8* %b, i32 %x) {
-entry:
-	%call = call i8* @memmove(i8* %a, i8* %b, i32 %x )
-	ret i8* %call
-}
-
-declare i8* @memmove(i8*,i8*,i32)
-
diff --git a/test/Transforms/SimplifyLibCalls/memset-64.ll b/test/Transforms/SimplifyLibCalls/memset-64.ll
deleted file mode 100644
index 92412dee71ad6..0000000000000
--- a/test/Transforms/SimplifyLibCalls/memset-64.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset"
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-pc-linux-gnu"
-
-define void @a(i8* %x) nounwind {
-entry:
-	%call = call i8* @memset(i8* %x, i32 1, i64 100)		; <i8*> [#uses=0]
-	ret void
-}
-
-declare i8* @memset(i8*, i32, i64)
-
diff --git a/test/Transforms/SimplifyLibCalls/memset.ll b/test/Transforms/SimplifyLibCalls/memset.ll
deleted file mode 100644
index 853215a4d24cc..0000000000000
--- a/test/Transforms/SimplifyLibCalls/memset.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset"
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-pc-linux-gnu"
-
-define i8* @test(i8* %a, i32 %b, i32 %x) {
-entry:
-	%call = call i8* @memset(i8* %a, i32 %b, i32 %x )
-	ret i8* %call
-}
-
-declare i8* @memset(i8*,i32,i32)
-
diff --git a/test/Transforms/SimplifyLibCalls/weak-symbols.ll b/test/Transforms/SimplifyLibCalls/weak-symbols.ll
deleted file mode 100644
index 5875b211f7767..0000000000000
--- a/test/Transforms/SimplifyLibCalls/weak-symbols.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-; PR4738
-
-; SimplifyLibcalls shouldn't assume anything about weak symbols.
-
-@real_init = weak_odr constant [2 x i8] c"y\00"
-@fake_init = weak constant [2 x i8] c"y\00"
-@.str = private constant [2 x i8] c"y\00"
-
-; CHECK: define i32 @foo
-; CHECK: call i32 @strcmp
-define i32 @foo() nounwind {
-entry:
-  %t0 = call i32 @strcmp(i8* getelementptr inbounds ([2 x i8]* @fake_init, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0)) nounwind readonly
-  ret i32 %t0
-}
-
-; CHECK: define i32 @bar
-; CHECK: ret i32 0
-define i32 @bar() nounwind {
-entry:
-  %t0 = call i32 @strcmp(i8* getelementptr inbounds ([2 x i8]* @real_init, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0)) nounwind readonly
-  ret i32 %t0
-}
-
-declare i32 @strcmp(i8*, i8*) nounwind readonly
diff --git a/test/Verifier/invoke.ll b/test/Verifier/invoke.ll
index a48f9b60feb47..c2750bb121f28 100644
--- a/test/Verifier/invoke.ll
+++ b/test/Verifier/invoke.ll
@@ -19,7 +19,6 @@ L2:		; preds = %0
 	br label %L
 L:		; preds = %L2, %L1, %L1
 ; CHECK: The unwind destination does not have a landingpad instruction
-; CHECK: Instruction does not dominate all uses
 	ret i32 %A
 }
 
diff --git a/test/lit.cfg b/test/lit.cfg
index 6f44bb3d8c668..79eaa23c8ba97 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -5,6 +5,7 @@
 import os
 import sys
 import re
+import platform
 
 # name: The name of this test suite.
 config.name = 'LLVM'
@@ -139,9 +140,22 @@ if config.test_exec_root is None:
 
 ###
 
-# When running under valgrind, we mangle '-vg' or '-vg_leak' onto the end of the
-# triple so we can check it with XFAIL and XTARGET.
-config.target_triple += lit.valgrindTriple
+# Provide a target triple for mcjit tests
+mcjit_triple = config.target_triple
+# Force ELF format on Windows
+if re.search(r'cygwin|mingw32|win32', mcjit_triple):
+  mcjit_triple += "-elf"
+config.substitutions.append( ('%mcjit_triple', mcjit_triple) )
+
+# Provide a substitution for those tests that need to run the jit to obtain data
+# but simply want to use the jit currently considered most reliable for the platform
+# FIXME: ppc32 is not ready for mcjit.
+if 'arm' in config.target_triple \
+   or 'powerpc64' in config.target_triple:
+    defaultIsMCJIT = 'true'
+else:
+    defaultIsMCJIT = 'false'
+config.substitutions.append( ('%defaultjit', '-use-mcjit='+defaultIsMCJIT) )
 
 # Process jit implementation option
 jit_impl_cfg = lit.params.get('jit_impl', None)
@@ -230,6 +244,10 @@ else:
 if loadable_module:
     config.available_features.add('loadable_module')
 
+# LTO on OS X
+if config.lto_is_enabled == "1" and platform.system() == "Darwin":
+    config.available_features.add('lto_on_osx')
+
 # llc knows whether he is compiled with -DNDEBUG.
 import subprocess
 try:
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 178b22f10f336..2bbe63e6348ec 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -11,6 +11,7 @@ config.python_executable = "@PYTHON_EXECUTABLE@"
 config.ocamlopt_executable = "@OCAMLOPT@"
 config.enable_shared = @ENABLE_SHARED@
 config.enable_assertions = @ENABLE_ASSERTIONS@
+config.lto_is_enabled = "@LTO_IS_ENABLED@"
 config.targets_to_build = "@TARGETS_TO_BUILD@"
 config.llvm_bindings = "@LLVM_BINDINGS@"
 config.host_os = "@HOST_OS@"
author	Dimitry Andric <dim@FreeBSD.org>	2012-12-02 13:10:19 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2012-12-02 13:10:19 +0000
commit	522600a229b950314b5f4af84eba4f3e8a0ffea1 (patch)
tree	32b4679ab4b8f28e5228daafc65e9dc436935353 /test
parent	902a7b529820e6a0aa85f98f21afaeb1805a22f8 (diff)