29 files changed, 1391 insertions, 599 deletions
diff --git a/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll b/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
new file mode 100644
index 000000000000..e10b04a850af
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
@@ -0,0 +1,139 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s  -passes='function(callsite-splitting)' -S | FileCheck %s
+
+; CHECK-LABEL: @test_simple
+; CHECK-LABEL: Header:
+; CHECK-NEXT: br i1 undef, label %Tail.predBB1.split
+; CHECK-LABEL: TBB:
+; CHECK: br i1 %cmp, label %Tail.predBB2.split
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 %p)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_simple(i32* %a, i32 %v, i32 %p) {
+Header:
+  br i1 undef, label %Tail, label %End
+
+TBB:
+  %cmp = icmp eq i32* %a, null
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+; CHECK-LABEL: @test_eq_eq_eq_untaken2
+; CHECK-LABEL: Header:
+; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB1.split
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq_eq_untaken2(i32* %a, i32 %v, i32 %p) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %TBB1, label %Tail   ; false edge: Tail is reached with %a != null
+
+TBB1:
+  %cmp1 = icmp eq i32 %v, 1
+  br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+  %cmp2 = icmp eq i32 %p, 99
+  br i1 %cmp2, label %Tail, label %End       ; true edge: %a == null, %v == 1, %p == 99 on this path
+
+Tail:
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+; CHECK-LABEL: @test_eq_ne_eq_untaken
+; CHECK-LABEL: Header:
+; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB1.split
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_ne_eq_untaken(i32* %a, i32 %v, i32 %p) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %TBB1, label %Tail
+
+TBB1:
+  %cmp1 = icmp ne i32 %v, 1
+  br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+  %cmp2 = icmp eq i32 %p, 99
+  br i1 %cmp2, label %Tail, label %End
+
+Tail:
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+; CHECK-LABEL: @test_header_header2_tbb
+; CHECK: Header2:
+; CHECK:br i1 %tobool2, label %Tail.predBB1.split, label %TBB1
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10)
+; CHECK-LABEL: Tail.predBB2.split:
+; NOTE: CallSiteSplitting cannot infer that %a is null here, as it currently
+;       only supports recording conditions along a single predecessor path.
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_header_header2_tbb(i32* %a, i32 %v, i32 %p) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %TBB1, label %Header2
+
+Header2:
+  %tobool2 = icmp eq i32 %p, 10
+  br i1 %tobool2, label %Tail, label %TBB1
+
+TBB1:
+  %cmp1 = icmp eq i32 %v, 1
+  br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+  %cmp2 = icmp eq i32 %p, 99
+  br i1 %cmp2, label %Tail, label %End
+
+Tail:
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+define i32 @callee(i32* %a, i32 %v, i32 %p) {
+  ret i32 10
+}
diff --git a/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll
new file mode 100644
index 000000000000..ca41bd6fc5e1
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll
@@ -0,0 +1,18 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s  -passes='function(callsite-splitting)' -S | FileCheck %s
+
+define i32 @callee(i32*, i32, i32) {
+  ret i32 10
+}
+
+; CHECK-LABEL: @test_preds_equal
+; CHECK-NOT: split
+; CHECK: br i1 %cmp, label %Tail, label %Tail
+define i32 @test_preds_equal(i32* %a, i32 %v, i32 %p) {
+TBB:
+  %cmp = icmp eq i32* %a, null
+  br i1 %cmp, label %Tail, label %Tail
+Tail:
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+}
diff --git a/test/Transforms/CodeGenPrepare/section.ll b/test/Transforms/CodeGenPrepare/section.ll
index 4f3144e7fc73..30598ba7afbe 100644
--- a/test/Transforms/CodeGenPrepare/section.ll
+++ b/test/Transforms/CodeGenPrepare/section.ll
@@ -4,33 +4,59 @@ target triple = "x86_64-pc-linux-gnu"
 
 ; This tests that hot/cold functions get correct section prefix assigned
 
-; CHECK: hot_func{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
+; CHECK: hot_func1{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
 ; The entry is hot
-define void @hot_func() !prof !15 {
+define void @hot_func1() !prof !15 {
   ret void
 }
 
-; For instrumentation based PGO, we should only look at entry counts,
+; CHECK: hot_func2{{.*}}!section_prefix ![[HOT_ID]]
+; Entry is cold (!16: entry count 1) but inner block is hot (!19 weights for.body 1000:1 over for.end)
+define void @hot_func2(i32 %n) !prof !16 {
+entry:
+  %n.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %n, i32* %n.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %n.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end, !prof !19
+
+for.body:
+  %2 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
+; For instrumentation based PGO, we should only look at block counts,
 ; not call site VP metadata (which can exist on value profiled memcpy,
 ; or possibly left behind after static analysis based devirtualization).
 ; CHECK: cold_func1{{.*}}!section_prefix ![[COLD_ID:[0-9]+]]
 define void @cold_func1() !prof !16 {
-  call void @hot_func(), !prof !17
-  call void @hot_func(), !prof !17
+  call void @hot_func1(), !prof !17
+  call void @hot_func1(), !prof !17
   ret void
 }
 
-; CHECK: cold_func2{{.*}}!section_prefix
+; CHECK: cold_func2{{.*}}!section_prefix ![[COLD_ID]]
 define void @cold_func2() !prof !16 {
-  call void @hot_func(), !prof !17
-  call void @hot_func(), !prof !18
-  call void @hot_func(), !prof !18
+  call void @hot_func1(), !prof !17
+  call void @hot_func1(), !prof !18
+  call void @hot_func1(), !prof !18
   ret void
 }
 
 ; CHECK: cold_func3{{.*}}!section_prefix ![[COLD_ID]]
 define void @cold_func3() !prof !16 {
-  call void @hot_func(), !prof !18
+  call void @hot_func1(), !prof !18
   ret void
 }
 
@@ -55,3 +81,4 @@ define void @cold_func3() !prof !16 {
 !16 = !{!"function_entry_count", i64 1}
 !17 = !{!"branch_weights", i32 80}
 !18 = !{!"branch_weights", i32 1}
+!19 = !{!"branch_weights", i32 1000, i32 1}
diff --git a/test/Transforms/GVN/tbaa.ll b/test/Transforms/GVN/tbaa.ll
index 7c05fda6cb8f..5cb4e0359970 100644
--- a/test/Transforms/GVN/tbaa.ll
+++ b/test/Transforms/GVN/tbaa.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -tbaa -basicaa -gvn -S < %s | FileCheck %s
 
 define i32 @test1(i8* %p, i8* %q) {
-; CHECK: @test1(i8* %p, i8* %q)
+; CHECK-LABEL: @test1(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p)
 ; CHECK-NOT: tbaa
 ; CHECK: %c = add i32 %a, %a
@@ -12,7 +12,7 @@ define i32 @test1(i8* %p, i8* %q) {
 }
 
 define i32 @test2(i8* %p, i8* %q) {
-; CHECK: @test2(i8* %p, i8* %q)
+; CHECK-LABEL: @test2(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
@@ -22,7 +22,7 @@ define i32 @test2(i8* %p, i8* %q) {
 }
 
 define i32 @test3(i8* %p, i8* %q) {
-; CHECK: @test3(i8* %p, i8* %q)
+; CHECK-LABEL: @test3(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !3
@@ -32,7 +32,7 @@ define i32 @test3(i8* %p, i8* %q) {
 }
 
 define i32 @test4(i8* %p, i8* %q) {
-; CHECK: @test4(i8* %p, i8* %q)
+; CHECK-LABEL: @test4(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !1
@@ -42,8 +42,8 @@ define i32 @test4(i8* %p, i8* %q) {
 }
 
 define i32 @test5(i8* %p, i8* %q) {
-; CHECK: @test5(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test5(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
   %b = call i32 @foo(i8* %p), !tbaa !1
@@ -52,8 +52,8 @@ define i32 @test5(i8* %p, i8* %q) {
 }
 
 define i32 @test6(i8* %p, i8* %q) {
-; CHECK: @test6(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test6(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
   %b = call i32 @foo(i8* %p), !tbaa !3
@@ -62,7 +62,7 @@ define i32 @test6(i8* %p, i8* %q) {
 }
 
 define i32 @test7(i8* %p, i8* %q) {
-; CHECK: @test7(i8* %p, i8* %q)
+; CHECK-LABEL: @test7(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p)
 ; CHECK-NOT: tbaa
 ; CHECK: %c = add i32 %a, %a
@@ -72,10 +72,8 @@ define i32 @test7(i8* %p, i8* %q) {
   ret i32 %c
 }
 
-
-
 define i32 @test8(i32* %p, i32* %q) {
-; CHECK-LABEL: test8
+; CHECK-LABEL: @test8
 ; CHECK-NEXT: store i32 15, i32* %p
 ; CHECK-NEXT: ret i32 0
 ; Since we know the location is invariant, we can forward the
@@ -87,8 +85,9 @@ define i32 @test8(i32* %p, i32* %q) {
   %c = sub i32 %a, %b
   ret i32 %c
 }
+
 define i32 @test9(i32* %p, i32* %q) {
-; CHECK-LABEL: test9
+; CHECK-LABEL: @test9
 ; CHECK-NEXT: call void @clobber()
 ; CHECK-NEXT: ret i32 0
 ; Since we know the location is invariant, we can forward the
@@ -101,16 +100,27 @@ define i32 @test9(i32* %p, i32* %q) {
   ret i32 %c
 }
 
+define i32 @test10(i8* %p, i8* %q) {
+; If one access encloses the other, then the merged access is the enclosed one
+; and not just the common final access type.
+; CHECK-LABEL: @test10
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAG_X_i:!.*]]
+; CHECK: %c = add i32 %a, %a
+  %a = call i32 @foo(i8* %p), !tbaa !15  ; TAG_X_i
+  %b = call i32 @foo(i8* %p), !tbaa !19  ; TAG_Y_x_i
+  %c = add i32 %a, %b
+  ret i32 %c
+}
 
 declare void @clobber()
 declare i32 @foo(i8*) readonly
 
-; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
+; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
+; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
+; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
+; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
+; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
 !0 = !{!5, !5, i64 0}
 !1 = !{!6, !6, i64 0}
 !2 = !{!"tbaa root"}
@@ -122,8 +132,17 @@ declare i32 @foo(i8*) readonly
 !8 = !{!"another root"}
 !11 = !{!"scalar type", !8}
 
+; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
+; CHECK-DAG: [[TYPE_X]] = !{!"struct X", [[TYPE_int]], i64 0}
+; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
+!15 = !{!16, !17, i64 0}            ; TAG_X_i
+!16 = !{!"struct X", !17, i64 0}    ; struct X { int i; };
+!17 = !{!"int", !18, i64 0}         ; "int" node, parent !18
+!18 = !{!"char", !2, i64 0}         ; "char" node, parent !2 ("tbaa root")
 
-;; A TBAA structure who's only point is to have a constant location
+!19 = !{!20, !17, i64 0}            ; TAG_Y_x_i
+!20 = !{!"struct Y", !16, i64 0}    ; struct Y { struct X x; };
+
+; A TBAA structure whose only point is to have a constant location.
 !9 = !{!"yet another root"}
 !10 = !{!"node", !9, i64 1}
-
diff --git a/test/Transforms/Inline/AArch64/binop.ll b/test/Transforms/Inline/AArch64/binop.ll
new file mode 100644
index 000000000000..051528991e46
--- /dev/null
+++ b/test/Transforms/Inline/AArch64/binop.ll
@@ -0,0 +1,291 @@
+; RUN: opt -inline -mtriple=aarch64--linux-gnu -S -o - < %s -inline-threshold=0 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+declare void @pad()
+@glbl = external global i32
+
+define i32 @outer_add1(i32 %a) {
+; CHECK-LABEL: @outer_add1(
+; CHECK-NOT: call i32 @add
+  %C = call i32 @add(i32 %a, i32 0)
+  ret i32 %C
+}
+
+define i32 @outer_add2(i32 %a) {
+; CHECK-LABEL: @outer_add2(
+; CHECK-NOT: call i32 @add
+  %C = call i32 @add(i32 0, i32 %a)
+  ret i32 %C
+}
+
+define i32 @add(i32 %a, i32 %b) {
+  %add = add i32 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i32 %add
+}
+
+
+
+define i32 @outer_sub1(i32 %a) {
+; CHECK-LABEL: @outer_sub1(
+; CHECK-NOT: call i32 @sub1
+  %C = call i32 @sub1(i32 %a, i32 0)
+  ret i32 %C
+}
+
+define i32 @sub1(i32 %a, i32 %b) {
+  %sub = sub i32 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i32 %sub
+}
+
+
+define i32 @outer_sub2(i32 %a) {
+; CHECK-LABEL: @outer_sub2(
+; CHECK-NOT: call i32 @sub2
+  %C = call i32 @sub2(i32 %a)
+  ret i32 %C
+}
+
+define i32 @sub2(i32 %a) {
+  %sub = sub i32 %a, %a
+  call void @pad()
+  ret i32 %sub
+}
+
+
+
+define i32 @outer_mul1(i32 %a) {
+; CHECK-LABEL: @outer_mul1(
+; CHECK-NOT: call i32 @mul
+  %C = call i32 @mul(i32 %a, i32 0)
+  ret i32 %C
+}
+
+define i32 @outer_mul2(i32 %a) {
+; CHECK-LABEL: @outer_mul2(
+; CHECK-NOT: call i32 @mul
+  %C = call i32 @mul(i32 %a, i32 1)
+  ret i32 %C
+}
+
+define i32 @mul(i32 %a, i32 %b) {
+  %mul = mul i32 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i32 %mul
+}
+
+
+
+define i32 @outer_div1(i32 %a) {
+; CHECK-LABEL: @outer_div1(
+; CHECK-NOT: call i32 @div1
+  %C = call i32 @div1(i32 0, i32 %a)
+  ret i32 %C
+}
+
+define i32 @outer_div2(i32 %a) {
+; CHECK-LABEL: @outer_div2(
+; CHECK-NOT: call i32 @div1
+  %C = call i32 @div1(i32 %a, i32 1)
+  ret i32 %C
+}
+
+define i32 @div1(i32 %a, i32 %b) {
+  %div = sdiv i32 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i32 %div
+}
+
+
+define i32 @outer_div3(i32 %a) {
+; CHECK-LABEL: @outer_div3(
+; CHECK-NOT: call i32 @div
+  %C = call i32 @div2(i32 %a)
+  ret i32 %C
+}
+
+define i32 @div2(i32 %a) {
+  %div = sdiv i32 %a, %a
+  call void @pad()
+  ret i32 %div
+}
+
+
+
+define i32 @outer_rem1(i32 %a) {
+; CHECK-LABEL: @outer_rem1(
+; CHECK-NOT: call i32 @rem
+  %C = call i32 @rem1(i32 0, i32 %a)
+  ret i32 %C
+}
+
+define i32 @outer_rem2(i32 %a) {
+; CHECK-LABEL: @outer_rem2(
+; CHECK-NOT: call i32 @rem
+  %C = call i32 @rem1(i32 %a, i32 1)
+  ret i32 %C
+}
+
+define i32 @rem1(i32 %a, i32 %b) {
+  %rem = urem i32 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i32 %rem
+}
+
+
+define i32 @outer_rem3(i32 %a) {
+; CHECK-LABEL: @outer_rem3(
+; CHECK-NOT: call i32 @rem
+  %C = call i32 @rem2(i32 %a)
+  ret i32 %C
+}
+
+define i32 @rem2(i32 %a) {
+  %rem = urem i32 %a, %a
+  call void @pad()
+  ret i32 %rem
+}
+
+
+
+define i32 @outer_shl1(i32 %a) {
+; CHECK-LABEL: @outer_shl1(
+; CHECK-NOT: call i32 @shl
+  %C = call i32 @shl(i32 %a, i32 0)
+  ret i32 %C
+}
+
+define i32 @shl(i32 %a, i32 %b) {
+  %shl = shl i32 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i32 %shl
+}
+
+
+
+define i32 @outer_shr1(i32 %a) {
+; CHECK-LABEL: @outer_shr1(
+; CHECK-NOT: call i32 @shr
+  %C = call i32 @shr(i32 %a, i32 0)
+  ret i32 %C
+}
+
+define i32 @shr(i32 %a, i32 %b) {
+  %shr = ashr i32 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i32 %shr
+}
+
+
+
+define i1 @outer_and1(i1 %a) {
+; CHECK-LABEL: @outer_and1(
+; CHECK-NOT: call i1 @and1
+  %c = call i1 @and1(i1 %a, i1 false) ; second operand is the constant false
+  ret i1 %c
+}
+
+define i1 @outer_and2(i1 %a) {
+; CHECK-LABEL: @outer_and2(
+; CHECK-NOT: call i1 @and1
+  %c = call i1 @and1(i1 %a, i1 true) ; second operand is the constant true
+  ret i1 %c
+}
+
+define i1 @and1(i1 %a, i1 %b) {
+  %and = and i1 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i1 %and
+}
+
+
+define i1 @outer_and3(i1 %a) {
+; CHECK-LABEL: @outer_and3(
+; CHECK-NOT: call i1 @and2
+  %c = call i1 @and2(i1 %a) ; @and2 computes 'and i1 %a, %a'
+  ret i1 %c
+}
+
+define i1 @and2(i1 %a) {
+  %and = and i1 %a, %a
+  call void @pad()
+  ret i1 %and
+}
+
+
+
+define i1 @outer_or1(i1 %a) {
+; CHECK-LABEL: @outer_or1(
+; CHECK-NOT: call i1 @or1
+  %c = call i1 @or1(i1 %a, i1 false) ; second operand is the constant false
+  ret i1 %c
+}
+
+define i1 @outer_or2(i1 %a) {
+; CHECK-LABEL: @outer_or2(
+; CHECK-NOT: call i1 @or1
+  %c = call i1 @or1(i1 %a, i1 true) ; second operand is the constant true
+  ret i1 %c
+}
+
+define i1 @or1(i1 %a, i1 %b) {
+  %or = or i1 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i1 %or
+}
+
+
+define i1 @outer_or3(i1 %a) {
+; CHECK-LABEL: @outer_or3(
+; CHECK-NOT: call i1 @or2
+  %c = call i1 @or2(i1 %a) ; @or2 computes 'or i1 %a, %a'
+  ret i1 %c
+}
+
+define i1 @or2(i1 %a) {
+  %or = or i1 %a, %a
+  call void @pad()
+  ret i1 %or
+}
+
+
+
+define i1 @outer_xor1(i1 %a) {
+; CHECK-LABEL: @outer_xor1(
+; CHECK-NOT: call i1 @xor
+  %c = call i1 @xor1(i1 %a, i1 false) ; second operand is the constant false
+  ret i1 %c
+}
+
+define i1 @xor1(i1 %a, i1 %b) {
+  %xor = xor i1 %a, %b
+  call void @pad()
+  store i32 0, i32* @glbl
+  ret i1 %xor
+}
+
+
+define i1 @outer_xor3(i1 %a) {
+; CHECK-LABEL: @outer_xor3(
+; CHECK-NOT: call i1 @xor
+  %c = call i1 @xor2(i1 %a) ; @xor2 computes 'xor i1 %a, %a'
+  ret i1 %c
+}
+
+define i1 @xor2(i1 %a) {
+  %xor = xor i1 %a, %a
+  call void @pad()
+  ret i1 %xor
+}
diff --git a/test/Transforms/Inline/ARM/inline-fp.ll b/test/Transforms/Inline/ARM/inline-fp.ll
new file mode 100644
index 000000000000..b4e76dfc7d2d
--- /dev/null
+++ b/test/Transforms/Inline/ARM/inline-fp.ll
@@ -0,0 +1,113 @@
+; RUN: opt -S -inline -mtriple=arm-eabi -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=NOFP
+; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2 -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=FULLFP
+; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2,+fp-only-sp -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=SINGLEFP
+; Make sure that soft float implementations are calculated as being more expensive
+; to the inliner.
+
+; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+; FULLFP-DAG: single inlined into test_single with cost=0 (threshold=75)
+; FULLFP-DAG: single inlined into test_single with cost=-15000 (threshold=75)
+; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; FULLFP-DAG: double inlined into test_double with cost=0 (threshold=75)
+; FULLFP-DAG: double inlined into test_double with cost=-15000 (threshold=75)
+; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+; SINGLEFP-DAG: single inlined into test_single with cost=0 (threshold=75)
+; SINGLEFP-DAG: single inlined into test_single with cost=-15000 (threshold=75)
+; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+define i32 @test_single(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+  %call = call float @single(i32 %a, i8 zeroext %b)
+  %call2 = call float @single(i32 %c, i8 zeroext %d)
+  ret i32 0
+}
+
+define i32 @test_single_cheap(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+  %call = call float @single_cheap(i32 %a, i8 zeroext %b)
+  %call2 = call float @single_cheap(i32 %c, i8 zeroext %d)
+  ret i32 0
+}
+
+define i32 @test_double(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+  %call = call double @double(i32 %a, i8 zeroext %b)
+  %call2 = call double @double(i32 %c, i8 zeroext %d)
+  ret i32 0
+}
+
+define i32 @test_single_force_soft(i32 %a, i8 %b, i32 %c, i8 %d) #1 {
+  %call = call float @single_force_soft(i32 %a, i8 zeroext %b) #1
+  %call2 = call float @single_force_soft(i32 %c, i8 zeroext %d) #1
+  ret i32 0
+}
+
+define internal float @single(i32 %response, i8 zeroext %value1) #0 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to float
+  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+  %mul = fmul float %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to float
+  %sub3 = fsub float %conv2, %mul
+  %div = fdiv float %sub3, %mul
+  ret float %div
+}
+
+define internal float @single_cheap(i32 %response, i8 zeroext %value1) #0 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = bitcast i32 %sub to float
+  %conv2 = bitcast i32 %response to float
+  %0 = tail call float @llvm.pow.f32(float %conv2, float %conv1)
+  %1 = tail call float @llvm.pow.f32(float %0, float %0)
+  %2 = tail call float @llvm.pow.f32(float %1, float %1)
+  ret float %2
+}
+
+define internal double @double(i32 %response, i8 zeroext %value1) #0 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to double
+  %0 = tail call double @llvm.pow.f64(double 0x3FF028F5C0000000, double %conv1)
+  %mul = fmul double %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to double
+  %sub3 = fsub double %conv2, %mul
+  %div = fdiv double %sub3, %mul
+  ret double %div
+}
+
+define internal float @single_force_soft(i32 %response, i8 zeroext %value1) #1 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to float
+  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+  %mul = fmul float %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to float
+  %sub3 = fsub float %conv2, %mul
+  %div = fdiv float %sub3, %mul
+  ret float %div
+}
+
+declare float @llvm.pow.f32(float, float) optsize minsize
+declare double @llvm.pow.f64(double, double) optsize minsize
+
+attributes #0 = { optsize }
+attributes #1 = { optsize "use-soft-float"="true" "target-features"="+soft-float" }
diff --git a/test/Transforms/Inline/inline-fp.ll b/test/Transforms/Inline/inline-fp.ll
deleted file mode 100644
index dd5972fe1b8a..000000000000
--- a/test/Transforms/Inline/inline-fp.ll
+++ /dev/null
@@ -1,137 +0,0 @@
-; RUN: opt -S -inline < %s | FileCheck %s
-; RUN: opt -S -passes='cgscc(inline)' < %s | FileCheck %s
-; Make sure that soft float implementations are calculated as being more expensive
-; to the inliner.
-
-define i32 @test_nofp() #0 {
-; f_nofp() has the "use-soft-float" attribute, so it should never get inlined.
-; CHECK-LABEL: test_nofp
-; CHECK: call float @f_nofp 
-entry:
-  %responseX = alloca i32, align 4
-  %responseY = alloca i32, align 4
-  %responseZ = alloca i32, align 4
-  %valueX = alloca i8, align 1
-  %valueY = alloca i8, align 1
-  %valueZ = alloca i8, align 1
-
-  call void @getX(i32* %responseX, i8* %valueX)
-  call void @getY(i32* %responseY, i8* %valueY)
-  call void @getZ(i32* %responseZ, i8* %valueZ)
-
-  %0 = load i32, i32* %responseX
-  %1 = load i8, i8* %valueX
-  %call = call float @f_nofp(i32 %0, i8 zeroext %1)
-  %2 = load i32, i32* %responseZ
-  %3 = load i8, i8* %valueZ
-  %call2 = call float @f_nofp(i32 %2, i8 zeroext %3)
-  %call3 = call float @fabsf(float %call)
-  %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
-  br i1 %cmp, label %if.end12, label %if.else
-
-if.else:                                          ; preds = %entry
-  %4 = load i32, i32* %responseY
-  %5 = load i8, i8* %valueY
-  %call1 = call float @f_nofp(i32 %4, i8 zeroext %5)
-  %call4 = call float @fabsf(float %call1)
-  %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
-  br i1 %cmp5, label %if.end12, label %if.else7
-
-if.else7:                                         ; preds = %if.else
-  %call8 = call float @fabsf(float %call2)
-  %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
-  br i1 %cmp9, label %if.then10, label %if.end12
-
-if.then10:                                        ; preds = %if.else7
-  br label %if.end12
-
-if.end12:                                         ; preds = %if.else, %entry, %if.then10, %if.else7
-  %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
-  ret i32 %success.0
-}
-
-define i32 @test_hasfp() #0 {
-; f_hasfp()  does not have the "use-soft-float" attribute, so it should get inlined.
-; CHECK-LABEL: test_hasfp
-; CHECK-NOT: call float @f_hasfp 
-entry:
-  %responseX = alloca i32, align 4
-  %responseY = alloca i32, align 4
-  %responseZ = alloca i32, align 4
-  %valueX = alloca i8, align 1
-  %valueY = alloca i8, align 1
-  %valueZ = alloca i8, align 1
-
-  call void @getX(i32* %responseX, i8* %valueX)
-  call void @getY(i32* %responseY, i8* %valueY)
-  call void @getZ(i32* %responseZ, i8* %valueZ)
-
-  %0 = load i32, i32* %responseX
-  %1 = load i8, i8* %valueX
-  %call = call float @f_hasfp(i32 %0, i8 zeroext %1)
-  %2 = load i32, i32* %responseZ
-  %3 = load i8, i8* %valueZ
-  %call2 = call float @f_hasfp(i32 %2, i8 zeroext %3)
-  %call3 = call float @fabsf(float %call)
-  %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
-  br i1 %cmp, label %if.end12, label %if.else
-
-if.else:                                          ; preds = %entry
-  %4 = load i32, i32* %responseY
-  %5 = load i8, i8* %valueY
-  %call1 = call float @f_hasfp(i32 %4, i8 zeroext %5)
-  %call4 = call float @fabsf(float %call1)
-  %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
-  br i1 %cmp5, label %if.end12, label %if.else7
-
-if.else7:                                         ; preds = %if.else
-  %call8 = call float @fabsf(float %call2)
-  %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
-  br i1 %cmp9, label %if.then10, label %if.end12
-
-if.then10:                                        ; preds = %if.else7
-  br label %if.end12
-
-if.end12:                                         ; preds = %if.else, %entry, %if.then10, %if.else7
-  %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
-  ret i32 %success.0
-}
-
-declare void @getX(i32*, i8*) #0
-
-declare void @getY(i32*, i8*) #0
-
-declare void @getZ(i32*, i8*) #0
-
-define internal float @f_hasfp(i32 %response, i8 zeroext %value1) #0 {
-entry:
-  %conv = zext i8 %value1 to i32
-  %sub = add nsw i32 %conv, -1
-  %conv1 = sitofp i32 %sub to float
-  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
-  %mul = fmul float %0, 2.620000e+03
-  %conv2 = sitofp i32 %response to float
-  %sub3 = fsub float %conv2, %mul
-  %div = fdiv float %sub3, %mul
-  ret float %div
-}
-
-define internal float @f_nofp(i32 %response, i8 zeroext %value1) #1 {
-entry:
-  %conv = zext i8 %value1 to i32
-  %sub = add nsw i32 %conv, -1
-  %conv1 = sitofp i32 %sub to float
-  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
-  %mul = fmul float %0, 2.620000e+03
-  %conv2 = sitofp i32 %response to float
-  %sub3 = fsub float %conv2, %mul
-  %div = fdiv float %sub3, %mul
-  ret float %div
-}
-
-declare float @fabsf(float) optsize minsize
-
-declare float @llvm.pow.f32(float, float) optsize minsize
-
-attributes #0 = { optsize }
-attributes #1 = { optsize "use-soft-float"="true" }
diff --git a/test/Transforms/Inline/redundant-loads.ll b/test/Transforms/Inline/redundant-loads.ll
index 6b89f1db484b..176f605fc73b 100644
--- a/test/Transforms/Inline/redundant-loads.ll
+++ b/test/Transforms/Inline/redundant-loads.ll
@@ -184,3 +184,21 @@ define void @inner9(i32* %a, void ()* %f) {
   call void @pad()
   ret void
 }
+
+
+define void @outer10(i32* %a) {
+; CHECK-LABEL: @outer10(
+; CHECK: call void @inner10
+  %b = alloca i32
+  call void @inner10(i32* %a, i32* %b)
+  ret void
+}
+
+define void @inner10(i32* %a, i32* %b) {
+  %1 = load i32, i32* %a
+  store i32 %1, i32 * %b
+  %2 = load volatile i32, i32* %a ; volatile load should be kept.
+  call void @pad()
+  %3 = load volatile i32, i32* %a ; Same as the above.
+  ret void
+}
diff --git a/test/Transforms/InstCombine/2011-09-03-Trampoline.ll b/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
index 1833558cbceb..7a315094a04e 100644
--- a/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
+++ b/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
@@ -5,18 +5,18 @@ declare i8* @llvm.adjust.trampoline(i8*)
 declare i32 @f(i8 * nest, i32)
 
 ; Most common case
-define i32 @test0(i32 %n) {
+define i32 @test0(i32 %n) !dbg !4 {
   %alloca = alloca [10 x i8], align 16
   %gep = getelementptr [10 x i8], [10 x i8]* %alloca, i32 0, i32 0
   call void @llvm.init.trampoline(i8* %gep, i8* bitcast (i32 (i8*, i32)* @f to i8*),
                                   i8* null)
   %tramp = call i8* @llvm.adjust.trampoline(i8* %gep)
   %function = bitcast i8* %tramp to i32(i32)*
-  %ret = call i32 %function(i32 %n)
+  %ret = call i32 %function(i32 %n), !dbg !10
   ret i32 %ret
 
-; CHECK: define i32 @test0(i32 %n) {
-; CHECK: %ret = call i32 @f(i8* nest null, i32 %n)
+; CHECK: define i32 @test0(i32 %n) !dbg !4 {
+; CHECK: %ret = call i32 @f(i8* nest null, i32 %n), !dbg !10
 }
 
 define i32 @test1(i32 %n, i8* %trampmem) {
@@ -85,3 +85,18 @@ define i32 @test4(i32 %n) {
 ; CHECK: %ret1 = call i32 @f(i8* nest null, i32 %n)
 ; CHECK: %ret2 = call i32 @f(i8* nest null, i32 %n)
 }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.0 (trunk 127710)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
+!1 = !DIFile(filename: "string.h", directory: "Game")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "passthru", scope: !1, file: !1, line: 79, type: !5, isLocal: true, isDefinition: true, scopeLine: 79, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: null, size: 64, align: 64)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 78, type: !7)
+!10 = !DILocation(line: 78, column: 28, scope: !4)
diff --git a/test/Transforms/JumpThreading/guards.ll b/test/Transforms/JumpThreading/guards.ll
index 53175a7b7253..c760283f9e52 100644
--- a/test/Transforms/JumpThreading/guards.ll
+++ b/test/Transforms/JumpThreading/guards.ll
@@ -278,3 +278,106 @@ L2:
 L3:
   ret void
 }
+
+; Make sure that we don't PRE a non-speculable load across a guard.
+define void @unsafe_pre_across_guard(i8* %p, i1 %load.is.valid) {
+
+; CHECK-LABEL: @unsafe_pre_across_guard(
+; CHECK-NOT:   loaded.pr
+; CHECK:       entry:
+; CHECK-NEXT:    br label %loop
+; CHECK:       loop:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+; CHECK-NEXT:    %loaded = load i8, i8* %p
+; CHECK-NEXT:    %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT:    br i1 %continue, label %exit, label %loop
+entry:
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+  %loaded = load i8, i8* %p
+  %continue = icmp eq i8 %loaded, 0
+  br i1 %continue, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret void
+}
+
+; Make sure that we can safely PRE a speculable load across a guard.
+define void @safe_pre_across_guard(i8* noalias nocapture readonly dereferenceable(8) %p, i1 %load.is.valid) {
+
+; CHECK-LABEL: @safe_pre_across_guard(
+; CHECK:       entry:
+; CHECK-NEXT:    %loaded.pr = load i8, i8* %p
+; CHECK-NEXT:    br label %loop
+; CHECK:       loop:
+; CHECK-NEXT:    %loaded = phi i8 [ %loaded, %loop ], [ %loaded.pr, %entry ]
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+; CHECK-NEXT:    %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT:    br i1 %continue, label %exit, label %loop
+
+entry:
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+  %loaded = load i8, i8* %p
+  %continue = icmp eq i8 %loaded, 0
+  br i1 %continue, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret void
+}
+
+; Make sure that we don't PRE a non-speculable load across a call which may
+; alias with the load.
+define void @unsafe_pre_across_call(i8* %p) {
+
+; CHECK-LABEL: @unsafe_pre_across_call(
+; CHECK-NOT:   loaded.pr
+; CHECK:       entry:
+; CHECK-NEXT:    br label %loop
+; CHECK:       loop:
+; CHECK-NEXT:    call i32 @f1()
+; CHECK-NEXT:    %loaded = load i8, i8* %p
+; CHECK-NEXT:    %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT:    br i1 %continue, label %exit, label %loop
+entry:
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  call i32 @f1()
+  %loaded = load i8, i8* %p
+  %continue = icmp eq i8 %loaded, 0
+  br i1 %continue, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret void
+}
+
+; Make sure that we can safely PRE a speculable load across a call.
+define void @safe_pre_across_call(i8* noalias nocapture readonly dereferenceable(8) %p) {
+
+; CHECK-LABEL: @safe_pre_across_call(
+; CHECK:       entry:
+; CHECK-NEXT:    %loaded.pr = load i8, i8* %p
+; CHECK-NEXT:    br label %loop
+; CHECK:       loop:
+; CHECK-NEXT:    %loaded = phi i8 [ %loaded, %loop ], [ %loaded.pr, %entry ]
+; CHECK-NEXT:    call i32 @f1()
+; CHECK-NEXT:    %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT:    br i1 %continue, label %exit, label %loop
+
+entry:
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  call i32 @f1()
+  %loaded = load i8, i8* %p
+  %continue = icmp eq i8 %loaded, 0
+  br i1 %continue, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/legal_preheader_check.ll b/test/Transforms/LoopVectorize/legal_preheader_check.ll
new file mode 100644
index 000000000000..32aa796394d6
--- /dev/null
+++ b/test/Transforms/LoopVectorize/legal_preheader_check.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -debug -S -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; D40973
+; Make sure the LoopVectorize legality check bails out when the loop doesn't have a legal pre-header.
+
+; CHECK: LV: Loop doesn't have a legal pre-header.
+
+define void @inc(i32 %n, i8* %P) {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %BB1, label %BB2
+
+BB1:
+  indirectbr i8* %P, [label %.lr.ph]
+
+BB2:
+  br label %.lr.ph
+
+.lr.ph:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %BB1 ], [ 0, %BB2 ]
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:
+  ret void
+}
diff --git a/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll b/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
new file mode 100644
index 000000000000..e3d1f6dd2b17
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Test memcpy-memcpy dependencies across invoke edges.
+
+; Test that memcpyopt works across the non-unwind edge of an invoke.
+
+define hidden void @test_normal(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %temp = alloca i8, i32 64
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+  invoke void @invoke_me()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  landingpad { i8*, i32 }
+          catch i8* null
+  ret void
+
+try.cont:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+  ret void
+}
+
+; Test that memcpyopt works across the unwind edge of an invoke.
+
+define hidden void @test_unwind(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %temp = alloca i8, i32 64
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+  invoke void @invoke_me()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  landingpad { i8*, i32 }
+          catch i8* null
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+  ret void
+
+try.cont:
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare i32 @__gxx_personality_v0(...)
+declare void @invoke_me() readnone
diff --git a/test/Transforms/MemCpyOpt/merge-into-memset.ll b/test/Transforms/MemCpyOpt/merge-into-memset.ll
new file mode 100644
index 000000000000..fc31038a4e6d
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/merge-into-memset.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Update cached non-local dependence information when merging stores into memset.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Don't delete the memcpy in %if.then, even though it depends on an instruction
+; which will be deleted.
+
+; CHECK-LABEL: @foo
+define void @foo(i1 %c, i8* %d, i8* %e, i8* %f) {
+entry:
+  %tmp = alloca [50 x i8], align 8
+  %tmp4 = bitcast [50 x i8]* %tmp to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp4, i64 1
+  call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5
+  store i8 0, i8* %tmp4, align 8, !dbg !5
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %tmp1, i8* nonnull %d, i64 10, i32 1, i1 false)
+  br i1 %c, label %if.then, label %exit
+
+if.then:
+; CHECK: if.then:
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false)
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "t.rs", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !DILocation(line: 8, column: 5, scope: !6)
+!6 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !7, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null}
diff --git a/test/Transforms/MemCpyOpt/mixed-sizes.ll b/test/Transforms/MemCpyOpt/mixed-sizes.ll
new file mode 100644
index 000000000000..9091fe7f56c0
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/mixed-sizes.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Handle memcpy-memcpy dependencies of differing sizes correctly.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Don't delete the second memcpy, even though there's an earlier
+; memcpy with a larger size from the same address.
+
+; CHECK-LABEL: @foo
+define i32 @foo(i1 %z) {
+entry:
+  %a = alloca [10 x i32]
+  %s = alloca [10 x i32]
+  %0 = bitcast [10 x i32]* %a to i8*
+  %1 = bitcast [10 x i32]* %s to i8*
+  call void @llvm.memset.p0i8.i64(i8* nonnull %1, i8 0, i64 40, i32 16, i1 false)
+  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 0
+  store i32 1, i32* %arrayidx
+  %scevgep = getelementptr [10 x i32], [10 x i32]* %s, i64 0, i64 1
+  %scevgep7 = bitcast i32* %scevgep to i8*
+  br i1 %z, label %for.body3.lr.ph, label %for.inc7.1
+
+for.body3.lr.ph:                                  ; preds = %entry
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 17179869180, i32 4, i1 false)
+  br label %for.inc7.1
+
+for.inc7.1:
+; CHECK: for.inc7.1:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false)
+  %2 = load i32, i32* %arrayidx
+  ret i32 %2
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)
diff --git a/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll b/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
new file mode 100644
index 000000000000..5b0510211d9f
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Make sure memcpy-memcpy dependence is optimized across
+; basic blocks (conditional branches and invokes).
+
+%struct.s = type { i32, i32 }
+
+@s_foo = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4
+@s_baz = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4
+@i = external constant i8*
+
+declare void @qux()
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare i8* @__cxa_begin_catch(i8*)
+
+; A simple partial redundancy. Test that the second memcpy is optimized
+; to copy directly from the original source rather than from the temporary.
+
+; CHECK-LABEL: @wobble
+define void @wobble(i8* noalias %dst, i8* %src, i1 %some_condition) {
+bb:
+  %temp = alloca i8, i32 64
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+  br i1 %some_condition, label %more, label %out
+
+out:
+  call void @qux()
+  unreachable
+
+more:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+  ret void
+}
+
+; A CFG triangle with a partial redundancy targeting an alloca. Test that the
+; memcpy inside the triangle is optimized to copy directly from the original
+; source rather than from the temporary.
+
+; CHECK-LABEL: @foo
+define i32 @foo(i1 %t3) {
+bb:
+  %s = alloca %struct.s, align 4
+  %t = alloca %struct.s, align 4
+  %s1 = bitcast %struct.s* %s to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+  br i1 %t3, label %bb4, label %bb7
+
+bb4:                                              ; preds = %bb
+  %t5 = bitcast %struct.s* %t to i8*
+  %s6 = bitcast %struct.s* %s to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* %s6, i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+  br label %bb7
+
+bb7:                                              ; preds = %bb4, %bb
+  %t8 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 0
+  %t9 = load i32, i32* %t8, align 4
+  %t10 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 1
+  %t11 = load i32, i32* %t10, align 4
+  %t12 = add i32 %t9, %t11
+  ret i32 %t12
+}
+
+; A CFG diamond with an invoke on one side, and a partially redundant memcpy
+; into an alloca on the other. Test that the memcpy inside the diamond is
+; optimized to copy directly from the original source rather than from the
+; temporary. This more complex test represents a relatively common usage
+; pattern.
+
+; CHECK-LABEL: @baz
+define i32 @baz(i1 %t5) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+bb:
+  %s = alloca %struct.s, align 4
+  %t = alloca %struct.s, align 4
+  %s3 = bitcast %struct.s* %s to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+  br i1 %t5, label %bb6, label %bb22
+
+bb6:                                              ; preds = %bb
+  invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
+          to label %bb25 unwind label %bb9
+
+bb9:                                              ; preds = %bb6
+  %t10 = landingpad { i8*, i32 }
+          catch i8* null
+  br label %bb13
+
+bb13:                                             ; preds = %bb9
+  %t15 = call i8* @__cxa_begin_catch(i8* null)
+  br label %bb23
+
+bb22:                                             ; preds = %bb
+  %t23 = bitcast %struct.s* %t to i8*
+  %s24 = bitcast %struct.s* %s to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* %s24, i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+  br label %bb23
+
+bb23:                                             ; preds = %bb22, %bb13
+  %t17 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 0
+  %t18 = load i32, i32* %t17, align 4
+  %t19 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 1
+  %t20 = load i32, i32* %t19, align 4
+  %t21 = add nsw i32 %t18, %t20
+  ret i32 %t21
+
+bb25:                                             ; preds = %bb6
+  unreachable
+}
diff --git a/test/Transforms/NewGVN/tbaa.ll b/test/Transforms/NewGVN/tbaa.ll
index 3dcc4f8acc14..d48ededac03a 100644
--- a/test/Transforms/NewGVN/tbaa.ll
+++ b/test/Transforms/NewGVN/tbaa.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -tbaa -basicaa -newgvn -S < %s | FileCheck %s
 
 define i32 @test1(i8* %p, i8* %q) {
-; CHECK: @test1(i8* %p, i8* %q)
+; CHECK-LABEL: @test1(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p)
 ; CHECK-NOT: tbaa
 ; CHECK: %c = add i32 %a, %a
@@ -12,7 +12,7 @@ define i32 @test1(i8* %p, i8* %q) {
 }
 
 define i32 @test2(i8* %p, i8* %q) {
-; CHECK: @test2(i8* %p, i8* %q)
+; CHECK-LABEL: @test2(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
@@ -22,7 +22,7 @@ define i32 @test2(i8* %p, i8* %q) {
 }
 
 define i32 @test3(i8* %p, i8* %q) {
-; CHECK: @test3(i8* %p, i8* %q)
+; CHECK-LABEL: @test3(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !3
@@ -32,7 +32,7 @@ define i32 @test3(i8* %p, i8* %q) {
 }
 
 define i32 @test4(i8* %p, i8* %q) {
-; CHECK: @test4(i8* %p, i8* %q)
+; CHECK-LABEL: @test4(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !1
@@ -42,8 +42,8 @@ define i32 @test4(i8* %p, i8* %q) {
 }
 
 define i32 @test5(i8* %p, i8* %q) {
-; CHECK: @test5(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test5(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
   %b = call i32 @foo(i8* %p), !tbaa !1
@@ -52,8 +52,8 @@ define i32 @test5(i8* %p, i8* %q) {
 }
 
 define i32 @test6(i8* %p, i8* %q) {
-; CHECK: @test6(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test6(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
   %b = call i32 @foo(i8* %p), !tbaa !3
@@ -62,7 +62,7 @@ define i32 @test6(i8* %p, i8* %q) {
 }
 
 define i32 @test7(i8* %p, i8* %q) {
-; CHECK: @test7(i8* %p, i8* %q)
+; CHECK-LABEL: @test7(i8* %p, i8* %q)
 ; CHECK: call i32 @foo(i8* %p)
 ; CHECK-NOT: tbaa
 ; CHECK: %c = add i32 %a, %a
@@ -72,10 +72,8 @@ define i32 @test7(i8* %p, i8* %q) {
   ret i32 %c
 }
 
-
-
 define i32 @test8(i32* %p, i32* %q) {
-; CHECK-LABEL: test8
+; CHECK-LABEL: @test8
 ; CHECK-NEXT: store i32 15, i32* %p
 ; CHECK-NEXT: ret i32 0
 ; Since we know the location is invariant, we can forward the
@@ -87,8 +85,9 @@ define i32 @test8(i32* %p, i32* %q) {
   %c = sub i32 %a, %b
   ret i32 %c
 }
+
 define i32 @test9(i32* %p, i32* %q) {
-; CHECK-LABEL: test9
+; CHECK-LABEL: @test9
 ; CHECK-NEXT: call void @clobber()
 ; CHECK-NEXT: ret i32 0
 ; Since we know the location is invariant, we can forward the
@@ -101,16 +100,27 @@ define i32 @test9(i32* %p, i32* %q) {
   ret i32 %c
 }
 
+define i32 @test10(i8* %p, i8* %q) {
+; If one access encloses the other, then the merged access is the enclosed one
+; and not just the common final access type.
+; CHECK-LABEL: @test10
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAG_X_i:!.*]]
+; CHECK: %c = add i32 %a, %a
+  %a = call i32 @foo(i8* %p), !tbaa !15  ; TAG_X_i
+  %b = call i32 @foo(i8* %p), !tbaa !19  ; TAG_Y_x_i
+  %c = add i32 %a, %b
+  ret i32 %c
+}
 
 declare void @clobber()
 declare i32 @foo(i8*) readonly
 
-; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
+; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
+; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
+; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
+; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
+; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
 !0 = !{!5, !5, i64 0}
 !1 = !{!6, !6, i64 0}
 !2 = !{!"tbaa root"}
@@ -122,8 +132,17 @@ declare i32 @foo(i8*) readonly
 !8 = !{!"another root"}
 !11 = !{!"scalar type", !8}
 
+; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
+; CHECK-DAG: [[TYPE_X]] = !{!"struct X", [[TYPE_int]], i64 0}
+; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
+!15 = !{!16, !17, i64 0}            ; TAG_X_i
+!16 = !{!"struct X", !17, i64 0}    ; struct X { int i; };
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"char", !2, i64 0}
 
-;; A TBAA structure who's only point is to have a constant location
+!19 = !{!20, !17, i64 0}            ; TAG_Y_x_i
+!20 = !{!"struct Y", !16, i64 0}    ; struct Y { struct X x; };
+
+; A TBAA structure whose only point is to have a constant location.
 !9 = !{!"yet another root"}
 !10 = !{!"node", !9, i64 1}
-
diff --git a/test/Transforms/PGOProfile/icp_covariant_call_return.ll b/test/Transforms/PGOProfile/icp_covariant_call_return.ll
index fc5054e3a574..aba075461deb 100644
--- a/test/Transforms/PGOProfile/icp_covariant_call_return.ll
+++ b/test/Transforms/PGOProfile/icp_covariant_call_return.ll
@@ -22,8 +22,7 @@ entry:
   %vtable = load %struct.Base* (%struct.B*)**, %struct.Base* (%struct.B*)*** %tmp2, align 8
   %vfn = getelementptr inbounds %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vtable, i64 0
   %tmp3 = load %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vfn, align 8
-; ICALL-PROM:  [[BITCAST:%[0-9]+]] = bitcast %struct.Base* (%struct.B*)* %tmp3 to i8*
-; ICALL-PROM:  [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to i8*)
+; ICALL-PROM:  [[CMP:%[0-9]+]] = icmp eq %struct.Base* (%struct.B*)* %tmp3, bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to %struct.Base* (%struct.B*)*)
 ; ICALL-PROM:  br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
 ; ICALL-PROM:if.true.direct_targ:
 ; ICALL-PROM:  [[ARG_BITCAST:%[0-9]+]] = bitcast %struct.B* %tmp1 to %struct.D*
diff --git a/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll b/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
index d2ff47dda0e6..0a4444783eb0 100644
--- a/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
+++ b/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
@@ -32,18 +32,19 @@ invoke.cont:
   %vtable = load %struct.Base* (%struct.B*)**, %struct.Base* (%struct.B*)*** %tmp2, align 8
   %vfn = getelementptr inbounds %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vtable, i64 0
   %tmp3 = load %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vfn, align 8
-; ICALL-PROM:  [[BITCAST:%[0-9]+]] = bitcast %struct.Base* (%struct.B*)* %tmp3 to i8*
-; ICALL-PROM:  [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to i8*)
+; ICALL-PROM:  [[CMP:%[0-9]+]] = icmp eq %struct.Base* (%struct.B*)* %tmp3, bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to %struct.Base* (%struct.B*)*)
 ; ICALL-PROM:  br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
 ; ICALL-PROM:if.true.direct_targ:
 ; ICALL-PROM:  [[ARG_BITCAST:%[0-9]+]] = bitcast %struct.B* %tmp1 to %struct.D*
 ; ICALL-PROM:  [[DIRCALL_RET:%[0-9]+]] = invoke %struct.Derived* @_ZN1D4funcEv(%struct.D* [[ARG_BITCAST]])
-; ICALL-PROM:          to label %if.end.icp unwind label %lpad
+; ICALL-PROM:          to label %if.true.direct_targ.if.end.icp_crit_edge unwind label %lpad
+; ICALL-PROM:if.true.direct_targ.if.end.icp_crit_edge:
+; ICALL-PROM:  [[DIRCALL_RET_CAST:%[0-9]+]] = bitcast %struct.Derived* [[DIRCALL_RET]] to %struct.Base*
+; ICALL-PROM:  br label %if.end.icp
 ; ICALL-PROM:if.false.orig_indirect:
 ; ICAll-PROM:  %call2 = invoke %struct.Base* %tmp3(%struct.B* %tmp1)
 ; ICAll-PROM:          to label %invoke.cont1 unwind label %lpad
 ; ICALL-PROM:if.end.icp:
-; ICALL-PROM:  [[DIRCALL_RET_CAST:%[0-9]+]] = bitcast %struct.Derived* [[DIRCALL_RET]] to %struct.Base*
 ; ICALL-PROM:  br label %invoke.cont1
   %call2 = invoke %struct.Base* %tmp3(%struct.B* %tmp1)
           to label %invoke.cont1 unwind label %lpad, !prof !1
diff --git a/test/Transforms/PGOProfile/icp_invoke.ll b/test/Transforms/PGOProfile/icp_invoke.ll
index 2ec564627aa1..1cacc1bc1aca 100644
--- a/test/Transforms/PGOProfile/icp_invoke.ll
+++ b/test/Transforms/PGOProfile/icp_invoke.ll
@@ -20,8 +20,7 @@ entry:
 define i32 @_Z3goov() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 entry:
   %tmp = load void ()*, void ()** @foo1, align 8
-; ICP:  [[BITCAST_IC1:%[0-9]+]] = bitcast void ()* %tmp to i8*
-; ICP:  [[CMP_IC1:%[0-9]+]] = icmp eq i8* [[BITCAST_IC1]], bitcast (void ()* @_ZL4bar1v to i8*)
+; ICP:  [[CMP_IC1:%[0-9]+]] = icmp eq void ()* %tmp, @_ZL4bar1v
 ; ICP:  br i1 [[CMP_IC1]], label %[[TRUE_LABEL_IC1:.*]], label %[[FALSE_LABEL_IC1:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
 ; ICP:[[TRUE_LABEL_IC1]]:
 ; ICP:  invoke void @_ZL4bar1v()
@@ -49,17 +48,19 @@ catch:
 
 try.cont:
   %tmp6 = load i32 ()*, i32 ()** @foo2, align 8
-; ICP:  [[BITCAST_IC2:%[0-9]+]] = bitcast i32 ()* %tmp6 to i8*
-; ICP:  [[CMP_IC2:%[0-9]+]] = icmp eq i8* [[BITCAST_IC2]], bitcast (i32 ()* @_ZL4bar2v to i8*)
+; ICP:  [[CMP_IC2:%[0-9]+]] = icmp eq i32 ()* %tmp6, @_ZL4bar2v
 ; ICP:  br i1 [[CMP_IC2]], label %[[TRUE_LABEL_IC2:.*]], label %[[FALSE_LABEL_IC2:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
 ; ICP:[[TRUE_LABEL_IC2]]:
-; ICP:  [[RESULT_IC2:%[0-9]+]] = invoke i32 @_ZL4bar2v()
-; ICP:          to label %[[DCALL_NORMAL_DEST_IC2:.*]] unwind label %lpad1
+; ICP:  [[RESULT_IC2_0:%[0-9]+]] = invoke i32 @_ZL4bar2v()
+; ICP:          to label %[[MERGE_BB:.*]] unwind label %lpad1
 ; ICP:[[FALSE_LABEL_IC2]]:
+; ICP:  [[RESULT_IC2_1:%.+]] = invoke i32 %tmp6()
+; ICP:          to label %[[MERGE_BB]] unwind label %lpad1
   %call = invoke i32 %tmp6()
           to label %try.cont8 unwind label %lpad1, !prof !3
 
-; ICP:[[DCALL_NORMAL_DEST_IC2]]:
+; ICP:[[MERGE_BB]]:
+; ICP:  [[MERGE_PHI:%.+]] = phi i32 [ [[RESULT_IC2_1]], %[[FALSE_LABEL_IC2]] ], [ [[RESULT_IC2_0]], %[[TRUE_LABEL_IC2]] ]
 ; ICP:  br label %try.cont8
 lpad1:
   %tmp7 = landingpad { i8*, i32 }
@@ -77,7 +78,7 @@ catch6:
 
 try.cont8:
   %i.0 = phi i32 [ undef, %catch6 ], [ %call, %try.cont ]
-; ICP:  %i.0 = phi i32 [ undef, %catch6 ], [ %call, %[[FALSE_LABEL_IC2]] ], [ [[RESULT_IC2]], %[[DCALL_NORMAL_DEST_IC2]] ]
+; ICP:  %i.0 = phi i32 [ undef, %catch6 ], [ [[MERGE_PHI]], %[[MERGE_BB]] ]
   ret i32 %i.0
 
 eh.resume:
diff --git a/test/Transforms/PGOProfile/icp_invoke_nouse.ll b/test/Transforms/PGOProfile/icp_invoke_nouse.ll
index 5a1e6358cb61..096d2e0f222e 100644
--- a/test/Transforms/PGOProfile/icp_invoke_nouse.ll
+++ b/test/Transforms/PGOProfile/icp_invoke_nouse.ll
@@ -18,8 +18,7 @@ entry:
 
 if.end:                                           ; preds = %entry
   %fptr = load i32 ()*, i32 ()** @pfptr, align 8
-; ICP:  [[BITCAST_IC1:%[0-9]+]] = bitcast i32 ()* %fptr to i8*
-; ICP:  [[CMP_IC1:%[0-9]+]] = icmp eq i8* [[BITCAST_IC1]], bitcast (i32 ()* @_ZL4bar1v to i8*)
+; ICP:  [[CMP_IC1:%[0-9]+]] = icmp eq i32 ()* %fptr, @_ZL4bar1v
 ; ICP:  br i1 [[CMP_IC1]], label %[[TRUE_LABEL_IC1:.*]], label %[[FALSE_LABEL_IC1:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
 ; ICP:[[TRUE_LABEL_IC1]]:
 ; ICP:  invoke i32 @_ZL4bar1v()
diff --git a/test/Transforms/PGOProfile/icp_vararg.ll b/test/Transforms/PGOProfile/icp_vararg.ll
index 400aab3aead7..ec243470290a 100644
--- a/test/Transforms/PGOProfile/icp_vararg.ll
+++ b/test/Transforms/PGOProfile/icp_vararg.ll
@@ -13,8 +13,7 @@ entry:
 define i32 @bar() #1 {
 entry:
   %tmp = load i32 (i32, ...)*, i32 (i32, ...)** @foo, align 8
-; ICALL-PROM:  [[BITCAST:%[0-9]+]] = bitcast i32 (i32, ...)* %tmp to i8*
-; ICALL-PROM:  [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (i32 (i32, ...)* @va_func to i8*)
+; ICALL-PROM:  [[CMP:%[0-9]+]] = icmp eq i32 (i32, ...)* %tmp, @va_func
 ; ICALL-PROM:  br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
 ; ICALL-PROM:if.true.direct_targ:
 ; ICALL-PROM:  [[DIRCALL_RET:%[0-9]+]] = call i32 (i32, ...) @va_func(i32 3, i32 12, i32 22, i32 4)
diff --git a/test/Transforms/PGOProfile/indirect_call_promotion.ll b/test/Transforms/PGOProfile/indirect_call_promotion.ll
index 6832fecfaed3..85df5260f199 100644
--- a/test/Transforms/PGOProfile/indirect_call_promotion.ll
+++ b/test/Transforms/PGOProfile/indirect_call_promotion.ll
@@ -43,8 +43,7 @@ entry:
 define i32 @bar() {
 entry:
   %tmp = load i32 ()*, i32 ()** @foo, align 8
-; ICALL-PROM:   [[BITCAST:%[0-9]+]] = bitcast i32 ()* %tmp to i8*
-; ICALL-PROM:   [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (i32 ()* @func4 to i8*)
+; ICALL-PROM:   [[CMP:%[0-9]+]] = icmp eq i32 ()* %tmp, @func4
 ; ICALL-PROM:   br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
 ; ICALL-PROM: if.true.direct_targ:
 ; ICALL-PROM:   [[DIRCALL_RET:%[0-9]+]] = call i32 @func4()
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
index 4def8ce561c0..557a83a75626 100644
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
+++ b/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
@@ -11,16 +11,20 @@
     define i32 @fn1() {
 ; CHECK-LABEL: @fn1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 8, i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP7]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
   entry:
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
deleted file mode 100644
index 5fc0298b6cef..000000000000
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
-
-
-;void jumble (int * restrict A, int * restrict B) {
-  ;  int tmp0 = A[10]*A[0];
-  ;  int tmp1 = A[11]*A[1];
-  ;  int tmp2 = A[12]*A[3];
-  ;  int tmp3 = A[13]*A[2];
-  ;  B[0] = tmp0;
-  ;  B[1] = tmp1;
-  ;  B[2] = tmp2;
-  ;  B[3] = tmp3;
-  ;}
-
-
-  ; Function Attrs: norecurse nounwind uwtable
-  define void @jumble1(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
-; CHECK-LABEL: @jumble1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    ret void
-;
-entry:
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
-  %0 = load i32, i32* %arrayidx, align 4
-  %1 = load i32, i32* %A, align 4
-  %mul = mul nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
-  %2 = load i32, i32* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
-  %3 = load i32, i32* %arrayidx3, align 4
-  %mul4 = mul nsw i32 %2, %3
-  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
-  %4 = load i32, i32* %arrayidx5, align 4
-  %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
-  %5 = load i32, i32* %arrayidx6, align 4
-  %mul7 = mul nsw i32 %4, %5
-  %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
-  %6 = load i32, i32* %arrayidx8, align 4
-  %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
-  %7 = load i32, i32* %arrayidx9, align 4
-  %mul10 = mul nsw i32 %6, %7
-  store i32 %mul, i32* %B, align 4
-  %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
-  store i32 %mul4, i32* %arrayidx12, align 4
-  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
-  store i32 %mul7, i32* %arrayidx13, align 4
-  %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
-  store i32 %mul10, i32* %arrayidx14, align 4
-  ret void
-  }
-
-;Reversing the operand of MUL
-  ; Function Attrs: norecurse nounwind uwtable
-  define void @jumble2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
-; CHECK-LABEL: @jumble2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    ret void
-;
-entry:
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
-  %0 = load i32, i32* %arrayidx, align 4
-  %1 = load i32, i32* %A, align 4
-  %mul = mul nsw i32 %1, %0
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
-  %2 = load i32, i32* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
-  %3 = load i32, i32* %arrayidx3, align 4
-  %mul4 = mul nsw i32 %3, %2
-  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
-  %4 = load i32, i32* %arrayidx5, align 4
-  %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
-  %5 = load i32, i32* %arrayidx6, align 4
-  %mul7 = mul nsw i32 %5, %4
-  %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
-  %6 = load i32, i32* %arrayidx8, align 4
-  %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
-  %7 = load i32, i32* %arrayidx9, align 4
-  %mul10 = mul nsw i32 %7, %6
-  store i32 %mul, i32* %B, align 4
-  %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
-  store i32 %mul4, i32* %arrayidx12, align 4
-  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
-  store i32 %mul7, i32* %arrayidx13, align 4
-  %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
-  store i32 %mul10, i32* %arrayidx14, align 4
-  ret void
-  }
-
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
deleted file mode 100644
index 568fd9f3ac79..000000000000
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
+++ /dev/null
@@ -1,225 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
-
-;void phiUsingLoads(int *restrict A, int *restrict B) {
-;  int tmp0, tmp1, tmp2, tmp3;
-;  for (int i = 0; i < 100; i++) {
-;    if (A[0] == 0) {
-;      tmp0 = A[i + 0];
-;      tmp1 = A[i + 1];
-;      tmp2 = A[i + 2];
-;      tmp3 = A[i + 3];
-;    } else if (A[25] == 0) {
-;      tmp0 = A[i + 0];
-;      tmp1 = A[i + 1];
-;      tmp2 = A[i + 2];
-;      tmp3 = A[i + 3];
-;    } else if (A[50] == 0) {
-;      tmp0 = A[i + 0];
-;      tmp1 = A[i + 1];
-;      tmp2 = A[i + 2];
-;      tmp3 = A[i + 3];
-;    } else if (A[75] == 0) {
-;      tmp0 = A[i + 0];
-;      tmp1 = A[i + 1];
-;      tmp2 = A[i + 3];
-;      tmp3 = A[i + 2];
-;    }
-;  }
-;  B[0] = tmp0;
-;  B[1] = tmp1;
-;  B[2] = tmp2;
-;  B[3] = tmp3;
-;}
-
-
-; Function Attrs: norecurse nounwind uwtable
-define void @phiUsingLoads(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) local_unnamed_addr #0 {
-; CHECK-LABEL: @phiUsingLoads(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 25
-; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 50
-; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 75
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX66:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP27:%.*]], <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    ret void
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP27]], [[FOR_INC]] ]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[CMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[CMP13]], label [[IF_THEN14:%.*]], label [[IF_ELSE27:%.*]]
-; CHECK:       if.then14:
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX17]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       if.else27:
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX28]], align 4
-; CHECK-NEXT:    [[CMP29:%.*]] = icmp eq i32 [[TMP14]], 0
-; CHECK-NEXT:    br i1 [[CMP29]], label [[IF_THEN30:%.*]], label [[IF_ELSE43:%.*]]
-; CHECK:       if.then30:
-; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX42:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX33]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       if.else43:
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4
-; CHECK-NEXT:    [[CMP45:%.*]] = icmp eq i32 [[TMP20]], 0
-; CHECK-NEXT:    br i1 [[CMP45]], label [[IF_THEN46:%.*]], label [[FOR_INC]]
-; CHECK:       if.then46:
-; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX49]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[TMP27]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP26]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-;
-entry:
-  %0 = load i32, i32* %A, align 4
-  %cmp1 = icmp eq i32 %0, 0
-  %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 25
-  %arrayidx28 = getelementptr inbounds i32, i32* %A, i64 50
-  %arrayidx44 = getelementptr inbounds i32, i32* %A, i64 75
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.inc
-  store i32 %tmp0.1, i32* %B, align 4
-  %arrayidx64 = getelementptr inbounds i32, i32* %B, i64 1
-  store i32 %tmp1.1, i32* %arrayidx64, align 4
-  %arrayidx65 = getelementptr inbounds i32, i32* %B, i64 2
-  store i32 %tmp2.1, i32* %arrayidx65, align 4
-  %arrayidx66 = getelementptr inbounds i32, i32* %B, i64 3
-  store i32 %tmp3.1, i32* %arrayidx66, align 4
-  ret void
-
-for.body:                                         ; preds = %for.inc, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
-  %tmp3.0111 = phi i32 [ undef, %entry ], [ %tmp3.1, %for.inc ]
-  %tmp2.0110 = phi i32 [ undef, %entry ], [ %tmp2.1, %for.inc ]
-  %tmp1.0109 = phi i32 [ undef, %entry ], [ %tmp1.1, %for.inc ]
-  %tmp0.0108 = phi i32 [ undef, %entry ], [ %tmp0.1, %for.inc ]
-  br i1 %cmp1, label %if.then, label %if.else
-
-if.then:                                          ; preds = %for.body
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx2, align 4
-  %2 = add nuw nsw i64 %indvars.iv, 1
-  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %2
-  %3 = load i32, i32* %arrayidx5, align 4
-  %4 = add nuw nsw i64 %indvars.iv, 2
-  %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %4
-  %5 = load i32, i32* %arrayidx8, align 4
-  %6 = add nuw nsw i64 %indvars.iv, 3
-  %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %6
-  %7 = load i32, i32* %arrayidx11, align 4
-  br label %for.inc
-
-if.else:                                          ; preds = %for.body
-  %8 = load i32, i32* %arrayidx12, align 4
-  %cmp13 = icmp eq i32 %8, 0
-  br i1 %cmp13, label %if.then14, label %if.else27
-
-if.then14:                                        ; preds = %if.else
-  %arrayidx17 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %9 = load i32, i32* %arrayidx17, align 4
-  %10 = add nuw nsw i64 %indvars.iv, 1
-  %arrayidx20 = getelementptr inbounds i32, i32* %A, i64 %10
-  %11 = load i32, i32* %arrayidx20, align 4
-  %12 = add nuw nsw i64 %indvars.iv, 2
-  %arrayidx23 = getelementptr inbounds i32, i32* %A, i64 %12
-  %13 = load i32, i32* %arrayidx23, align 4
-  %14 = add nuw nsw i64 %indvars.iv, 3
-  %arrayidx26 = getelementptr inbounds i32, i32* %A, i64 %14
-  %15 = load i32, i32* %arrayidx26, align 4
-  br label %for.inc
-
-if.else27:                                        ; preds = %if.else
-  %16 = load i32, i32* %arrayidx28, align 4
-  %cmp29 = icmp eq i32 %16, 0
-  br i1 %cmp29, label %if.then30, label %if.else43
-
-if.then30:                                        ; preds = %if.else27
-  %arrayidx33 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %17 = load i32, i32* %arrayidx33, align 4
-  %18 = add nuw nsw i64 %indvars.iv, 1
-  %arrayidx36 = getelementptr inbounds i32, i32* %A, i64 %18
-  %19 = load i32, i32* %arrayidx36, align 4
-  %20 = add nuw nsw i64 %indvars.iv, 2
-  %arrayidx39 = getelementptr inbounds i32, i32* %A, i64 %20
-  %21 = load i32, i32* %arrayidx39, align 4
-  %22 = add nuw nsw i64 %indvars.iv, 3
-  %arrayidx42 = getelementptr inbounds i32, i32* %A, i64 %22
-  %23 = load i32, i32* %arrayidx42, align 4
-  br label %for.inc
-
-if.else43:                                        ; preds = %if.else27
-  %24 = load i32, i32* %arrayidx44, align 4
-  %cmp45 = icmp eq i32 %24, 0
-  br i1 %cmp45, label %if.then46, label %for.inc
-
-if.then46:                                        ; preds = %if.else43
-  %arrayidx49 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %25 = load i32, i32* %arrayidx49, align 4
-  %26 = add nuw nsw i64 %indvars.iv, 1
-  %arrayidx52 = getelementptr inbounds i32, i32* %A, i64 %26
-  %27 = load i32, i32* %arrayidx52, align 4
-  %28 = add nuw nsw i64 %indvars.iv, 3
-  %arrayidx55 = getelementptr inbounds i32, i32* %A, i64 %28
-  %29 = load i32, i32* %arrayidx55, align 4
-  %30 = add nuw nsw i64 %indvars.iv, 2
-  %arrayidx58 = getelementptr inbounds i32, i32* %A, i64 %30
-  %31 = load i32, i32* %arrayidx58, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.then, %if.then30, %if.else43, %if.then46, %if.then14
-  %tmp0.1 = phi i32 [ %1, %if.then ], [ %9, %if.then14 ], [ %17, %if.then30 ], [ %25, %if.then46 ], [ %tmp0.0108, %if.else43 ]
-  %tmp1.1 = phi i32 [ %3, %if.then ], [ %11, %if.then14 ], [ %19, %if.then30 ], [ %27, %if.then46 ], [ %tmp1.0109, %if.else43 ]
-  %tmp2.1 = phi i32 [ %5, %if.then ], [ %13, %if.then14 ], [ %21, %if.then30 ], [ %29, %if.then46 ], [ %tmp2.0110, %if.else43 ]
-  %tmp3.1 = phi i32 [ %7, %if.then ], [ %15, %if.then14 ], [ %23, %if.then30 ], [ %31, %if.then46 ], [ %tmp3.0111, %if.else43 ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 100
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
index be58521ed898..06e051a90b0d 100644
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
+++ b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -5,27 +5,34 @@
 
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
-; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
+; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
+; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
-; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
-; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
-; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
+; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
+; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_7]], align 4
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
+; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_8]], align 4
+; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
+; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_9]], align 4
+; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
+; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_10]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
index 6ae763520013..1b2c76384e0b 100644
--- a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
+++ b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -6,26 +6,33 @@
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_9]], align 4
+; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_7]], align 4
+; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_10]], align 4
+; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/test/Transforms/SampleProfile/entry_counts.ll b/test/Transforms/SampleProfile/entry_counts.ll
index 6137a6908cf5..cab7c87e0493 100644
--- a/test/Transforms/SampleProfile/entry_counts.ll
+++ b/test/Transforms/SampleProfile/entry_counts.ll
@@ -9,8 +9,8 @@ entry:
   ret void, !dbg !9
 }
 
-; This function does not have profile, check if function_entry_count is 0
-; CHECK: {{.*}} = !{!"function_entry_count", i64 0}
+; This function does not have a profile; check that function_entry_count is -1
+; CHECK: {{.*}} = !{!"function_entry_count", i64 -1}
 define void @no_profile() {
 entry:
   ret void
diff --git a/test/Transforms/SimplifyCFG/X86/if-conversion.ll b/test/Transforms/SimplifyCFG/X86/if-conversion.ll
new file mode 100644
index 000000000000..28702572d480
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/if-conversion.ll
@@ -0,0 +1,231 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if-conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Cases where FindLongDependenceChain returns true, so if-conversion is blocked.
+
+define i64 @test1(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8 ; first load: a pointer read through %pp
+  %1 = load i64, i64* %0, align 8 ; second load, dependent on the first
+  %cmp = icmp slt i64 %1, 0 ; the branch condition sits at the end of that load chain
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] ; would become a select if the branch were if-converted
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8 ; the address of this load comes from the phi
+  ret i64 %val
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: select
+}
+
+define i64 @test2(i64** %pp, i64* %p) { ; same shape as @test1; no select is expected in the output
+entry:
+  %0 = load i64*, i64** %pp, align 8 ; first load: a pointer read through %pp
+  %1 = load i64, i64* %0, align 8 ; second load, dependent on the first
+  %cmp = icmp slt i64 %1, 0 ; the branch condition sits at the end of that load chain
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = add i64 %pint, 16 ; the only difference from @test1, which uses `or` on this arm
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] ; would become a select if the branch were if-converted
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8 ; the address of this load comes from the phi
+  ret i64 %val
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: select
+}
+
+; The following cases test that FindLongDependenceChain returns false, so
+; if-conversion will proceed.
+
+; Non-trivial LatencyAdjustment: the final address also adds %1, the loaded value that feeds the compare.
+define i64 @test3(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8 ; first load: a pointer read through %pp
+  %1 = load i64, i64* %0, align 8 ; second load, dependent on the first; reused below in %p4
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] ; expected to be replaced by a select
+  %p4 = add i64 %p3, %1 ; the address depends on %1 regardless of which arm was taken
+  %ptr = inttoptr i64 %p4 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test3
+; CHECK: select
+}
+
+; Short dependence chain: a single load feeds the compare (@test1 chains two loads).
+define i64 @test4(i64* %pp, i64* %p) {
+entry:
+  %0 = load i64, i64* %pp, align 8 ; the only load ahead of the branch condition
+  %cmp = icmp slt i64 %0, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] ; expected to be replaced by a select
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8 ; the address of this load comes from the phi
+  ret i64 %val
+
+; CHECK-LABEL: @test4
+; CHECK: select
+}
+
+; High IPC: entry holds 20 extra arithmetic ops forming four independent chains rooted at %pint.
+define i64 @test5(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8 ; first load: a pointer read through %pp
+  %1 = load i64, i64* %0, align 8 ; second load, dependent on the first
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  %2 = add i64 %pint, 2 ; heads of the four chains: %2, %3, %4, %5
+  %3 = add i64 %pint, 3
+  %4 = or i64 %pint, 16
+  %5 = and i64 %pint, 255
+
+  %6 = or i64 %2, 9 ; each following group of four extends every chain by one op
+  %7 = and i64 %3, 255
+  %8 = add i64 %4, 4
+  %9 = add i64 %5, 5
+
+  %10 = add i64 %6, 2
+  %11 = add i64 %7, 3
+  %12 = add i64 %8, 4
+  %13 = add i64 %9, 5
+
+  %14 = add i64 %10, 6
+  %15 = add i64 %11, 7
+  %16 = add i64 %12, 8
+  %17 = add i64 %13, 9
+
+  %18 = add i64 %14, 10 ; chain tails %18..%21 have no users in this function
+  %19 = add i64 %15, 11
+  %20 = add i64 %16, 12
+  %21 = add i64 %17, 13
+
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] ; expected to be replaced by a select
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+
+  ret i64 %val
+
+; CHECK-LABEL: @test5
+; CHECK: select
+}
+
+; Large BB size: cond.end carries 34 extra arithmetic ops after the load, all feeding the result.
+define i64 @test6(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8 ; first load: a pointer read through %pp
+  %1 = load i64, i64* %0, align 8 ; second load, dependent on the first
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] ; expected to be replaced by a select
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  %2 = add i64 %pint, 2 ; start of two interleaved add chains (even- and odd-numbered values)
+  %3 = add i64 %pint, 3
+  %4 = add i64 %2, 4
+  %5 = add i64 %3, 5
+  %6 = add i64 %4, 6
+  %7 = add i64 %5, 7
+  %8 = add i64 %6, 6
+  %9 = add i64 %7, 7
+  %10 = add i64 %8, 6
+  %11 = add i64 %9, 7
+  %12 = add i64 %10, 6
+  %13 = add i64 %11, 7
+  %14 = add i64 %12, 6
+  %15 = add i64 %13, 7
+  %16 = add i64 %14, 6
+  %17 = add i64 %15, 7
+  %18 = add i64 %16, 6
+  %19 = add i64 %17, 7
+  %20 = add i64 %18, 6
+  %21 = add i64 %19, 7
+  %22 = add i64 %20, 6
+  %23 = add i64 %21, 7
+  %24 = add i64 %22, 6
+  %25 = add i64 %23, 7
+  %26 = add i64 %24, 6
+  %27 = add i64 %25, 7
+  %28 = add i64 %26, 6
+  %29 = add i64 %27, 7
+  %30 = add i64 %28, 6
+  %31 = add i64 %29, 7
+  %32 = add i64 %30, 8
+  %33 = add i64 %31, 9
+  %34 = add i64 %32, %33 ; the two chains join here
+  %35 = and i64 %34, 255
+  %res = add i64 %val, %35 ; the padding arithmetic is folded into the returned value
+
+  ret i64 %res
+
+; CHECK-LABEL: @test6
+; CHECK: select
+}