43 files changed, 1999 insertions, 128 deletions
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
new file mode 100644
index 000000000000..328e8cc2907f
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
@@ -0,0 +1,337 @@
+; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown < %s   | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
+
+define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp2(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
+  ret i32 %call
+}
+
+define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp3(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
+  ret i32 %call
+}
+
+define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp4(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
+  ret i32 %call
+}
+
+define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp5(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
+  ret i32 %call
+}
+
+define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp6(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
+  ret i32 %call
+}
+
+define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp7(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+  ret i32 %call
+}
+
+define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp8(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+  ret i32 %call
+}
+
+define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp9(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+  ret i32 %call
+}
+
+define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp10(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+  ret i32 %call
+}
+
+define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp11(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+  ret i32 %call
+}
+
+define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp12(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+  ret i32 %call
+}
+
+define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp13(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+  ret i32 %call
+}
+
+define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp14(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+  ret i32 %call
+}
+
+define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp15(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+  ret i32 %call
+}
+
+define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp16(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+; ALL-NEXT:    ret i32 [[CALL]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+  ret i32 %call
+}
+
+define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq2(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq3(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq4(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq5(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq6(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq7(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq8(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq9(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq10(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq11(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq12(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq13(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq14(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq15(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
+; ALL-LABEL: @cmp_eq16(
+; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; ALL-NEXT:    ret i32 [[CONV]]
+;
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
diff --git a/test/Transforms/ConstProp/sse.ll b/test/Transforms/ConstProp/sse.ll
index cc37c96c1ff1..ad0a62e42062 100644
--- a/test/Transforms/ConstProp/sse.ll
+++ b/test/Transforms/ConstProp/sse.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -constprop -S | FileCheck %s
-; REQUIRES: x86
+; REQUIRES: x86-registered-target
 
 define i1 @test_sse_cvts_exact() nounwind readnone {
 ; CHECK-LABEL: @test_sse_cvts_exact(
diff --git a/test/Transforms/DCE/calls-errno.ll b/test/Transforms/DCE/calls-errno.ll
index 22ea04aa8f36..415caae0fe60 100644
--- a/test/Transforms/DCE/calls-errno.ll
+++ b/test/Transforms/DCE/calls-errno.ll
@@ -72,6 +72,10 @@ entry:
 ; CHECK-NEXT: %cos2 = call double @cos(double 0x7FF0000000000000)
   %cos2 = call double @cos(double 0x7FF0000000000000)
 
+; cos(0) nobuiltin may have side effects 
+; CHECK-NEXT: %cos3 = call double @cos(double 0.000000e+00)
+  %cos3 = call double @cos(double 0.000000e+00) nobuiltin
+
 ; pow(0, 1) is 0
   %pow1 = call double @pow(double 0x7FF0000000000000, double 1.000000e+00)
 
diff --git a/test/Transforms/GVNSink/sink-common-code.ll b/test/Transforms/GVNSink/sink-common-code.ll
index d9e757cd10fc..02b1eb7fe259 100644
--- a/test/Transforms/GVNSink/sink-common-code.ll
+++ b/test/Transforms/GVNSink/sink-common-code.ll
@@ -54,33 +54,36 @@ if.end:
 
 declare i32 @foo(i32, i32) nounwind readnone
 
-define i32 @test3(i1 zeroext %flag, i32 %x, i32 %y) {
-entry:
-  br i1 %flag, label %if.then, label %if.else
-
-if.then:
-  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
-  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
-  br label %if.end
-
-if.else:
-  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
-  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
-  br label %if.end
-
-if.end:
-  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
-  %yy = phi i32 [ %y0, %if.then ], [ %y1, %if.else ]
-  %ret = add i32 %xx, %yy
-  ret i32 %ret
-}
-
-; CHECK-LABEL: test3
-; CHECK: select
-; CHECK: call
-; CHECK: call
-; CHECK: add
-; CHECK-NOT: br
+; FIXME: The test failes when the original order of the
+; candidates with the same cost is preserved.
+;
+;define i32 @test3(i1 zeroext %flag, i32 %x, i32 %y) {
+;entry:
+;  br i1 %flag, label %if.then, label %if.else
+;
+;if.then:
+;  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
+;  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
+;  br label %if.end
+;
+;if.else:
+;  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
+;  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
+;  br label %if.end
+;
+;if.end:
+;  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
+;  %yy = phi i32 [ %y0, %if.then ], [ %y1, %if.else ]
+;  %ret = add i32 %xx, %yy
+;  ret i32 %ret
+;}
+;
+; -CHECK-LABEL: test3
+; -CHECK: select
+; -CHECK: call
+; -CHECK: call
+; -CHECK: add
+; -CHECK-NOT: br
 
 define i32 @test4(i1 zeroext %flag, i32 %x, i32* %y) {
 entry:
diff --git a/test/Transforms/IRCE/correct-loop-info.ll b/test/Transforms/IRCE/correct-loop-info.ll
new file mode 100644
index 000000000000..3c26b47f154f
--- /dev/null
+++ b/test/Transforms/IRCE/correct-loop-info.ll
@@ -0,0 +1,182 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -irce < %s -S | FileCheck %s
+
+; REQUIRES: asserts
+
+; IRCE creates the pre and post loop, and invokes the
+; canonicalizing these loops to LCSSA and loop-simplfy structure. Make sure that the update to the loopinfo does not
+; incorrectly change the header while canonicalizing these pre/post loops. We
+; were incorrectly updating LI when the split loop is a subloop as in the case below.
+source_filename = "correct-loop-info.ll"
+
+define void @baz() personality i32* ()* @ham {
+; CHECK-LABEL: @baz(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[OUTERHEADER:%.*]]
+; CHECK:       outerheader:
+; CHECK-NEXT:    [[TMP:%.*]] = icmp slt i32 undef, 84
+; CHECK-NEXT:    br i1 [[TMP]], label [[BB2:%.*]], label [[BB16:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br i1 false, label [[INNERHEADER_PRELOOP_PREHEADER:%.*]], label [[PRELOOP_PSEUDO_EXIT:%.*]]
+; CHECK:       innerheader.preloop.preheader:
+; CHECK-NEXT:    br label [[INNERHEADER_PRELOOP:%.*]]
+; CHECK:       mainloop:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt i32 [[INDVAR_END:%.*]], -1
+; CHECK-NEXT:    br i1 [[TMP0]], label [[INNERHEADER_PREHEADER:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       innerheader.preheader:
+; CHECK-NEXT:    br label [[INNERHEADER:%.*]]
+; CHECK:       innerheader:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP6:%.*]], [[BB8:%.*]] ], [ [[TMP4_PRELOOP_COPY:%.*]], [[INNERHEADER_PREHEADER]] ]
+; CHECK-NEXT:    invoke void @pluto()
+; CHECK-NEXT:    to label [[BB5:%.*]] unwind label %outer_exiting.loopexit.split-lp.loopexit.split-lp
+; CHECK:       bb5:
+; CHECK-NEXT:    [[TMP6]] = add i32 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 true, label [[BB8]], label [[EXIT3_LOOPEXIT5:%.*]]
+; CHECK:       bb8:
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp slt i32 [[TMP6]], 84
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[TMP6]], -1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[INNERHEADER]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[TMP6_LCSSA:%.*]] = phi i32 [ [[TMP6]], [[BB8]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP6_LCSSA]], 84
+; CHECK-NEXT:    br i1 [[TMP2]], label [[MAIN_PSEUDO_EXIT]], label [[BB13:%.*]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[TMP4_COPY:%.*]] = phi i32 [ [[TMP4_PRELOOP_COPY]], [[MAINLOOP:%.*]] ], [ [[TMP6_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END1:%.*]] = phi i32 [ [[INDVAR_END]], [[MAINLOOP]] ], [ [[TMP6_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       outer_exiting.loopexit:
+; CHECK-NEXT:    [[LPAD_LOOPEXIT:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    cleanup
+; CHECK-NEXT:    br label [[OUTER_EXITING:%.*]]
+; CHECK:       outer_exiting.loopexit.split-lp.loopexit:
+; CHECK-NEXT:    [[LPAD_LOOPEXIT2:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    cleanup
+; CHECK-NEXT:    br label %outer_exiting.loopexit.split-lp
+; CHECK:       outer_exiting.loopexit.split-lp.loopexit.split-lp:
+; CHECK-NEXT:    %lpad.loopexit.split-lp3 = landingpad { i8*, i32 }
+; CHECK-NEXT:    cleanup
+; CHECK-NEXT:    br label %outer_exiting.loopexit.split-lp
+; CHECK:       outer_exiting.loopexit.split-lp:
+; CHECK-NEXT:    br label [[OUTER_EXITING]]
+; CHECK:       outer_exiting:
+; CHECK-NEXT:    switch i32 undef, label [[EXIT2:%.*]] [
+; CHECK-NEXT:    i32 142, label [[BB14:%.*]]
+; CHECK-NEXT:    i32 448, label [[EXIT:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       exit3.loopexit:
+; CHECK-NEXT:    br label [[EXIT3:%.*]]
+; CHECK:       exit3.loopexit4:
+; CHECK-NEXT:    br label [[EXIT3]]
+; CHECK:       exit3.loopexit5:
+; CHECK-NEXT:    br label [[EXIT3]]
+; CHECK:       exit3:
+; CHECK-NEXT:    ret void
+; CHECK:       bb13.loopexit:
+; CHECK-NEXT:    br label [[BB13]]
+; CHECK:       bb13:
+; CHECK-NEXT:    unreachable
+; CHECK:       bb14:
+; CHECK-NEXT:    br label [[OUTERHEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       bb16:
+; CHECK-NEXT:    ret void
+; CHECK:       exit2:
+; CHECK-NEXT:    ret void
+; CHECK:       innerheader.preloop:
+; CHECK-NEXT:    [[TMP4_PRELOOP:%.*]] = phi i32 [ [[TMP6_PRELOOP:%.*]], [[BB8_PRELOOP:%.*]] ], [ undef, [[INNERHEADER_PRELOOP_PREHEADER]] ]
+; CHECK-NEXT:    invoke void @pluto()
+; CHECK-NEXT:    to label [[BB5_PRELOOP:%.*]] unwind label [[OUTER_EXITING_LOOPEXIT:%.*]]
+; CHECK:       bb5.preloop:
+; CHECK-NEXT:    [[TMP6_PRELOOP]] = add i32 [[TMP4_PRELOOP]], 1
+; CHECK-NEXT:    [[TMP7_PRELOOP:%.*]] = icmp ult i32 [[TMP6_PRELOOP]], 0
+; CHECK-NEXT:    br i1 [[TMP7_PRELOOP]], label [[BB8_PRELOOP]], label [[EXIT3_LOOPEXIT:%.*]]
+; CHECK:       bb8.preloop:
+; CHECK-NEXT:    [[TMP9_PRELOOP:%.*]] = icmp slt i32 [[TMP6_PRELOOP]], 84
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP6_PRELOOP]], -1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[INNERHEADER_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR:%.*]], !llvm.loop !0, !irce.loop.clone !5
+; CHECK:       preloop.exit.selector:
+; CHECK-NEXT:    [[TMP6_PRELOOP_LCSSA:%.*]] = phi i32 [ [[TMP6_PRELOOP]], [[BB8_PRELOOP]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[TMP6_PRELOOP_LCSSA]], 84
+; CHECK-NEXT:    br i1 [[TMP4]], label [[PRELOOP_PSEUDO_EXIT]], label [[BB13]]
+; CHECK:       preloop.pseudo.exit:
+; CHECK-NEXT:    [[TMP4_PRELOOP_COPY]] = phi i32 [ undef, [[BB2]] ], [ [[TMP6_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END]] = phi i32 [ undef, [[BB2]] ], [ [[TMP6_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[MAINLOOP]]
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[INNERHEADER_POSTLOOP:%.*]]
+; CHECK:       innerheader.postloop:
+; CHECK-NEXT:    [[TMP4_POSTLOOP:%.*]] = phi i32 [ [[TMP6_POSTLOOP:%.*]], [[BB8_POSTLOOP:%.*]] ], [ [[TMP4_COPY]], [[POSTLOOP]] ]
+; CHECK-NEXT:    invoke void @pluto()
+; CHECK-NEXT:    to label [[BB5_POSTLOOP:%.*]] unwind label %outer_exiting.loopexit.split-lp.loopexit
+; CHECK:       bb5.postloop:
+; CHECK-NEXT:    [[TMP6_POSTLOOP]] = add i32 [[TMP4_POSTLOOP]], 1
+; CHECK-NEXT:    [[TMP7_POSTLOOP:%.*]] = icmp ult i32 [[TMP6_POSTLOOP]], 0
+; CHECK-NEXT:    br i1 [[TMP7_POSTLOOP]], label [[BB8_POSTLOOP]], label [[EXIT3_LOOPEXIT4:%.*]]
+; CHECK:       bb8.postloop:
+; CHECK-NEXT:    [[TMP9_POSTLOOP:%.*]] = icmp slt i32 [[TMP6_POSTLOOP]], 84
+; CHECK-NEXT:    br i1 [[TMP9_POSTLOOP]], label [[INNERHEADER_POSTLOOP]], label [[BB13_LOOPEXIT:%.*]], !llvm.loop !6, !irce.loop.clone !5
+;
+bb:
+  br label %outerheader
+
+outerheader:                                              ; preds = %bb14, %bb
+  %tmp = icmp slt i32 undef, 84
+  br i1 %tmp, label %bb2, label %bb16
+
+bb2:                                              ; preds = %outerheader
+  br label %innerheader
+
+innerheader:                                              ; preds = %bb8, %bb2
+  %tmp4 = phi i32 [ %tmp6, %bb8 ], [ undef, %bb2 ]
+  invoke void @pluto()
+  to label %bb5 unwind label %outer_exiting
+
+bb5:                                              ; preds = %innerheader
+  %tmp6 = add i32 %tmp4, 1
+  %tmp7 = icmp ult i32 %tmp6, 0
+  br i1 %tmp7, label %bb8, label %exit3
+
+bb8:                                              ; preds = %bb5
+  %tmp9 = icmp slt i32 %tmp6, 84
+  br i1 %tmp9, label %innerheader, label %bb13
+
+outer_exiting:                                             ; preds = %innerheader
+  %tmp11 = landingpad { i8*, i32 }
+  cleanup
+  switch i32 undef, label %exit2 [
+  i32 142, label %bb14
+  i32 448, label %exit
+  ]
+
+exit3:                                             ; preds = %bb5
+  ret void
+
+bb13:                                             ; preds = %bb8
+  unreachable
+
+bb14:                                             ; preds = %outer_exiting
+  br label %outerheader
+
+exit:                                             ; preds = %outer_exiting
+  ret void
+
+bb16:                                             ; preds = %outerheader
+  ret void
+
+exit2:                                             ; preds = %outer_exiting
+  ret void
+}
+
+declare i32* @ham()
+
+declare void @pluto()
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.unroll.disable"}
+!2 = !{!"llvm.loop.vectorize.enable", i1 false}
+!3 = !{!"llvm.loop.licm_versioning.disable"}
+!4 = !{!"llvm.loop.distribute.enable", i1 false}
+!5 = !{}
+!6 = distinct !{!6, !1, !2, !3, !4}
diff --git a/test/Transforms/IndVarSimplify/lftr_disabled.ll b/test/Transforms/IndVarSimplify/lftr_disabled.ll
new file mode 100644
index 000000000000..c647d123dd75
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/lftr_disabled.ll
@@ -0,0 +1,28 @@
+; LFTR should not eliminate the need for the computation of i*i completely
+; due to LFTR is disabled.
+; RUN: opt < %s -indvars -dce -disable-lftr -S | FileCheck %s
+
+; Provide legal integer types.
+target datalayout = "n8:16:32:64"
+
+
+@A = external global i32                ; <i32*> [#uses=1]
+
+define i32 @quadratic_setlt() {
+; CHECK-LABEL: @quadratic_setlt(
+; CHECK: mul
+entry:
+        br label %loop
+
+loop:           ; preds = %loop, %entry
+        %i = phi i32 [ 7, %entry ], [ %i.next, %loop ]          ; <i32> [#uses=5]
+        %i.next = add i32 %i, 1         ; <i32> [#uses=1]
+        store i32 %i, i32* @A
+        %i2 = mul i32 %i, %i            ; <i32> [#uses=1]
+        %c = icmp slt i32 %i2, 1000             ; <i1> [#uses=1]
+        br i1 %c, label %loop, label %loopexit
+
+loopexit:               ; preds = %loop
+        ret i32 %i
+}
+
diff --git a/test/Transforms/InferAddressSpaces/NVPTX/clone_constexpr.ll b/test/Transforms/InferAddressSpaces/NVPTX/clone_constexpr.ll
new file mode 100644
index 000000000000..1b3240620571
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/NVPTX/clone_constexpr.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -mtriple=nvptx64-nvidia-cuda -infer-address-spaces %s | FileCheck %s
+
+%struct.S = type { [5 x i32] }
+
+$g1 = comdat any
+
+@g1 = linkonce_odr addrspace(3) global %struct.S zeroinitializer, comdat, align 4
+
+; CHECK-LABEL: @foo(
+; CHECK:  %x0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+; CHECK:  %idxprom.i = zext i32 %x0 to i64
+; CHECK:  %arrayidx.i = getelementptr %struct.S, %struct.S* addrspacecast (%struct.S addrspace(3)* @g1 to %struct.S*), i64 0, i32 0, i64 %idxprom.i
+; CHECK:  tail call void @f1(i32* %arrayidx.i, i32 undef) #0
+; CHECK:  %x1 = load i32, i32* getelementptr (%struct.S, %struct.S* addrspacecast (%struct.S addrspace(3)* @g1 to %struct.S*), i64 0, i32 0, i64 0), align 4
+; CHECK:  %L.sroa.0.0.insert.ext.i = zext i32 %x1 to i64
+; CHECK:  tail call void @f2(i64* null, i64 %L.sroa.0.0.insert.ext.i) #0
+; CHECK:  ret void
+define void @foo() local_unnamed_addr #0 {
+entry:
+  %x0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
+  %idxprom.i = zext i32 %x0 to i64
+  %arrayidx.i = getelementptr %struct.S, %struct.S* addrspacecast (%struct.S addrspace(3)* @g1 to %struct.S*), i64 0, i32 0, i64 %idxprom.i
+  tail call void @f1(i32* %arrayidx.i, i32 undef) #0
+  %x1 = load i32, i32* getelementptr (%struct.S, %struct.S* addrspacecast (%struct.S addrspace(3)* @g1 to %struct.S*), i64 0, i32 0, i64 0), align 4
+  %L.sroa.0.0.insert.ext.i = zext i32 %x1 to i64
+  tail call void @f2(i64* null, i64 %L.sroa.0.0.insert.ext.i) #0
+  ret void
+}
+
+declare void @f1(i32*, i32) local_unnamed_addr #0
+declare void @f2(i64*, i64) local_unnamed_addr #0
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/Transforms/Inline/basictest.ll b/test/Transforms/Inline/basictest.ll
index b98644cd2dd4..f34ed0841132 100644
--- a/test/Transforms/Inline/basictest.ll
+++ b/test/Transforms/Inline/basictest.ll
@@ -91,3 +91,27 @@ define i32 @test() {
   ret i32 %e
 ; CHECK: }
 }
+
+; Inliner shouldn't delete calls it can't inline, even if they're trivially dead
+; CHECK-LABEL: @outer4(
+define void @outer4(void ()* %inner4) {
+entry:
+; CHECK: call void %inner4()
+  call void %inner4() nounwind readnone
+  ret void
+}
+
+declare void @inner5_inner()
+
+define void @inner5(void ()* %x) {
+  call void %x() nounwind readnone
+  ret void
+}
+
+; Inliner shouldn't delete calls it can't inline, even if they're trivially dead and temporarily indirect
+; CHECK-LABEL: @outer5(
+define void @outer5() {
+; CHECK: call void @inner5_inner(
+  call void @inner5(void ()* @inner5_inner)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/constant-fold-libfunc.ll b/test/Transforms/InstCombine/constant-fold-libfunc.ll
new file mode 100644
index 000000000000..c969b65a4e74
--- /dev/null
+++ b/test/Transforms/InstCombine/constant-fold-libfunc.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare double @acos(double)
+
+; Check that functions without any function attributes are simplified.
+
+define double @test_simplify_acos() {
+; CHECK-LABEL: @test_simplify_acos
+  %pi = call double @acos(double -1.000000e+00)
+; CHECK-NOT: call double @acos
+; CHECK: ret double 0x400921FB54442D18
+  ret double %pi
+}
+
+define double @test_acos_nobuiltin() {
+; CHECK-LABEL: @test_acos_nobuiltin
+  %pi = call double @acos(double -1.000000e+00) nobuiltin 
+; CHECK: call double @acos(double -1.000000e+00)
+  ret double %pi
+}
diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index 29f774c5f62b..fb25c2342798 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -260,3 +260,26 @@ bb2:
   %ins2 = insertelement <4 x float> %ins1, float %ext1, i32 3
   ret <4 x float> %ins2
 }
+
+; Don't insert extractelements from the wider vector before the def of the index operand.
+
+define <4 x i32> @extractelt_insertion(<2 x i32> %x, i32 %y) {
+; CHECK-LABEL: @extractelt_insertion(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, <4 x i32> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[Y:%.*]], 3
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 [[C]]
+; CHECK-NEXT:    [[E:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[E]], <4 x i32> [[B]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[RET]]
+;
+entry:
+  %a = extractelement <2 x i32> %x, i32 1
+  %b = insertelement <4 x i32> zeroinitializer, i32 %a, i64 3
+  %c = add i32 %y, 3
+  %d = extractelement <2 x i32> %x, i32 %c
+  %e = icmp eq i32 %d, 0
+  %ret = select i1 %e, <4 x i32> %b, <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 78c98955353e..1b1ed606868f 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -21,6 +21,7 @@ declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
 declare double @llvm.cos.f64(double %Val) nounwind readonly
 declare double @llvm.sin.f64(double %Val) nounwind readonly
 declare double @llvm.floor.f64(double %Val) nounwind readonly
@@ -282,6 +283,16 @@ define i32 @cttz(i32 %a) {
   ret i32 %count
 }
 
+define <2 x i32> @cttz_vec(<2 x i32> %a) {
+; CHECK-LABEL: @cttz_vec(
+; CHECK-NEXT:    ret <2 x i32> <i32 3, i32 3>
+;
+  %or = or <2 x i32> %a, <i32 8, i32 8>
+  %and = and <2 x i32> %or, <i32 -8, i32 -8>
+  %count = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %and, i1 true) nounwind readnone
+  ret <2 x i32> %count
+}
+
 define i1 @cttz_knownbits(i32 %arg) {
 ; CHECK-LABEL: @cttz_knownbits(
 ; CHECK-NEXT:    ret i1 false
@@ -292,6 +303,16 @@ define i1 @cttz_knownbits(i32 %arg) {
   ret i1 %res
 }
 
+define <2 x i1> @cttz_knownbits_vec(<2 x i32> %arg) {
+; CHECK-LABEL: @cttz_knownbits_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %or = or <2 x i32> %arg, <i32 4, i32 4>
+  %cnt = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %or, i1 true) nounwind readnone
+  %res = icmp eq <2 x i32> %cnt, <i32 4, i32 4>
+  ret <2 x i1> %res
+}
+
 define i1 @cttz_knownbits2(i32 %arg) {
 ; CHECK-LABEL: @cttz_knownbits2(
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[ARG:%.*]], 4
@@ -305,6 +326,19 @@ define i1 @cttz_knownbits2(i32 %arg) {
   ret i1 %res
 }
 
+define <2 x i1> @cttz_knownbits2_vec(<2 x i32> %arg) {
+; CHECK-LABEL: @cttz_knownbits2_vec(
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[ARG:%.*]], <i32 4, i32 4>
+; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[OR]], i1 true)
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq <2 x i32> [[CNT]], <i32 2, i32 2>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %or = or <2 x i32> %arg, <i32 4, i32 4>
+  %cnt = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %or, i1 true) nounwind readnone
+  %res = icmp eq <2 x i32> %cnt, <i32 2, i32 2>
+  ret <2 x i1> %res
+}
+
 ; TODO: The icmp is unnecessary given the known bits of the input.
 define i1 @cttz_knownbits3(i32 %arg) {
 ; CHECK-LABEL: @cttz_knownbits3(
@@ -319,6 +353,20 @@ define i1 @cttz_knownbits3(i32 %arg) {
   ret i1 %res
 }
 
+; TODO: The icmp is unnecessary given the known bits of the input.
+define <2 x i1> @cttz_knownbits3_vec(<2 x i32> %arg) {
+; CHECK-LABEL: @cttz_knownbits3_vec(
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[ARG:%.*]], <i32 4, i32 4>
+; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[OR]], i1 true)
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq <2 x i32> [[CNT]], <i32 3, i32 3>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %or = or <2 x i32> %arg, <i32 4, i32 4>
+  %cnt = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %or, i1 true) nounwind readnone
+  %res = icmp eq <2 x i32> %cnt, <i32 3, i32 3>
+  ret <2 x i1> %res
+}
+
 define i8 @ctlz(i8 %a) {
 ; CHECK-LABEL: @ctlz(
 ; CHECK-NEXT:    ret i8 2
@@ -329,6 +377,16 @@ define i8 @ctlz(i8 %a) {
   ret i8 %count
 }
 
+define <2 x i8> @ctlz_vec(<2 x i8> %a) {
+; CHECK-LABEL: @ctlz_vec(
+; CHECK-NEXT:    ret <2 x i8> <i8 2, i8 2>
+;
+  %or = or <2 x i8> %a, <i8 32, i8 32>
+  %and = and <2 x i8> %or, <i8 63, i8 63>
+  %count = tail call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %and, i1 true) nounwind readnone
+  ret <2 x i8> %count
+}
+
 define i1 @ctlz_knownbits(i8 %arg) {
 ; CHECK-LABEL: @ctlz_knownbits(
 ; CHECK-NEXT:    ret i1 false
@@ -339,6 +397,16 @@ define i1 @ctlz_knownbits(i8 %arg) {
   ret i1 %res
 }
 
+define <2 x i1> @ctlz_knownbits_vec(<2 x i8> %arg) {
+; CHECK-LABEL: @ctlz_knownbits_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %or = or <2 x i8> %arg, <i8 32, i8 32>
+  %cnt = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %or, i1 true) nounwind readnone
+  %res = icmp eq <2 x i8> %cnt, <i8 4, i8 4>
+  ret <2 x i1> %res
+}
+
 define i1 @ctlz_knownbits2(i8 %arg) {
 ; CHECK-LABEL: @ctlz_knownbits2(
 ; CHECK-NEXT:    [[OR:%.*]] = or i8 [[ARG:%.*]], 32
@@ -352,6 +420,19 @@ define i1 @ctlz_knownbits2(i8 %arg) {
   ret i1 %res
 }
 
+define <2 x i1> @ctlz_knownbits2_vec(<2 x i8> %arg) {
+; CHECK-LABEL: @ctlz_knownbits2_vec(
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i8> [[ARG:%.*]], <i8 32, i8 32>
+; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[OR]], i1 true)
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq <2 x i8> [[CNT]], <i8 2, i8 2>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %or = or <2 x i8> %arg, <i8 32, i8 32>
+  %cnt = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %or, i1 true) nounwind readnone
+  %res = icmp eq <2 x i8> %cnt, <i8 2, i8 2>
+  ret <2 x i1> %res
+}
+
 ; TODO: The icmp is unnecessary given the known bits of the input.
 define i1 @ctlz_knownbits3(i8 %arg) {
 ; CHECK-LABEL: @ctlz_knownbits3(
@@ -366,6 +447,20 @@ define i1 @ctlz_knownbits3(i8 %arg) {
   ret i1 %res
 }
 
+; TODO: The icmp is unnecessary given the known bits of the input.
+define <2 x i1> @ctlz_knownbits3_vec(<2 x i8> %arg) {
+; CHECK-LABEL: @ctlz_knownbits3_vec(
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i8> [[ARG:%.*]], <i8 32, i8 32>
+; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[OR]], i1 true)
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq <2 x i8> [[CNT]], <i8 3, i8 3>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %or = or <2 x i8> %arg, <i8 32, i8 32>
+  %cnt = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %or, i1 true) nounwind readnone
+  %res = icmp eq <2 x i8> %cnt, <i8 3, i8 3>
+  ret <2 x i1> %res
+}
+
 define void @cmp.simplify(i32 %a, i32 %b, i1* %c) {
   %lz = tail call i32 @llvm.ctlz.i32(i32 %a, i1 false) nounwind readnone
   %lz.cmp = icmp eq i32 %lz, 32
@@ -406,7 +501,7 @@ define <2 x i1> @cttz_cmp_vec(<2 x i32> %a) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> %a, zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
-  %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind readnone
+  %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) nounwind readnone
   %cmp = icmp ne <2 x i32> %x, <i32 32, i32 32>
   ret <2 x i1> %cmp
 }
@@ -434,6 +529,14 @@ define i32 @ctlz_undef(i32 %Value) {
   ret i32 %ctlz
 }
 
+define <2 x i32> @ctlz_undef_vec(<2 x i32> %Value) {
+; CHECK-LABEL: @ctlz_undef_vec(
+; CHECK-NEXT:    ret <2 x i32> undef
+;
+  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> zeroinitializer, i1 true)
+  ret <2 x i32> %ctlz
+}
+
 define i32 @ctlz_make_undef(i32 %a) {
   %or = or i32 %a, 8
   %ctlz = tail call i32 @llvm.ctlz.i32(i32 %or, i1 false)
@@ -444,13 +547,31 @@ define i32 @ctlz_make_undef(i32 %a) {
 ; CHECK-NEXT: ret i32 %ctlz
 }
 
+define <2 x i32> @ctlz_make_undef_vec(<2 x i32> %a) {
+; CHECK-LABEL: @ctlz_make_undef_vec(
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[A:%.*]], <i32 8, i32 8>
+; CHECK-NEXT:    [[CTLZ:%.*]] = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[OR]], i1 true)
+; CHECK-NEXT:    ret <2 x i32> [[CTLZ]]
+;
+  %or = or <2 x i32> %a, <i32 8, i32 8>
+  %ctlz = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %or, i1 false)
+  ret <2 x i32> %ctlz
+}
+
 define i32 @cttz_undef(i32 %Value) nounwind {
 ; CHECK-LABEL: @cttz_undef(
 ; CHECK-NEXT:    ret i32 undef
 ;
   %cttz = call i32 @llvm.cttz.i32(i32 0, i1 true)
   ret i32 %cttz
+}
 
+define <2 x i32> @cttz_undef_vec(<2 x i32> %Value) nounwind {
+; CHECK-LABEL: @cttz_undef_vec(
+; CHECK-NEXT:    ret <2 x i32> undef
+;
+  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> zeroinitializer, i1 true)
+  ret <2 x i32> %cttz
 }
 
 define i32 @cttz_make_undef(i32 %a) {
@@ -463,6 +584,17 @@ define i32 @cttz_make_undef(i32 %a) {
 ; CHECK-NEXT: ret i32 %cttz
 }
 
+define <2 x i32> @cttz_make_undef_vec(<2 x i32> %a) {
+; CHECK-LABEL: @cttz_make_undef_vec(
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[A:%.*]], <i32 8, i32 8>
+; CHECK-NEXT:    [[CTTZ:%.*]] = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[OR]], i1 true)
+; CHECK-NEXT:    ret <2 x i32> [[CTTZ]]
+;
+  %or = or <2 x i32> %a, <i32 8, i32 8>
+  %cttz = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %or, i1 false)
+  ret <2 x i32> %cttz
+}
+
 define i32 @ctlz_select(i32 %Value) nounwind {
 ; CHECK-LABEL: @ctlz_select(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 %Value, i1 false)
@@ -472,7 +604,17 @@ define i32 @ctlz_select(i32 %Value) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %Value, i1 true)
   %s = select i1 %tobool, i32 %ctlz, i32 32
   ret i32 %s
+}
 
+define <2 x i32> @ctlz_select_vec(<2 x i32> %Value) nounwind {
+; CHECK-LABEL: @ctlz_select_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VALUE:%.*]], i1 false)
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
+;
+  %tobool = icmp ne <2 x i32> %Value, zeroinitializer
+  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %Value, i1 true)
+  %s = select <2 x i1> %tobool, <2 x i32> %ctlz, <2 x i32> <i32 32, i32 32>
+  ret <2 x i32> %s
 }
 
 define i32 @cttz_select(i32 %Value) nounwind {
@@ -484,7 +626,17 @@ define i32 @cttz_select(i32 %Value) nounwind {
   %cttz = call i32 @llvm.cttz.i32(i32 %Value, i1 true)
   %s = select i1 %tobool, i32 %cttz, i32 32
   ret i32 %s
+}
 
+define <2 x i32> @cttz_select_vec(<2 x i32> %Value) nounwind {
+; CHECK-LABEL: @cttz_select_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[VALUE:%.*]], i1 false)
+; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
+;
+  %tobool = icmp ne <2 x i32> %Value, zeroinitializer
+  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %Value, i1 true)
+  %s = select <2 x i1> %tobool, <2 x i32> %cttz, <2 x i32> <i32 32, i32 32>
+  ret <2 x i32> %s
 }
 
 define i1 @overflow_div_add(i32 %v1, i32 %v2) nounwind {
diff --git a/test/Transforms/InstCombine/lshr.ll b/test/Transforms/InstCombine/lshr.ll
index 0cad7f833ab6..71b25177162b 100644
--- a/test/Transforms/InstCombine/lshr.ll
+++ b/test/Transforms/InstCombine/lshr.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
+target datalayout = "e-m:e-i64:64-n8:16:32:64"
+
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
@@ -100,12 +102,9 @@ define <2 x i8> @lshr_exact_splat_vec(<2 x i8> %x) {
   ret <2 x i8> %lshr
 }
 
-; FIXME: The bool bit got smeared across a wide val, but then we zero'd out those bits. This is just a zext.
-
 define i16 @bool_zext(i1 %x) {
 ; CHECK-LABEL: @bool_zext(
-; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 %x to i16
-; CHECK-NEXT:    [[HIBIT:%.*]] = lshr i16 [[SEXT]], 15
+; CHECK-NEXT:    [[HIBIT:%.*]] = zext i1 %x to i16
 ; CHECK-NEXT:    ret i16 [[HIBIT]]
 ;
   %sext = sext i1 %x to i16
@@ -115,8 +114,7 @@ define i16 @bool_zext(i1 %x) {
 
 define <2 x i8> @bool_zext_splat(<2 x i1> %x) {
 ; CHECK-LABEL: @bool_zext_splat(
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> %x to <2 x i8>
-; CHECK-NEXT:    [[HIBIT:%.*]] = lshr <2 x i8> [[SEXT]], <i8 7, i8 7>
+; CHECK-NEXT:    [[HIBIT:%.*]] = zext <2 x i1> %x to <2 x i8>
 ; CHECK-NEXT:    ret <2 x i8> [[HIBIT]]
 ;
   %sext = sext <2 x i1> %x to <2 x i8>
@@ -148,23 +146,34 @@ define <2 x i8> @smear_sign_and_widen_splat(<2 x i6> %x) {
   ret <2 x i8> %hibit
 }
 
-; FIXME: All of the replicated sign bits are wiped out by the lshr. This could be lshr+zext.
-
-define i16 @fake_sext(i3 %x) {
+define i18 @fake_sext(i3 %x) {
 ; CHECK-LABEL: @fake_sext(
-; CHECK-NEXT:    [[SEXT:%.*]] = sext i3 %x to i16
-; CHECK-NEXT:    [[SH:%.*]] = lshr i16 [[SEXT]], 15
-; CHECK-NEXT:    ret i16 [[SH]]
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i3 %x, 2
+; CHECK-NEXT:    [[SH:%.*]] = zext i3 [[TMP1]] to i18
+; CHECK-NEXT:    ret i18 [[SH]]
 ;
-  %sext = sext i3 %x to i16
-  %sh = lshr i16 %sext, 15
-  ret i16 %sh
+  %sext = sext i3 %x to i18
+  %sh = lshr i18 %sext, 17
+  ret i18 %sh
+}
+
+; Avoid the transform if it would change the shift from a legal to illegal type.
+
+define i32 @fake_sext_but_should_not_change_type(i3 %x) {
+; CHECK-LABEL: @fake_sext_but_should_not_change_type(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i3 %x to i32
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[SEXT]], 31
+; CHECK-NEXT:    ret i32 [[SH]]
+;
+  %sext = sext i3 %x to i32
+  %sh = lshr i32 %sext, 31
+  ret i32 %sh
 }
 
 define <2 x i8> @fake_sext_splat(<2 x i3> %x) {
 ; CHECK-LABEL: @fake_sext_splat(
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i3> %x to <2 x i8>
-; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[SEXT]], <i8 7, i8 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i3> %x, <i3 2, i3 2>
+; CHECK-NEXT:    [[SH:%.*]] = zext <2 x i3> [[TMP1]] to <2 x i8>
 ; CHECK-NEXT:    ret <2 x i8> [[SH]]
 ;
   %sext = sext <2 x i3> %x to <2 x i8>
diff --git a/test/Transforms/InstSimplify/call.ll b/test/Transforms/InstSimplify/call.ll
index 68daac65ee6b..c7d10e251b4a 100644
--- a/test/Transforms/InstSimplify/call.ll
+++ b/test/Transforms/InstSimplify/call.ll
@@ -199,6 +199,16 @@ define i256 @test_cttz() {
   ret i256 %x
 }
 
+declare <2 x i256> @llvm.cttz.v2i256(<2 x i256> %src, i1 %is_zero_undef)
+
+define <2 x i256> @test_cttz_vec() {
+; CHECK-LABEL: @test_cttz_vec(
+; CHECK-NEXT:    ret <2 x i256> <i256 1, i256 1>
+;
+  %x = call <2 x i256> @llvm.cttz.v2i256(<2 x i256> <i256 10, i256 10>, i1 false)
+  ret <2 x i256> %x
+}
+
 declare i256 @llvm.ctpop.i256(i256 %src)
 
 define i256 @test_ctpop() {
@@ -410,3 +420,26 @@ define <8 x i32> @masked_load_undef_mask(<8 x i32>* %V) {
 declare noalias i8* @malloc(i64)
 
 declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
+
+declare double @llvm.powi.f64(double, i32)
+declare <2 x double> @llvm.powi.v2f64(<2 x double>, i32)
+
+define double @constant_fold_powi() nounwind uwtable ssp {
+; CHECK-LABEL: @constant_fold_powi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret double 9.000000e+00
+;
+entry:
+  %0 = call double @llvm.powi.f64(double 3.00000e+00, i32 2)
+  ret double %0
+}
+
+define <2 x double> @constant_fold_powi_vec() nounwind uwtable ssp {
+; CHECK-LABEL: @constant_fold_powi_vec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret <2 x double> <double 9.000000e+00, double 2.500000e+01>
+;
+entry:
+  %0 = call <2 x double> @llvm.powi.v2f64(<2 x double> <double 3.00000e+00, double 5.00000e+00>, i32 2)
+  ret <2 x double> %0
+}
diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index 20ebd36991a5..2fe079019161 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll
@@ -69,7 +69,7 @@ define i1 @gep4() {
 
 define i1 @PR31262() {
 ; CHECK-LABEL: @PR31262(
-; CHECK-NEXT:    ret i1 icmp uge (i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 0, i64 undef), i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 0, i32 0))
+; CHECK-NEXT:    ret i1 icmp uge (i32* getelementptr ([1 x i32], [1 x i32]* @a, i32 0, i32 undef), i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 0, i32 0))
 ;
   %idx = getelementptr inbounds [1 x i32], [1 x i32]* @a, i64 0, i64 undef
   %cmp = icmp uge i32* %idx, getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 0, i32 0)
diff --git a/test/Transforms/InstSimplify/simplify-nested-bitcast.ll b/test/Transforms/InstSimplify/simplify-nested-bitcast.ll
new file mode 100644
index 000000000000..b7ee79415a22
--- /dev/null
+++ b/test/Transforms/InstSimplify/simplify-nested-bitcast.ll
@@ -0,0 +1,54 @@
+; RUN: opt -always-inline -S %s | FileCheck %s
+%0 = type { i64, i64, i8 addrspace(1)*, i8 addrspace(1)* }
+%__aaa_struct = type { { i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(1)* }, %0, [17 x i8], { i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(1)* }, %0, [18 x i8] }
+%struct.__block_descriptor = type { i64, i64 }
+%struct.__block_literal_generic = type { i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(1)* }
+
+@__aaa_struct_ptr = external addrspace(1) global %__aaa_struct
+@__aaa_const_init = constant %__aaa_struct { { i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(1)* } { i8** null, i32 1342177280, i32 0, i8* bitcast (i32 (i8 addrspace(4)*, i32 addrspace(1)*)* @bl0_block_invoke to i8*), %struct.__block_descriptor addrspace(1)* bitcast (%0 addrspace(1)* getelementptr inbounds (%__aaa_struct, %__aaa_struct addrspace(1)* @__aaa_struct_ptr, i32 0, i32 1) to %struct.__block_descriptor addrspace(1)*) }, %0 { i64 0, i64 32, i8 addrspace(1)* getelementptr inbounds (%__aaa_struct, %__aaa_struct addrspace(1)* @__aaa_struct_ptr, i32 0, i32 2, i32 0), i8 addrspace(1)* null }, [17 x i8] c"bl0_block_invoke\00", { i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(1)* } { i8** null, i32 1342177280, i32 0, i8* bitcast (i32 (i8 addrspace(4)*, i32 addrspace(1)*)* @__f1_block_invoke to i8*), %struct.__block_descriptor addrspace(1)* bitcast (%0 addrspace(1)* getelementptr inbounds (%__aaa_struct, %__aaa_struct addrspace(1)* @__aaa_struct_ptr, i32 0, i32 4) to %struct.__block_descriptor addrspace(1)*) }, %0 { i64 0, i64 32, i8 addrspace(1)* getelementptr inbounds (%__aaa_struct, %__aaa_struct addrspace(1)* @__aaa_struct_ptr, i32 0, i32 5, i32 0), i8 addrspace(1)* null }, [18 x i8] c"__f1_block_invoke\00" }
+
+; Function Attrs: alwaysinline norecurse nounwind readonly
+define i32 @bl0_block_invoke(i8 addrspace(4)* nocapture readnone, i32 addrspace(1)* nocapture readonly) #0 {
+entry:
+  %2 = load i32, i32 addrspace(1)* %1, align 4
+  %mul = shl nsw i32 %2, 1
+  ret i32 %mul
+}
+
+; Function Attrs: alwaysinline nounwind
+define i32 @f0(i32 addrspace(1)*, i32 (i32 addrspace(1)*) addrspace(4)*) #1 {
+entry:
+  %block.literal = bitcast i32 (i32 addrspace(1)*) addrspace(4)* %1 to %struct.__block_literal_generic addrspace(4)*
+  %2 = getelementptr inbounds %struct.__block_literal_generic, %struct.__block_literal_generic addrspace(4)* %block.literal, i64 0, i32 3
+  %3 = bitcast i32 (i32 addrspace(1)*) addrspace(4)* %1 to i8 addrspace(4)*
+  %4 = bitcast i8* addrspace(4)* %2 to i32 (i8 addrspace(4)*, i32 addrspace(1)*)* addrspace(4)*
+  %5 = load i32 (i8 addrspace(4)*, i32 addrspace(1)*)*, i32 (i8 addrspace(4)*, i32 addrspace(1)*)* addrspace(4)* %4, align 8
+  %call = tail call i32 %5(i8 addrspace(4)* %3, i32 addrspace(1)* %0) #2
+  ret i32 %call
+}
+
+; CHECK-LABEL: define void @f1
+; CHECK: %1 = load i32 (i8 addrspace(4)*, i32 addrspace(1)*)*, i32 (i8 addrspace(4)*, i32 addrspace(1)*)* addrspace(4)* bitcast (i8* addrspace(4)* getelementptr (%__aaa_struct, %__aaa_struct addrspace(4)* addrspacecast (%__aaa_struct addrspace(1)* @__aaa_struct_ptr to %__aaa_struct addrspace(4)*), i64 0, i32 0, i32 3) to i32 (i8 addrspace(4)*, i32 addrspace(1)*)* addrspace(4)*), align 8
+
+; Function Attrs: alwaysinline nounwind
+define void @f1(i32 addrspace(1)*) #1 {
+entry:
+  %call = tail call i32 @f0(i32 addrspace(1)* %0, i32 (i32 addrspace(1)*) addrspace(4)* addrspacecast (i32 (i32 addrspace(1)*) addrspace(1)* bitcast (%__aaa_struct addrspace(1)* @__aaa_struct_ptr to i32 (i32 addrspace(1)*) addrspace(1)*) to i32 (i32 addrspace(1)*) addrspace(4)*)) #3
+  store i32 %call, i32 addrspace(1)* %0, align 4
+  %call1 = tail call i32 @f0(i32 addrspace(1)* %0, i32 (i32 addrspace(1)*) addrspace(4)* addrspacecast (i32 (i32 addrspace(1)*) addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(1)* } addrspace(1)* getelementptr inbounds (%__aaa_struct, %__aaa_struct addrspace(1)* @__aaa_struct_ptr, i32 0, i32 3) to i32 (i32 addrspace(1)*) addrspace(1)*) to i32 (i32 addrspace(1)*) addrspace(4)*)) #3
+  store i32 %call1, i32 addrspace(1)* %0, align 4
+  ret void
+}
+
+; Function Attrs: alwaysinline norecurse nounwind readonly
+define i32 @__f1_block_invoke(i8 addrspace(4)* nocapture readnone, i32 addrspace(1)* nocapture readonly) #0 {
+entry:
+  %2 = load i32, i32 addrspace(1)* %1, align 4
+  %add = add nsw i32 %2, 1
+  ret i32 %add
+}
+
+attributes #0 = { alwaysinline norecurse nounwind readonly }
+attributes #1 = { alwaysinline nounwind }
+attributes #2 = { nobuiltin nounwind }
+attributes #3 = { nobuiltin }
diff --git a/test/Transforms/InstSimplify/vector_gep.ll b/test/Transforms/InstSimplify/vector_gep.ll
index b8e61a05cc0c..cdf4732d4b5e 100644
--- a/test/Transforms/InstSimplify/vector_gep.ll
+++ b/test/Transforms/InstSimplify/vector_gep.ll
@@ -51,7 +51,7 @@ define <4 x i8*> @test5() {
   ret <4 x i8*> %gep
 
 ; CHECK-LABEL: @test5
-; CHECK-NEXT: ret <4 x i8*> getelementptr (i8, <4 x i8*> <i8* inttoptr (i64 1 to i8*), i8* inttoptr (i64 2 to i8*), i8* inttoptr (i64 3 to i8*), i8* inttoptr (i64 4 to i8*)>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+; CHECK-NEXT: ret <4 x i8*> getelementptr (i8, <4 x i8*> <i8* inttoptr (i64 1 to i8*), i8* inttoptr (i64 2 to i8*), i8* inttoptr (i64 3 to i8*), i8* inttoptr (i64 4 to i8*)>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>)
 }
 
 @v = global [24 x [42 x [3 x i32]]] zeroinitializer, align 16
diff --git a/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll b/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
index bf2009e28a7d..1f444b3748a5 100644
--- a/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
+++ b/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-pc-linux  -mattr=+avx -interleaved-access -S | FileCheck %s
 
-; This file tests the function `llvm::lowerInterleavedLoad`.
+; This file tests the function `llvm::lowerInterleavedLoad/Store`.
 
 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 ; CHECK-LABEL: @load_factorf64_4(
@@ -102,4 +102,63 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
   ret <4 x double> %mul
 }
 
+define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
+; CHECK-LABEL: @store_factorf64_4(
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    store <16 x double> [[INTERLEAVED_VEC]], <16 x double>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
+  ret void
+}
+
+define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) {
+; CHECK-LABEL: @store_factori64_4(
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i64> [[V2:%.*]], <4 x i64> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], <16 x i64>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 16
+  ret void
+}
+
+define void @store_factorf64_4_revMask(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
+; CHECK-LABEL: @store_factorf64_4_revMask(
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <16 x i32> <i32 12, i32 8, i32 4, i32 0, i32 13, i32 9, i32 5, i32 1, i32 14, i32 10, i32 6, i32 2, i32 15, i32 11, i32 7, i32 3>
+; CHECK-NEXT:    store <16 x double> [[INTERLEAVED_VEC]], <16 x double>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 12, i32 8, i32 4, i32 0, i32 13, i32 9, i32 5, i32 1, i32 14, i32 10, i32 6, i32 2, i32 15, i32 11, i32 7, i32 3>
+  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
+  ret void
+}
+
+define void @store_factorf64_4_arbitraryMask(<16 x double>* %ptr, <16 x double> %v0, <16 x double> %v1, <16 x double> %v2, <16 x double> %v3) {
+; CHECK-LABEL: @store_factorf64_4_arbitraryMask(
+; CHECK-NEXT:    [[S0:%.*]] = shufflevector <16 x double> [[V0:%.*]], <16 x double> [[V1:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x double> [[V2:%.*]], <16 x double> [[V3:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x double> [[S0]], <32 x double> [[S1]], <16 x i32> <i32 4, i32 32, i32 16, i32 8, i32 5, i32 33, i32 17, i32 9, i32 6, i32 34, i32 18, i32 10, i32 7, i32 35, i32 19, i32 11>
+; CHECK-NEXT:    store <16 x double> [[INTERLEAVED_VEC]], <16 x double>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s0 = shufflevector <16 x double> %v0, <16 x double> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s1 = shufflevector <16 x double> %v2, <16 x double> %v3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %interleaved.vec = shufflevector <32 x double> %s0, <32 x double> %s1, <16 x i32> <i32 4, i32 32, i32 16, i32 8, i32 5, i32 33, i32 17, i32 9, i32 6, i32 34, i32 18, i32 10, i32 7, i32 35, i32 19, i32 11>
+  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
+  ret void
+}
 
diff --git a/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll b/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll
new file mode 100644
index 000000000000..ec93847178b5
--- /dev/null
+++ b/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll
@@ -0,0 +1,452 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; memcpy.atomic formation (atomic load & store)
+define void @test1(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test1(
+; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 unordered, align 1
+  store atomic i8 %V, i8* %DestI unordered, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation (atomic store, normal load)
+define void @test2(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2(
+; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load i8, i8* %I.0.014, align 1
+  store atomic i8 %V, i8* %DestI unordered, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic store, normal load w/ no align)
+define void @test2b(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2b(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load i8, i8* %I.0.014
+  store atomic i8 %V, i8* %DestI unordered, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic store, normal load w/ bad align)
+define void @test2c(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2c(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load i32, i32* %I.0.014, align 2
+  store atomic i32 %V, i32* %DestI unordered, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic store w/ bad align, normal load)
+define void @test2d(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2d(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load i32, i32* %I.0.014, align 4
+  store atomic i32 %V, i32* %DestI unordered, align 2
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;; memcpy.atomic formation (normal store, atomic load)
+define void @test3(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test3(
+; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 unordered, align 1
+  store i8 %V, i8* %DestI, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (normal store w/ no align, atomic load)
+define void @test3b(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test3b(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 unordered, align 1
+  store i8 %V, i8* %DestI
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (normal store, atomic load w/ bad align)
+define void @test3c(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test3c(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load atomic i32, i32* %I.0.014 unordered, align 2
+  store i32 %V, i32* %DestI, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (normal store w/ bad align, atomic load)
+define void @test3d(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test3d(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load atomic i32, i32* %I.0.014 unordered, align 4
+  store i32 %V, i32* %DestI, align 2
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+;; memcpy.atomic formation rejection (atomic load, ordered-atomic store)
+define void @test4(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test4(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 unordered, align 1
+  store atomic i8 %V, i8* %DestI monotonic, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (ordered-atomic load, unordered-atomic store)
+define void @test5(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test5(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 monotonic, align 1
+  store atomic i8 %V, i8* %DestI unordered, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation (atomic load & store) -- element size 2
+define void @test6(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test6(
+; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 2 %Dest{{[0-9]*}}, i8* align 2 %Base{{[0-9]*}}, i64 %Size, i32 2)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i16, i32 10000
+  %Dest = alloca i16, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i16, i16* %Base, i64 %indvar
+  %DestI = getelementptr i16, i16* %Dest, i64 %indvar
+  %V = load atomic i16, i16* %I.0.014 unordered, align 2
+  store atomic i16 %V, i16* %DestI unordered, align 2
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation (atomic load & store) -- element size 4
+define void @test7(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test7(
+; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dest{{[0-9]*}}, i8* align 4 %Base{{[0-9]*}}, i64 %Size, i32 4)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load atomic i32, i32* %I.0.014 unordered, align 4
+  store atomic i32 %V, i32* %DestI unordered, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation (atomic load & store) -- element size 8
+define void @test8(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test8(
+; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 8 %Dest{{[0-9]*}}, i8* align 8 %Base{{[0-9]*}}, i64 %Size, i32 8)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i64, i32 10000
+  %Dest = alloca i64, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i64, i64* %Base, i64 %indvar
+  %DestI = getelementptr i64, i64* %Dest, i64 %indvar
+  %V = load atomic i64, i64* %I.0.014 unordered, align 8
+  store atomic i64 %V, i64* %DestI unordered, align 8
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic load & store) -- element size 16
+define void @test9(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test9(
+; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dest{{[0-9]*}}, i8* align 16 %Base{{[0-9]*}}, i64 %Size, i32 16)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i128, i32 10000
+  %Dest = alloca i128, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i128, i128* %Base, i64 %indvar
+  %DestI = getelementptr i128, i128* %Dest, i64 %indvar
+  %V = load atomic i128, i128* %I.0.014 unordered, align 16
+  store atomic i128 %V, i128* %DestI unordered, align 16
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic load & store) -- element size 32
+define void @test10(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test10(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i256, i32 10000
+  %Dest = alloca i256, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i256, i256* %Base, i64 %indvar
+  %DestI = getelementptr i256, i256* %Dest, i64 %indvar
+  %V = load atomic i256, i256* %I.0.014 unordered, align 32
+  store atomic i256 %V, i256* %DestI unordered, align 32
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+
+; Make sure that atomic memset doesn't get recognized by mistake
+define void @test_nomemset(i8* %Base, i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test_nomemset(
+; CHECK-NOT: call void @llvm.memset
+; CHECK: store
+; CHECK: ret void
+bb.nph:                                           ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  store atomic i8 0, i8* %I.0.014 unordered, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Verify that unordered memset_pattern isn't recognized.
+; This is a replica of test11_pattern from basic.ll
+define void @test_nomemset_pattern(i32* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test_nomemset_pattern(
+; CHECK-NEXT: entry:
+; CHECK-NOT: bitcast
+; CHECK-NOT: memset_pattern
+; CHECK: store atomic
+; CHECK: ret void
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i32, i32* %P, i64 %indvar
+  store atomic i32 1, i32* %arrayidx unordered, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll b/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll
new file mode 100644
index 000000000000..b2528f1c2457
--- /dev/null
+++ b/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll
@@ -0,0 +1,28 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+;; memcpy.atomic formation (atomic load & store) -- element size 2
+;;  Will not create call due to a max element size of 0
+define void @test1(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test1(
+; CHECK-NOT: call void @llvm.memcpy.element.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i16, i32 10000
+  %Dest = alloca i16, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i16, i16* %Base, i64 %indvar
+  %DestI = getelementptr i16, i16* %Dest, i64 %indvar
+  %V = load atomic i16, i16* %I.0.014 unordered, align 2
+  store atomic i16 %V, i16* %DestI unordered, align 2
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/X86/canonical.ll b/test/Transforms/LoopStrengthReduce/X86/canonical.ll
index 2dafbb408aad..6b6acb868745 100644
--- a/test/Transforms/LoopStrengthReduce/X86/canonical.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/canonical.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-reduce -S < %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-reduce -lsr-insns-cost=false -S < %s | FileCheck %s
 ; Check LSR formula canonicalization will put loop invariant regs before
 ; induction variable of current loop, so exprs involving loop invariant regs
 ; can be promoted outside of current loop.
diff --git a/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index fb63b66137f3..7c01432914ff 100644
--- a/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -163,7 +163,7 @@ for.end:                                          ; preds = %for.body, %entry
 ; X64: movzbl -3(
 ;
 ; X32: foldedidx:
-; X32: movzbl -3(
+; X32: movzbl 400(
 define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
 entry:
   br label %for.body
@@ -275,7 +275,7 @@ exit:
 ;
 ; X32: @testCmpZero
 ; X32: %for.body82.us
-; X32: dec
+; X32: cmp
 ; X32: jne
 define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
 entry:
diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll
index a7731bfcec56..deca954fea78 100644
--- a/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll
@@ -1,4 +1,4 @@
-; REQUIRES: x86
+; REQUIRES: x86-registered-target
 ; RUN: opt -loop-reduce -S < %s | FileCheck %s
 
 ; Strength reduction analysis here relies on IV Users analysis, that
@@ -22,16 +22,16 @@ target triple = "x86_64-apple-macosx"
 ; CHECK-LABEL: @test2
 ; CHECK-LABEL: test2.loop:
 ; CHECK:  %lsr.iv1 = phi i32 [ %lsr.iv.next2, %test2.loop ], [ -16777216, %entry ]
-; CHECK:  %lsr.iv = phi i32 [ %lsr.iv.next, %test2.loop ], [ -1, %entry ]
-; CHECK:  %lsr.iv.next = add nsw i32 %lsr.iv, 1
+; CHECK:  %lsr.iv = phi i32 [ %lsr.iv.next, %test2.loop ], [ 1, %entry ]
+; CHECK:  %lsr.iv.next = add nsw i32 %lsr.iv, -1
 ; CHECK:  %lsr.iv.next2 = add nsw i32 %lsr.iv1, 16777216
 ;
 ; CHECK-LABEL: for.end:
-; CHECK:  %tobool.us = icmp eq i32 %lsr.iv.next2, 0
+; CHECK:  %tobool.us = icmp eq i32 %lsr.iv.next, 0
 ; CHECK:  %sub.us = select i1 %tobool.us, i32 0, i32 0
-; CHECK:  %1 = sub i32 0, %sub.us
-; CHECK:  %2 = add i32 %1, %lsr.iv.next
-; CHECK:  %sext.us = mul i32 %lsr.iv.next2, %2
+; CHECK:  %0 = sub i32 0, %sub.us
+; CHECK:  %1 = sub i32 %0, %lsr.iv.next
+; CHECK:  %sext.us = mul i32 %lsr.iv.next2, %1
 ; CHECK:  %f = ashr i32 %sext.us, 24
 ; CHECK: ret i32 %f
 define i32 @test2() {
diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
index 4888536bdf81..7f163500a737 100644
--- a/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
-; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: opt < %s -loop-reduce -mtriple=x86_64  -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost=false -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
 ; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
 
 ; OPT test checks that LSR optimize compare for static counter to compare with 0.
diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
index 3273cb4e6b5b..239cc0233506 100644
--- a/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
-; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost=false -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
 ; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
 
 ; OPT checks that LSR prefers less instructions to less registers.
diff --git a/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll b/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
index b563eb3ad994..e05d5aa3027b 100644
--- a/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-reduce -S < %s | FileCheck %s
 ; Check when we use an outerloop induction variable inside of an innerloop
 ; induction value expr, LSR can still choose to use single induction variable
@@ -22,18 +23,21 @@ for.body:                                         ; preds = %for.inc, %entry
 for.body2.preheader:                              ; preds = %for.body
   br label %for.body2
 
-; Check LSR only generates one induction variable for for.body2 and the induction
-; variable will be shared by multiple array accesses.
+; Check LSR only generates two induction variables for for.body2 one for compare and
+; one to shared by multiple array accesses.
 ; CHECK: for.body2:
-; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ %lsr.iv.next, %for.body2 ], [ 0, %for.body2.preheader ]
+; CHECK-NEXT: [[LSRAR:%[^,]+]] = phi i8* [ %scevgep, %for.body2 ], [ %maxarray, %for.body2.preheader ]
+; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ %lsr.iv.next, %for.body2 ], [ %0, %for.body2.preheader ]
 ; CHECK-NOT:  = phi i64 [ {{.*}}, %for.body2 ], [ {{.*}}, %for.body2.preheader ]
-; CHECK:      [[SCEVGEP1:%[^,]+]] = getelementptr i8, i8* %maxarray, i64 [[LSR]]
-; CHECK:      [[SCEVGEP2:%[^,]+]] = getelementptr i8, i8* [[SCEVGEP1]], i64 1
+; CHECK:      [[LSRINT:%[^,]+]] = ptrtoint i8* [[LSRAR]] to i64
+; CHECK:      [[SCEVGEP1:%[^,]+]] = getelementptr i8, i8* [[LSRAR]], i64 1
+; CHECK:      {{.*}} = load i8, i8* [[SCEVGEP1]], align 1
+; CHECK:      [[SCEVGEP2:%[^,]+]] = getelementptr i8, i8* %1, i64 [[LSRINT]]
 ; CHECK:      {{.*}} = load i8, i8* [[SCEVGEP2]], align 1
-; CHECK:      [[SCEVGEP3:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]]
-; CHECK:      {{.*}} = load i8, i8* [[SCEVGEP3]], align 1
-; CHECK:      [[SCEVGEP4:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]]
-; CHECK:      store i8 {{.*}}, i8* [[SCEVGEP4]], align 1
+; CHECK:      [[SCEVGEP3:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSRINT]]
+; CHECK:      store i8 {{.*}}, i8* [[SCEVGEP3]], align 1
+; CHECK:      [[LSRNEXT:%[^,]+]] = add i64 [[LSR]], -1
+; CHECK:      %exitcond = icmp ne i64 [[LSRNEXT]], 0
 ; CHECK:      br i1 %exitcond, label %for.body2, label %for.inc.loopexit
 
 for.body2:                                        ; preds = %for.body2.preheader, %for.body2
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index d06e3fdba39c..1149afe7b9f4 100644
--- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -5,7 +5,7 @@ target triple = "aarch64"
 
 ; CHECK-LABEL: @add_a(
 ; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i8>
+; CHECK: add <16 x i8>
 ; CHECK: store <16 x i8>
 ; Function Attrs: nounwind
 define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
@@ -31,9 +31,37 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
+; working with.
+; CHECK-LABEL: @add_a1(
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %add = add nuw nsw i8 %0, 2
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %add, i8* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
 ; CHECK-LABEL: @add_b(
 ; CHECK: load <8 x i16>, <8 x i16>*
-; CHECK: add nuw nsw <8 x i16>
+; CHECK: add <8 x i16>
 ; CHECK: store <8 x i16>
 ; Function Attrs: nounwind
 define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
@@ -61,7 +89,7 @@ for.body:                                         ; preds = %entry, %for.body
 
 ; CHECK-LABEL: @add_c(
 ; CHECK: load <8 x i8>, <8 x i8>*
-; CHECK: add nuw nsw <8 x i16>
+; CHECK: add <8 x i16>
 ; CHECK: store <8 x i16>
 ; Function Attrs: nounwind
 define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
@@ -116,12 +144,12 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-LABEL: @add_e(
 ; CHECK: load <16 x i8>
 ; CHECK: shl <16 x i8>
-; CHECK: add nuw nsw <16 x i8>
+; CHECK: add <16 x i8>
 ; CHECK: or <16 x i8>
-; CHECK: mul nuw nsw <16 x i8>
+; CHECK: mul <16 x i8>
 ; CHECK: and <16 x i8>
 ; CHECK: xor <16 x i8>
-; CHECK: mul nuw nsw <16 x i8>
+; CHECK: mul <16 x i8>
 ; CHECK: store <16 x i8>
 define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 entry:
@@ -162,12 +190,12 @@ for.body:                                         ; preds = %for.body, %for.body
 ; CHECK: load <8 x i16>
 ; CHECK: trunc <8 x i16>
 ; CHECK: shl <8 x i8>
-; CHECK: add nsw <8 x i8>
+; CHECK: add <8 x i8>
 ; CHECK: or <8 x i8>
-; CHECK: mul nuw nsw <8 x i8>
+; CHECK: mul <8 x i8>
 ; CHECK: and <8 x i8>
 ; CHECK: xor <8 x i8>
-; CHECK: mul nuw nsw <8 x i8>
+; CHECK: mul <8 x i8>
 ; CHECK: store <8 x i8>
 define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 entry:
diff --git a/test/Transforms/LowerExpectIntrinsic/PR33346.ll b/test/Transforms/LowerExpectIntrinsic/PR33346.ll
new file mode 100644
index 000000000000..ca962fbdc8f3
--- /dev/null
+++ b/test/Transforms/LowerExpectIntrinsic/PR33346.ll
@@ -0,0 +1,22 @@
+; RUN: opt -lower-expect -S < %s
+; RUN: opt -passes='function(lower-expect)' -S < %s
+
+define i64 @foo(i64 %arg) #0 {
+bb:
+  %tmp = alloca i64, align 8
+  store i64 %arg, i64* %tmp, align 8
+  %tmp1 = load i64, i64* %tmp, align 8
+  %tmp2 = load i64, i64* %tmp, align 8
+  %tmp3 = call i64 @llvm.expect.i64(i64 %tmp1, i64 %tmp2)
+  ret i64 %tmp3
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.expect.i64(i64, i64)
+
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 304723)"}
diff --git a/test/Transforms/LowerTypeTests/simple.ll b/test/Transforms/LowerTypeTests/simple.ll
index cedfcb4a63a0..aae17c05d606 100644
--- a/test/Transforms/LowerTypeTests/simple.ll
+++ b/test/Transforms/LowerTypeTests/simple.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -lowertypetests < %s | FileCheck %s
-; RUN: opt -S -lowertypetests -mtriple=x86_64-apple-macosx10.8.0 < %s | FileCheck -check-prefix=CHECK-DARWIN %s
+; RUN: opt -S -lowertypetests -mtriple=x86_64-apple-macosx10.8.0 < %s | FileCheck %s
 ; RUN: opt -S -O3 < %s | FileCheck -check-prefix=CHECK-NODISCARD %s
 
 target datalayout = "e-p:32:32"
@@ -39,20 +39,6 @@ target datalayout = "e-p:32:32"
 ; CHECK: @c = protected alias i32, getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }, { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 4)
 ; CHECK: @d = alias [2 x i32], getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }, { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 6)
 
-; CHECK-DARWIN: @aptr = constant i32* getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }, { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G:@[^ ]*]], i32 0, i32 0)
-@aptr = constant i32* @a
-
-; CHECK-DARWIN: @bptr = constant [63 x i32]* getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }, { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 2)
-@bptr = constant [63 x i32]* @b
-
-; CHECK-DARWIN: @cptr = constant i32* getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }, { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 4)
-@cptr = constant i32* @c
-
-; CHECK-DARWIN: @dptr = constant [2 x i32]* getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }, { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 6)
-@dptr = constant [2 x i32]* @d
-
-; CHECK-DARWIN: [[G]] = private constant
-
 ; CHECK: @bits{{[0-9]*}} = private alias i8, getelementptr inbounds ([68 x i8], [68 x i8]* [[BA]], i32 0, i32 0)
 ; CHECK: @bits.{{[0-9]*}} = private alias i8, getelementptr inbounds ([68 x i8], [68 x i8]* [[BA]], i32 0, i32 0)
 
diff --git a/test/Transforms/LowerTypeTests/simplify.ll b/test/Transforms/LowerTypeTests/simplify.ll
new file mode 100644
index 000000000000..cb5ad4a10bfb
--- /dev/null
+++ b/test/Transforms/LowerTypeTests/simplify.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%S/Inputs/import.yaml < %s | FileCheck %s
+
+target datalayout = "e-p:64:64"
+
+declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone
+
+; CHECK: define i1 @bytearray7(i8* [[p:%.*]])
+define i1 @bytearray7(i8* %p) {
+  ; CHECK-NEXT: [[pi:%.*]] = ptrtoint i8* [[p]] to i64
+  ; CHECK-NEXT: [[sub:%.*]] = sub i64 [[pi]], ptrtoint (i8* @__typeid_bytearray7_global_addr to i64)
+  ; CHECK-NEXT: [[lshr:%.*]] = lshr i64 [[sub]], zext (i8 ptrtoint (i8* @__typeid_bytearray7_align to i8) to i64)
+  ; CHECK-NEXT: [[shl:%.*]] = shl i64 [[sub]], zext (i8 sub (i8 64, i8 ptrtoint (i8* @__typeid_bytearray7_align to i8)) to i64)
+  ; CHECK-NEXT: [[or:%.*]] = or i64 [[lshr]], [[shl]]
+  ; CHECK-NEXT: [[ule:%.*]] = icmp ule i64 [[or]], ptrtoint (i8* @__typeid_bytearray7_size_m1 to i64)
+  ; CHECK-NEXT: br i1 [[ule]], label %[[t1:.*]], label %[[f:.*]]
+
+  ; CHECK: [[t1]]:
+  ; CHECK-NEXT: [[gep:%.*]] = getelementptr i8, i8* @__typeid_bytearray7_byte_array, i64 [[or]]
+  ; CHECK-NEXT: [[load:%.*]] = load i8, i8* [[gep]]
+  ; CHECK-NEXT: [[and:%.*]] = and i8 [[load]], ptrtoint (i8* @__typeid_bytearray7_bit_mask to i8)
+  ; CHECK-NEXT: [[ne:%.*]] = icmp ne i8 [[and]], 0
+  ; CHECK-NEXT: br i1 [[ne]], label %[[t:.*]], label %[[f:.*]]
+
+  ; CHECK: [[t]]:
+  ; CHECK-NEXT: ret i1 true
+
+  ; CHECK: [[f]]:
+  ; CHECK-NEXT: ret i1 false
+  %x = call i1 @llvm.type.test(i8* %p, metadata !"bytearray7")
+  br i1 %x, label %t, label %f
+
+t:
+  ret i1 true
+
+f:
+  ret i1 false
+}
diff --git a/test/Transforms/NewGVN/completeness.ll b/test/Transforms/NewGVN/completeness.ll
index 2b28f12df9d1..1798bfea5fe0 100644
--- a/test/Transforms/NewGVN/completeness.ll
+++ b/test/Transforms/NewGVN/completeness.ll
@@ -395,7 +395,7 @@ define void @test10() {
 ; CHECK:       g:
 ; CHECK-NEXT:    [[N:%.*]] = phi i32* [ [[H:%.*]], [[I:%.*]] ], [ null, [[B:%.*]] ]
 ; CHECK-NEXT:    [[H]] = getelementptr i32, i32* [[N]], i64 1
-; CHECK-NEXT:    [[J:%.*]] = icmp eq i32* [[H]], getelementptr (i32, i32* null, i64 8)
+; CHECK-NEXT:    [[J:%.*]] = icmp eq i32* [[H]], inttoptr (i64 32 to i32*)
 ; CHECK-NEXT:    br i1 [[J]], label [[C:%.*]], label [[I]]
 ; CHECK:       i:
 ; CHECK-NEXT:    br i1 undef, label [[K:%.*]], label [[G]]
diff --git a/test/Transforms/NewGVN/loadforward.ll b/test/Transforms/NewGVN/loadforward.ll
index d66b5332601f..b4cbcc6b0f4d 100644
--- a/test/Transforms/NewGVN/loadforward.ll
+++ b/test/Transforms/NewGVN/loadforward.ll
@@ -9,8 +9,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ;; Test that we forward the first store to the second load
 define i16 @bazinga() {
 ; CHECK-LABEL: @bazinga(
-; CHECK-NEXT:    [[_TMP10:%.*]] = load i16, i16* getelementptr inbounds (%rec11, %rec11* @str, i16 0, i32 1)
-; CHECK-NEXT:    store i16 [[_TMP10]], i16* getelementptr inbounds (%rec11, %rec11* @str, i16 0, i32 0)
+; CHECK-NEXT:    [[_TMP10:%.*]] = load i16, i16* getelementptr inbounds (%rec11, %rec11* @str, i64 0, i32 1)
+; CHECK-NEXT:    store i16 [[_TMP10]], i16* getelementptr inbounds (%rec11, %rec11* @str, i64 0, i32 0)
 ; CHECK-NEXT:    [[_TMP15:%.*]] = icmp eq i16 [[_TMP10]], 3
 ; CHECK-NEXT:    [[_TMP16:%.*]] = select i1 [[_TMP15]], i16 1, i16 0
 ; CHECK-NEXT:    br label [[BB1:%.*]]
diff --git a/test/Transforms/NewGVN/pr32403.ll b/test/Transforms/NewGVN/pr32403.ll
index 505d31a9463e..2552e0e66ab9 100644
--- a/test/Transforms/NewGVN/pr32403.ll
+++ b/test/Transforms/NewGVN/pr32403.ll
@@ -17,8 +17,7 @@ define void @reorder_ref_pic_list() local_unnamed_addr {
 ; CHECK-NEXT:    [[INC_I:%.*]] = add nsw i32 [[REFIDXLX_0]], 1
 ; CHECK-NEXT:    br label [[FOR_BODY8_I:%.*]]
 ; CHECK:       for.body8.i:
-; CHECK-NEXT:    [[NIDX_052_I:%.*]] = phi i32 [ [[INC_I]], [[IF_THEN13]] ], [ [[NIDX_052_I]], [[FOR_INC24_I:%.*]] ]
-; CHECK-NEXT:    br i1 undef, label [[FOR_INC24_I]], label [[IF_THEN17_I:%.*]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_INC24_I:%.*]], label [[IF_THEN17_I:%.*]]
 ; CHECK:       if.then17.i:
 ; CHECK-NEXT:    br label [[FOR_INC24_I]]
 ; CHECK:       for.inc24.i:
diff --git a/test/Transforms/NewGVN/pr32897.ll b/test/Transforms/NewGVN/pr32897.ll
index eb19aa367b72..dcf2af30b239 100644
--- a/test/Transforms/NewGVN/pr32897.ll
+++ b/test/Transforms/NewGVN/pr32897.ll
@@ -7,7 +7,6 @@ define void @tinkywinky(i64* %b) {
 ; CHECK-NEXT:    br label [[BODY:%.*]]
 ; CHECK:       body:
 ; CHECK-NEXT:    store i64 undef, i64* [[B:%.*]]
-; CHECK-NEXT:    [[B2:%.*]] = load i64, i64* [[B]]
 ; CHECK-NEXT:    br i1 undef, label [[BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    br label [[BODY]]
diff --git a/test/Transforms/NewGVN/pr33187.ll b/test/Transforms/NewGVN/pr33187.ll
new file mode 100644
index 000000000000..61e767d36569
--- /dev/null
+++ b/test/Transforms/NewGVN/pr33187.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+;; Ensure we don't change after value numbering by accidentally deleting the wrong expression.
+; RUN: opt -newgvn -S %s | FileCheck %s
+define void @fn1() local_unnamed_addr #0 {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND_PREHEADER:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    br label [[FOR_COND_PREHEADER]]
+; CHECK:       for.cond.preheader:
+; CHECK-NEXT:    [[H_031:%.*]] = phi i32 [ 5, [[ENTRY:%.*]] ], [ [[H_127:%.*]], [[WHILE_COND:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[H_128:%.*]] = phi i32 [ [[H_031]], [[FOR_COND_PREHEADER]] ], [ [[H_2:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br i1 false, label [[L_LOOPEXIT:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    br i1 undef, label [[FOR_INC]], label [[IF_END9:%.*]]
+; CHECK:       if.end9:
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[H_2]] = phi i32 [ [[H_128]], [[IF_END]] ], [ 0, [[IF_END9]] ]
+; CHECK-NEXT:    br i1 undef, label [[WHILE_COND10_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       while.cond10.loopexit:
+; CHECK-NEXT:    br label [[WHILE_COND10:%.*]]
+; CHECK:       while.cond10:
+; CHECK-NEXT:    [[H_127]] = phi i32 [ [[H_126:%.*]], [[IF_END18:%.*]] ], [ [[H_125:%.*]], [[L:%.*]] ], [ [[H_2]], [[WHILE_COND10_LOOPEXIT]] ]
+; CHECK-NEXT:    br i1 undef, label [[WHILE_COND]], label [[WHILE_BODY12:%.*]]
+; CHECK:       while.body12:
+; CHECK-NEXT:    br i1 undef, label [[IF_END18]], label [[L]]
+; CHECK:       L.loopexit:
+; CHECK-NEXT:    store i8 undef, i8* null
+; CHECK-NEXT:    br label [[L]]
+; CHECK:       L:
+; CHECK-NEXT:    [[H_125]] = phi i32 [ [[H_127]], [[WHILE_BODY12]] ], [ undef, [[L_LOOPEXIT]] ]
+; CHECK-NEXT:    br i1 undef, label [[WHILE_COND10]], label [[IF_END18]]
+; CHECK:       if.end18:
+; CHECK-NEXT:    [[H_126]] = phi i32 [ [[H_125]], [[L]] ], [ [[H_127]], [[WHILE_BODY12]] ]
+; CHECK-NEXT:    br label [[WHILE_COND10]]
+;
+entry:
+  br label %for.cond.preheader
+
+while.cond:                                       ; preds = %while.cond10
+  br label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %while.cond, %entry
+  %h.031 = phi i32 [ 5, %entry ], [ %h.127, %while.cond ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.cond.preheader
+  %h.128 = phi i32 [ %h.031, %for.cond.preheader ], [ %h.2, %for.inc ]
+  br label %if.then
+
+if.then:                                          ; preds = %for.body
+  br i1 false, label %L.loopexit, label %if.end
+
+if.end:                                           ; preds = %if.then
+  br i1 undef, label %for.inc, label %if.end9
+
+if.end9:                                          ; preds = %if.end
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end9, %if.end
+  %h.2 = phi i32 [ %h.128, %if.end ], [ 0, %if.end9 ]
+  br i1 undef, label %while.cond10.loopexit, label %for.body
+
+while.cond10.loopexit:                            ; preds = %for.inc
+  %h.2.lcssa = phi i32 [ %h.2, %for.inc ]
+  br label %while.cond10
+
+while.cond10:                                     ; preds = %if.end18, %L, %while.cond10.loopexit
+  %h.127 = phi i32 [ %h.126, %if.end18 ], [ %h.125, %L ], [ %h.2.lcssa, %while.cond10.loopexit ]
+  br i1 undef, label %while.cond, label %while.body12
+
+while.body12:                                     ; preds = %while.cond10
+  br i1 undef, label %if.end18, label %L
+
+L.loopexit:                                       ; preds = %if.then
+  br label %L
+
+L:                                                ; preds = %L.loopexit, %while.body12
+  %h.125 = phi i32 [ %h.127, %while.body12 ], [ undef, %L.loopexit ]
+  br i1 undef, label %while.cond10, label %if.end18
+
+if.end18:                                         ; preds = %L, %while.body12
+  %h.126 = phi i32 [ %h.125, %L ], [ %h.127, %while.body12 ]
+  br label %while.cond10
+}
+
+
+define void @hoge() local_unnamed_addr #0 {
+; CHECK-LABEL: @hoge(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[TMP2:%.*]], [[BB1]] ]
+; CHECK-NEXT:    [[TMP2]] = add nuw nsw i64 [[TMP]], 1
+; CHECK-NEXT:    br label [[BB1]]
+;
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp2, %bb1 ]
+  %tmp2 = add nuw nsw i64 %tmp, 1
+  br label %bb1
+}
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
+source_filename = "pr33187-c.ll"
+
+define void @a() {
+; CHECK-LABEL: @a(
+; CHECK-NEXT:  b:
+; CHECK-NEXT:    store i8* null, i8** null
+; CHECK-NEXT:    br label [[D:%.*]]
+; CHECK:       d:
+; CHECK-NEXT:    [[I:%.*]] = phi i8* [ null, [[B:%.*]] ], [ [[E:%.*]], [[F:%.*]] ]
+; CHECK-NEXT:    br i1 undef, label [[F]], label [[G:%.*]]
+; CHECK:       g:
+; CHECK-NEXT:    store i8* [[I]], i8** null
+; CHECK-NEXT:    unreachable
+; CHECK:       f:
+; CHECK-NEXT:    [[E]] = getelementptr i8, i8* [[I]], i64 1
+; CHECK-NEXT:    br label [[D]]
+;
+b:
+  store i8* null, i8** null
+  br label %d
+
+d:                                                ; preds = %f, %b
+  %i = phi i8* [ null, %b ], [ %e, %f ]
+  br i1 undef, label %f, label %g
+
+g:                                                ; preds = %d
+  %h = phi i8* [ %i, %d ]
+  store i8* %h, i8** null
+  unreachable
+
+f:                                                ; preds = %d
+  %e = getelementptr i8, i8* %i, i64 1
+  br label %d
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/test/Transforms/SLPVectorizer/X86/arith-fp.ll
index 7eec13e535d4..e00ed849ee4b 100644
--- a/test/Transforms/SLPVectorizer/X86/arith-fp.ll
+++ b/test/Transforms/SLPVectorizer/X86/arith-fp.ll
@@ -10,7 +10,7 @@
 
 define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_2f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
@@ -30,7 +30,7 @@ define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) {
 
 define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_2f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
@@ -50,7 +50,7 @@ define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) {
 
 define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_2f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
@@ -70,7 +70,7 @@ define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) {
 
 define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_div_2f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
@@ -90,7 +90,7 @@ define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
 
 define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@@ -122,7 +122,7 @@ define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@@ -154,7 +154,7 @@ define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@@ -186,7 +186,7 @@ define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <4 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@@ -222,7 +222,7 @@ define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) {
 
 define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
@@ -254,7 +254,7 @@ define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) {
 
 define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
@@ -286,7 +286,7 @@ define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) {
 
 define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
@@ -318,7 +318,7 @@ define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
 
 define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_div_4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
@@ -350,7 +350,7 @@ define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 
 define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
@@ -406,7 +406,7 @@ define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) {
 
 define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
@@ -462,7 +462,7 @@ define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) {
 
 define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
@@ -518,7 +518,7 @@ define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) {
 
 define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
@@ -578,7 +578,7 @@ define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) {
 
 define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_8f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
@@ -634,7 +634,7 @@ define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) {
 
 define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_8f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
@@ -690,7 +690,7 @@ define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) {
 
 define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_8f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
@@ -746,7 +746,7 @@ define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
 
 define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_div_8f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
@@ -802,7 +802,7 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 
 define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_16f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
@@ -906,7 +906,7 @@ define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) {
 
 define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_16f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
@@ -1010,7 +1010,7 @@ define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) {
 
 define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_16f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
@@ -1114,7 +1114,7 @@ define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) {
 
 define <16 x float> @buildvector_div_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_16f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
diff --git a/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll b/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
new file mode 100644
index 000000000000..4c8748e220fd
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+define float @dotf(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @dotf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+entry:
+  %vecext = extractelement <4 x float> %x, i32 0
+  %vecext1 = extractelement <4 x float> %y, i32 0
+  %mul = fmul fast float %vecext, %vecext1
+  %vecext.1 = extractelement <4 x float> %x, i32 1
+  %vecext1.1 = extractelement <4 x float> %y, i32 1
+  %mul.1 = fmul fast float %vecext.1, %vecext1.1
+  %add.1 = fadd fast float %mul.1, %mul
+  %vecext.2 = extractelement <4 x float> %x, i32 2
+  %vecext1.2 = extractelement <4 x float> %y, i32 2
+  %mul.2 = fmul fast float %vecext.2, %vecext1.2
+  %add.2 = fadd fast float %mul.2, %add.1
+  %vecext.3 = extractelement <4 x float> %x, i32 3
+  %vecext1.3 = extractelement <4 x float> %y, i32 3
+  %mul.3 = fmul fast float %vecext.3, %vecext1.3
+  %add.3 = fadd fast float %mul.3, %add.2
+  ret float %add.3
+}
+
+define double @dotd(<4 x double>* byval nocapture readonly align 32, <4 x double>* byval nocapture readonly align 32) {
+; CHECK-LABEL: @dotd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32
+; CHECK-NEXT:    [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP4]]
+;
+entry:
+  %x = load <4 x double>, <4 x double>* %0, align 32
+  %y = load <4 x double>, <4 x double>* %1, align 32
+  %vecext = extractelement <4 x double> %x, i32 0
+  %vecext1 = extractelement <4 x double> %y, i32 0
+  %mul = fmul fast double %vecext, %vecext1
+  %vecext.1 = extractelement <4 x double> %x, i32 1
+  %vecext1.1 = extractelement <4 x double> %y, i32 1
+  %mul.1 = fmul fast double %vecext.1, %vecext1.1
+  %add.1 = fadd fast double %mul.1, %mul
+  %vecext.2 = extractelement <4 x double> %x, i32 2
+  %vecext1.2 = extractelement <4 x double> %y, i32 2
+  %mul.2 = fmul fast double %vecext.2, %vecext1.2
+  %add.2 = fadd fast double %mul.2, %add.1
+  %vecext.3 = extractelement <4 x double> %x, i32 3
+  %vecext1.3 = extractelement <4 x double> %y, i32 3
+  %mul.3 = fmul fast double %vecext.3, %vecext1.3
+  %add.3 = fadd fast double %mul.3, %add.2
+  ret double %add.3
+}
+
+define float @dotfq(<4 x float>* nocapture readonly %x, <4 x float>* nocapture readonly %y) {
+; CHECK-LABEL: @dotfq(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP4]]
+;
+entry:
+  %0 = load <4 x float>, <4 x float>* %x, align 16
+  %1 = load <4 x float>, <4 x float>* %y, align 16
+  %vecext = extractelement <4 x float> %0, i32 0
+  %vecext1 = extractelement <4 x float> %1, i32 0
+  %mul = fmul fast float %vecext1, %vecext
+  %vecext.1 = extractelement <4 x float> %0, i32 1
+  %vecext1.1 = extractelement <4 x float> %1, i32 1
+  %mul.1 = fmul fast float %vecext1.1, %vecext.1
+  %add.1 = fadd fast float %mul.1, %mul
+  %vecext.2 = extractelement <4 x float> %0, i32 2
+  %vecext1.2 = extractelement <4 x float> %1, i32 2
+  %mul.2 = fmul fast float %vecext1.2, %vecext.2
+  %add.2 = fadd fast float %mul.2, %add.1
+  %vecext.3 = extractelement <4 x float> %0, i32 3
+  %vecext1.3 = extractelement <4 x float> %1, i32 3
+  %mul.3 = fmul fast float %vecext1.3, %vecext.3
+  %add.3 = fadd fast float %mul.3, %add.2
+  ret float %add.3
+}
+
+define double @dotdq(<4 x double>* nocapture readonly %x, <4 x double>* nocapture readonly %y) {
+; CHECK-LABEL: @dotdq(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP4]]
+;
+entry:
+  %0 = load <4 x double>, <4 x double>* %x, align 32
+  %1 = load <4 x double>, <4 x double>* %y, align 32
+  %vecext = extractelement <4 x double> %0, i32 0
+  %vecext1 = extractelement <4 x double> %1, i32 0
+  %mul = fmul fast double %vecext1, %vecext
+  %vecext.1 = extractelement <4 x double> %0, i32 1
+  %vecext1.1 = extractelement <4 x double> %1, i32 1
+  %mul.1 = fmul fast double %vecext1.1, %vecext.1
+  %add.1 = fadd fast double %mul.1, %mul
+  %vecext.2 = extractelement <4 x double> %0, i32 2
+  %vecext1.2 = extractelement <4 x double> %1, i32 2
+  %mul.2 = fmul fast double %vecext1.2, %vecext.2
+  %add.2 = fadd fast double %mul.2, %add.1
+  %vecext.3 = extractelement <4 x double> %0, i32 3
+  %vecext1.3 = extractelement <4 x double> %1, i32 3
+  %mul.3 = fmul fast double %vecext1.3, %vecext.3
+  %add.3 = fadd fast double %mul.3, %add.2
+  ret double %add.3
+}
diff --git a/test/Transforms/SROA/address-spaces.ll b/test/Transforms/SROA/address-spaces.ll
index 8fba30c2720f..a54a3afc79f9 100644
--- a/test/Transforms/SROA/address-spaces.ll
+++ b/test/Transforms/SROA/address-spaces.ll
@@ -101,3 +101,31 @@ entry:
    %ret = fadd float %f1, %f2
    ret float %ret
 }
+
+; Test load from and store to non-zero address space.
+define void @test_load_store_diff_addr_space([2 x float] addrspace(1)* %complex1, [2 x float] addrspace(1)* %complex2) {
+; CHECK-LABEL: @test_load_store_diff_addr_space
+; CHECK-NOT: alloca
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
+  %a = alloca i64
+  %a.cast = bitcast i64* %a to [2 x float]*
+  %a.gep1 = getelementptr [2 x float], [2 x float]* %a.cast, i32 0, i32 0
+  %a.gep2 = getelementptr [2 x float], [2 x float]* %a.cast, i32 0, i32 1
+  %complex1.gep = getelementptr [2 x float], [2 x float] addrspace(1)* %complex1, i32 0, i32 0
+  %p1 = bitcast float addrspace(1)* %complex1.gep to i64 addrspace(1)*
+  %v1 = load i64, i64 addrspace(1)* %p1
+  store i64 %v1, i64* %a
+  %f1 = load float, float* %a.gep1
+  %f2 = load float, float* %a.gep2
+  %sum = fadd float %f1, %f2
+  store float %sum, float* %a.gep1
+  store float %sum, float* %a.gep2
+  %v2 = load i64, i64* %a
+  %complex2.gep = getelementptr [2 x float], [2 x float] addrspace(1)* %complex2, i32 0, i32 0
+  %p2 = bitcast float addrspace(1)* %complex2.gep to i64 addrspace(1)*
+  store i64 %v2, i64 addrspace(1)* %p2
+  ret void
+}
diff --git a/test/Transforms/SampleProfile/Inputs/indirect-call.prof b/test/Transforms/SampleProfile/Inputs/indirect-call.prof
index ff7be5df977a..ff1368142a0d 100644
--- a/test/Transforms/SampleProfile/Inputs/indirect-call.prof
+++ b/test/Transforms/SampleProfile/Inputs/indirect-call.prof
@@ -17,3 +17,6 @@ test_inline_strip:3000:0
 test_inline_strip_conflict:3000:0
  1: foo_inline_strip_conflict:3000
   1: 3000
+test_norecursive_inline:3000:0
+ 1: test_norecursive_inline:3000
+  20: 3000
diff --git a/test/Transforms/SampleProfile/indirect-call.ll b/test/Transforms/SampleProfile/indirect-call.ll
index 4101f6f492e5..bee98f1066d2 100644
--- a/test/Transforms/SampleProfile/indirect-call.ll
+++ b/test/Transforms/SampleProfile/indirect-call.ll
@@ -69,7 +69,18 @@ define void @test_noinline(void ()*) !dbg !12 {
   ret void
 }
 
+; CHECK-LABEL: @test_norecursive_inline
+; If the indirect call target is the caller, we should not promote it.
+define void @test_norecursive_inline() !dbg !24 {
+; CHECK-NOT: icmp
+; CHECK: call
+  %1 = load void ()*, void ()** @y, align 8
+  call void %1(), !dbg !25
+  ret void
+}
+
 @x = global i32 0, align 4
+@y = global void ()* null, align 8
 
 define i32* @foo_inline1(i32* %x) !dbg !14 {
   ret i32* %x
@@ -142,3 +153,5 @@ define void @test_direct() !dbg !22 {
 !21 = distinct !DISubprogram(name: "foo_direct", scope: !1, file: !1, line: 21, unit: !0)
 !22 = distinct !DISubprogram(name: "test_direct", scope: !1, file: !1, line: 22, unit: !0)
 !23 = !DILocation(line: 23, scope: !22)
+!24 = distinct !DISubprogram(name: "test_norecursive_inline", scope: !1, file: !1, line: 12, unit: !0)
+!25 = !DILocation(line: 13, scope: !24)
diff --git a/test/Transforms/Sink/badloadsink.ll b/test/Transforms/Sink/badloadsink.ll
new file mode 100644
index 000000000000..e3f4884c5a40
--- /dev/null
+++ b/test/Transforms/Sink/badloadsink.ll
@@ -0,0 +1,18 @@
+; RUN: opt < %s -basicaa -sink -S | FileCheck %s
+declare void @foo(i64 *)
+define i64 @sinkload(i1 %cmp) {
+; CHECK-LABEL: @sinkload
+top:
+    %a = alloca i64
+; CHECK: call void @foo(i64* %a)
+; CHECK-NEXT: %x = load i64, i64* %a
+    call void @foo(i64* %a)
+    %x = load i64, i64* %a
+    br i1 %cmp, label %A, label %B
+A:
+    store i64 0, i64 *%a
+    br label %B
+B:
+; CHECK-NOT: load i64, i64 *%a
+    ret i64 %x
+}
diff --git a/test/Transforms/ThinLTOBitcodeWriter/split.ll b/test/Transforms/ThinLTOBitcodeWriter/split.ll
index d37d10bd3560..8bf3a18cd7f9 100644
--- a/test/Transforms/ThinLTOBitcodeWriter/split.ll
+++ b/test/Transforms/ThinLTOBitcodeWriter/split.ll
@@ -25,6 +25,9 @@
 ; ERROR: llvm-modextract: error: module index out of range; bitcode file contains 2 module(s)
 
 ; BCA0: <GLOBALVAL_SUMMARY_BLOCK
+; BCA1: <FULL_LTO_GLOBALVAL_SUMMARY_BLOCK
+; 16 = not eligible to import
+; BCA1: <PERMODULE_GLOBALVAR_INIT_REFS {{.*}} op1=16
 ; BCA1-NOT: <GLOBALVAL_SUMMARY_BLOCK
 
 $g = comdat any
@@ -47,5 +50,6 @@ define i8* @f() {
 ; NODEBUG-NOT: !llvm.dbg.cu
 !llvm.dbg.cu = !{}
 
+; M1: !{i32 1, !"ThinLTO", i32 0}
 !1 = !{i32 2, !"Debug Info Version", i32 3}
 !llvm.module.flags = !{!1}
diff --git a/test/Transforms/Util/PredicateInfo/condprop2.ll b/test/Transforms/Util/PredicateInfo/condprop2.ll
index 415fa7c879e3..facd22f5b7a6 100644
--- a/test/Transforms/Util/PredicateInfo/condprop2.ll
+++ b/test/Transforms/Util/PredicateInfo/condprop2.ll
@@ -1,4 +1,4 @@
-; REQUIRES: asserts
+; REQUIRES: abi-breaking-checks
 ; NOTE: The flag -reverse-iterate is present only in a +Asserts build.
 ; Hence, this test has been split from condprop.ll to test with -reverse-iterate.
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
diff --git a/test/Transforms/Util/PredicateInfo/testandor2.ll b/test/Transforms/Util/PredicateInfo/testandor2.ll
index a03250c2f7a0..a1b9c62040c8 100644
--- a/test/Transforms/Util/PredicateInfo/testandor2.ll
+++ b/test/Transforms/Util/PredicateInfo/testandor2.ll
@@ -1,4 +1,4 @@
-; REQUIRES: asserts
+; REQUIRES: abi-breaking-checks
 ; NOTE: The flag -reverse-iterate is present only in a +Asserts build.
 ; Hence, this test has been split from testandor.ll to test with -reverse-iterate.
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py