Diffstat (limited to 'test/Transforms')
34 files changed, 1794 insertions, 137 deletions
diff --git a/test/Transforms/CodeExtractor/live_shrink.ll b/test/Transforms/CodeExtractor/live_shrink.ll new file mode 100644 index 0000000000000..c25ed2b622cdc --- /dev/null +++ b/test/Transforms/CodeExtractor/live_shrink.ll @@ -0,0 +1,67 @@ +; RUN: opt -S -partial-inliner -skip-partial-inlining-cost-analysis < %s | FileCheck %s +; RUN: opt -S -passes=partial-inliner -skip-partial-inlining-cost-analysis < %s | FileCheck %s + +%class.A = type { i32 } +@cond = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: uwtable +define void @_Z3foov() local_unnamed_addr { +bb: + %tmp = alloca %class.A, align 4 + %tmp1 = bitcast %class.A* %tmp to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) + %tmp2 = load i32, i32* @cond, align 4, !tbaa !2 + %tmp3 = icmp eq i32 %tmp2, 0 + br i1 %tmp3, label %bb4, label %bb5 + +bb4: ; preds = %bb + call void @_ZN1A7memfuncEv(%class.A* nonnull %tmp) + br label %bb5 + +bb5: ; preds = %bb4, %bb + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) + +declare void @_ZN1A7memfuncEv(%class.A*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) + +; Function Attrs: uwtable +define void @_Z3goov() local_unnamed_addr { +; CHECK-LABEL: @_Z3goov() +bb: +; CHECK: bb: +; CHECK-NOT: alloca +; CHECK-NOT: bitcast +; CHECK-NOT: llvm.lifetime +; CHECK: br i1 +; CHECK: codeRepl.i: +; CHECK: call void @_Z3foov.1_ + + tail call void @_Z3foov() + ret void +} + +; CHECK-LABEL: define internal void @_Z3foov.1_ +; CHECK: newFuncRoot: +; CHECK-NEXT: %tmp = alloca %class.A +; CHECK-NEXT: %tmp1 = bitcast %class.A* %tmp to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) +; CHECK: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) +; CHECK-NEXT: br label %bb5.exitStub + + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 304489)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} diff --git a/test/Transforms/CodeExtractor/live_shrink_gep.ll b/test/Transforms/CodeExtractor/live_shrink_gep.ll new file mode 100644 index 0000000000000..ac6aa4fbda43b --- /dev/null +++ b/test/Transforms/CodeExtractor/live_shrink_gep.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -partial-inliner -skip-partial-inlining-cost-analysis < %s | FileCheck %s +; RUN: opt -S -passes=partial-inliner -skip-partial-inlining-cost-analysis < %s | FileCheck %s + +%class.A = type { i8 } + +@cond = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: uwtable +define void @_Z3foov() local_unnamed_addr { +bb: + %tmp = alloca %class.A, align 1 + %tmp1 = getelementptr inbounds %class.A, %class.A* %tmp, i64 0, i32 0 + call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %tmp1) + %tmp2 = load i32, i32* @cond, align 4, !tbaa !2 + %tmp3 = icmp eq i32 %tmp2, 0 + br i1 %tmp3, label %bb4, label %bb5 + +bb4: ; preds = %bb + call void @_ZN1A7memfuncEv(%class.A* nonnull %tmp) + br label %bb5 + +bb5: ; preds = %bb4, %bb + call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %tmp1) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) + +declare void @_ZN1A7memfuncEv(%class.A*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) 
+ +; Function Attrs: uwtable +define void @_Z3goov() local_unnamed_addr { +; CHECK-LABEL: @_Z3goov() +bb: +; CHECK: bb: +; CHECK-NOT: alloca +; CHECK-NOT: getelementptr +; CHECK-NOT: llvm.lifetime +; CHECK: br i1 +; CHECK: codeRepl.i: +; CHECK: call void @_Z3foov.1_ + tail call void @_Z3foov() + ret void +} + +; CHECK-LABEL: define internal void @_Z3foov.1_ +; CHECK: newFuncRoot: +; CHECK-NEXT: %tmp = alloca %class.A +; CHECK-NEXT: %tmp1 = getelementptr +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8 +; CHECK: call void @llvm.lifetime.end.p0i8 +; CHECK-NEXT: br label %bb5.exitStub + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 304489)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} diff --git a/test/Transforms/CodeExtractor/live_shrink_hoist.ll b/test/Transforms/CodeExtractor/live_shrink_hoist.ll new file mode 100644 index 0000000000000..d1b310f017694 --- /dev/null +++ b/test/Transforms/CodeExtractor/live_shrink_hoist.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s +; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s + +%class.A = type { i32 } + +@cond = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: uwtable +define void @_Z3foov() local_unnamed_addr { +bb: + %tmp = alloca %class.A, align 4 + %tmp1 = bitcast %class.A* %tmp to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) + %tmp2 = load i32, i32* @cond, align 4, !tbaa !2 + %tmp3 = icmp eq i32 %tmp2, 0 + br i1 %tmp3, label %bb4, label %bb9 + +bb4: ; preds = %bb + call void @_ZN1A7memfuncEv(%class.A* nonnull %tmp) + %tmp5 = getelementptr inbounds %class.A, %class.A* %tmp, i64 0, i32 0 + %tmp6 = load i32, i32* %tmp5, align 4, !tbaa !6 + %tmp7 = icmp sgt i32 %tmp6, 0 + br i1 %tmp7, label %bb9, label %bb8 + +bb8: ; preds = %bb4 + call void @_ZN1A7memfuncEv(%class.A* nonnull %tmp) + br label %bb9 + +bb9: ; preds = %bb8, %bb4, %bb + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) + +declare void @_ZN1A7memfuncEv(%class.A*) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) + +; Function Attrs: uwtable +define void @_Z3goov() local_unnamed_addr { +bb: + tail call void @_Z3foov() + ret void +} + +; CHECK-LABEL: define internal void @_Z3foov.1_ +; CHECK: bb9: +; CHECK: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) +; CHECK: br label %.exitStub + + + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 304489)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C++ TBAA"} +!6 = !{!7, !3, i64 0} +!7 = !{!"_ZTS1A", !3, i64 0} diff --git a/test/Transforms/CodeExtractor/live_shrink_multiple.ll b/test/Transforms/CodeExtractor/live_shrink_multiple.ll new file mode 100644 index 0000000000000..8d9045c7267b1 --- /dev/null +++ b/test/Transforms/CodeExtractor/live_shrink_multiple.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -partial-inliner -skip-partial-inlining-cost-analysis < %s | FileCheck %s +; RUN: opt -S -passes=partial-inliner -skip-partial-inlining-cost-analysis < %s | FileCheck %s + +%class.A = 
type { i32 }
+@cond = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: uwtable
+define void @_Z3foov() local_unnamed_addr {
+bb:
+  %tmp = alloca %class.A, align 4
+  %tmp1 = alloca %class.A, align 4
+  %tmp2 = bitcast %class.A* %tmp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp2)
+  %tmp3 = bitcast %class.A* %tmp1 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp3)
+  %tmp4 = load i32, i32* @cond, align 4, !tbaa !2
+  %tmp5 = icmp eq i32 %tmp4, 0
+  br i1 %tmp5, label %bb6, label %bb7
+
+bb6:                                              ; preds = %bb
+  call void @_ZN1A7memfuncEv(%class.A* nonnull %tmp)
+  br label %bb7
+
+bb7:                                              ; preds = %bb6, %bb
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp3)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp2)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+
+declare void @_ZN1A7memfuncEv(%class.A*) local_unnamed_addr
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
+
+; Function Attrs: uwtable
+define void @_Z3goov() local_unnamed_addr {
+bb:
+  tail call void @_Z3foov()
+  ret void
+}
+
+; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK: newFuncRoot:
+; CHECK-NEXT: alloca
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8
+; CHECK-NEXT: alloca
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8
+; CHECK: call void @llvm.lifetime.end.p0i8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8
+; CHECK-NEXT: br label {{.*}}exitStub
+
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 304489)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
diff --git a/test/Transforms/CodeExtractor/live_shrink_unsafe.ll b/test/Transforms/CodeExtractor/live_shrink_unsafe.ll
new file mode 100644
index 0000000000000..ea6458cc46ec8
--- /dev/null
+++ b/test/Transforms/CodeExtractor/live_shrink_unsafe.ll
@@ -0,0 +1,94 @@
+; The behavior checked by this test is expected to change once the
+; partial-inlining legality check is enhanced.
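+;
+; In both functions below the allocas and lifetime markers are expected to
+; stay in place rather than be sunk into the extracted function: in one case
+; the branch condition is loaded through an unknown pointer, and in the other
+; an unknown call is made before the branch.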
+
+; RUN: opt -S -partial-inliner -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+; RUN: opt -S -passes=partial-inliner -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+
+%class.A = type { i32 }
+
+@cond = local_unnamed_addr global i32 0, align 4
+@condptr = external local_unnamed_addr global i32*, align 8
+
+; Function Attrs: uwtable
+define void @_Z3foo_unknown_mem_accessv() local_unnamed_addr {
+bb:
+  %tmp = alloca %class.A, align 4
+  %tmp1 = alloca %class.A, align 4
+  %tmp2 = bitcast %class.A* %tmp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp2)
+  %tmp3 = bitcast %class.A* %tmp1 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp3)
+  %tmp4 = load i32*, i32** @condptr, align 8, !tbaa !2
+  %tmp5 = load i32, i32* %tmp4, align 4, !tbaa !6
+  %tmp6 = icmp eq i32 %tmp5, 0
+  br i1 %tmp6, label %bb7, label %bb8
+
+bb7:                                              ; preds = %bb
+  call void @_ZN1A7memfuncEv(%class.A* nonnull %tmp)
+  br label %bb8
+
+bb8:                                              ; preds = %bb7, %bb
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp3)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp2)
+  ret void
+}
+
+declare void @_Z3barv() local_unnamed_addr
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @_ZN1A7memfuncEv(%class.A*) local_unnamed_addr
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
+
+define void @_Z3foo_unknown_calli(i32 %arg) local_unnamed_addr {
+bb:
+  %tmp = alloca %class.A, align 4
+  %tmp1 = bitcast %class.A* %tmp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1)
+  tail call void @_Z3barv()
+  %tmp2 = icmp eq i32 %arg, 0
+  br i1 %tmp2, label %bb3, label %bb4
+
+bb3:                                              ; preds = %bb
+  call void @_ZN1A7memfuncEv(%class.A* nonnull %tmp)
+  br label %bb4
+
+bb4:                                              ; preds = %bb3, %bb
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1)
+  ret void
+}
+
+define void @_Z3goov() local_unnamed_addr {
+; CHECK-LABEL: @_Z3goov
+; CHECK-NEXT: bb:
+; CHECK: alloca
+; CHECK: lifetime
+bb:
+  call void @_Z3foo_unknown_mem_accessv()
+  %tmp = load i32, i32* @cond, align 4, !tbaa !2
+  tail call void @_Z3foo_unknown_calli(i32 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: define internal void @_Z3foo_unknown_calli.1_bb3
+; CHECK: newFuncRoot:
+; CHECK-NEXT: br label %bb3
+
+; CHECK: bb4.exitStub:
+; CHECK-NEXT: ret void
+
+; CHECK: bb3:
+; CHECK-NOT: lifetime.end
+; CHECK: br label %bb4.exitStub
+
+
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 304489)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"any pointer", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"int", !4, i64 0}
diff --git a/test/Transforms/CrossDSOCFI/cfi_functions.ll b/test/Transforms/CrossDSOCFI/cfi_functions.ll
new file mode 100644
index 0000000000000..ccbde51b2115c
--- /dev/null
+++ b/test/Transforms/CrossDSOCFI/cfi_functions.ll
@@ -0,0 +1,23 @@
+; Test that types referenced in ThinLTO-style !cfi.functions are known to __cfi_check.
+; RUN: opt -S -cross-dso-cfi < %s | FileCheck %s
+; RUN: opt -S -passes=cross-dso-cfi < %s | FileCheck %s
+
+; CHECK: define void @__cfi_check(
+; CHECK: switch i64
+; CHECK-NEXT: i64 1234, label
+; CHECK-NEXT: i64 5678, label
+; CHECK-NEXT: ]
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+!cfi.functions = !{!0, !1}
+!llvm.module.flags = !{!6}
+
+!0 = !{!"f", i8 0, !2, !4}
+!1 = !{!"g", i8 1, !3, !5}
+!2 = !{i64 0, !"typeid1"}
+!3 = !{i64 0, !"typeid2"}
+!4 = !{i64 0, i64 1234}
+!5 = !{i64 0, i64 5678}
+!6 = !{i32 4, !"Cross-DSO CFI", i32 1}
diff --git a/test/Transforms/EarlyCSE/pr33406.ll b/test/Transforms/EarlyCSE/pr33406.ll
new file mode 100644
index 0000000000000..4d3312e1f0ac2
--- /dev/null
+++ b/test/Transforms/EarlyCSE/pr33406.ll
@@ -0,0 +1,26 @@
+; RUN: opt -early-cse-memssa -S %s | FileCheck %s
+
+; CHECK: define void @patatino() {
+; CHECK: for.cond:
+; CHECK-NEXT: br i1 true, label %if.end, label %for.inc
+; CHECK: if.end:
+; CHECK-NEXT: %tinkywinky = load i32, i32* @b
+; CHECK-NEXT: br i1 true, label %for.inc, label %for.inc
+; CHECK: for.inc:
+; CHECK-NEXT: ret void
+
+
+@b = external global i32
+
+define void @patatino() {
+for.cond:
+  br i1 true, label %if.end, label %for.inc
+
+if.end:
+  %tinkywinky = load i32, i32* @b
+  store i32 %tinkywinky, i32* @b
+  br i1 true, label %for.inc, label %for.inc
+
+for.inc:
+  ret void
+}
diff --git a/test/Transforms/GVN/pr32314.ll b/test/Transforms/GVN/pr32314.ll
new file mode 100644
index 0000000000000..90d14f6fc49c0
--- /dev/null
+++ b/test/Transforms/GVN/pr32314.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -gvn < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The load in the loop cannot bypass the data stored in the previous iteration: the store above it in the loop body aliases it.
+define void @foo() { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca [3 x i32], align 4 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[P_017:%.*]] = phi i32* [ undef, [[ENTRY]] ], [ [[ARRAYIDX3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* [[A]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: store i32 50, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[P_017]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX3]] = getelementptr inbounds [3 x i32], [3 x i32]* [[A]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 60, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 3 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] +; +entry: + %a = alloca [3 x i32], align 4 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ] + %p.017 = phi i32* [ undef, %entry ], [ %arrayidx3, %for.body ] + %0 = add nsw i64 %indvars.iv, -1 + %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %a, i64 0, i64 %0 + store i32 50, i32* %arrayidx, align 4 + %1 = shl i64 %indvars.iv, 1 + %2 = load i32, i32* %p.017, align 4 + %3 = trunc i64 %1 to i32 + %add1 = add nsw i32 %2, %3 + %arrayidx3 = getelementptr inbounds [3 x i32], [3 x i32]* %a, i64 0, i64 %indvars.iv + store i32 60, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 3 + br i1 %exitcond, label %for.body, label %for.cond.cleanup +} diff --git a/test/Transforms/GlobalMerge/debug-info.ll b/test/Transforms/GlobalMerge/debug-info.ll index 97e0bb2148e93..8d60f3662431c 100644 --- a/test/Transforms/GlobalMerge/debug-info.ll +++ b/test/Transforms/GlobalMerge/debug-info.ll @@ -17,7 +17,7 @@ define void @use1() { ; CHECK: [[AVAR]] = !DIGlobalVariable(name: "a", scope: null, isLocal: false, isDefinition: true) ; CHECK: [[B]] = !DIGlobalVariableExpression(var: [[BVAR:![0-9]+]], expr: [[EXPR:![0-9]+]]) ; CHECK: [[BVAR]] = !DIGlobalVariable(name: "b", scope: null, isLocal: false, isDefinition: true) -; CHECK: [[EXPR]] = !DIExpression(DW_OP_plus, 4) +; CHECK: [[EXPR]] = !DIExpression(DW_OP_plus_uconst, 4) !llvm.module.flags = !{!4, !5} diff --git a/test/Transforms/Inline/always-inline.ll b/test/Transforms/Inline/always-inline.ll index 5366b5a16cc77..791eb94779b70 100644 --- a/test/Transforms/Inline/always-inline.ll +++ b/test/Transforms/Inline/always-inline.ll @@ -305,3 +305,14 @@ entry: ret void ; CHECK: ret void } + +define void @inner14() readnone nounwind { +; CHECK: define void @inner14 + ret void +} + +define void @outer14() { +; CHECK: call void @inner14 + call void @inner14() + ret void +} diff --git a/test/Transforms/InstCombine/debuginfo-dce.ll b/test/Transforms/InstCombine/debuginfo-dce.ll index 086743e80820b..50b8f1c6068e1 100644 --- a/test/Transforms/InstCombine/debuginfo-dce.ll +++ 
b/test/Transforms/InstCombine/debuginfo-dce.ll @@ -93,12 +93,12 @@ entry: ret void, !dbg !32 } -; CHECK: ![[LOAD_EXPR]] = !DIExpression(DW_OP_deref, DW_OP_plus, 0) -; CHECK: ![[BITCAST_EXPR]] = !DIExpression(DW_OP_plus, 0) -; CHECK: ![[GEP0_EXPR]] = !DIExpression(DW_OP_minus, 8, DW_OP_plus, 0, DW_OP_stack_value) -; CHECK: ![[GEP1_EXPR]] = !DIExpression(DW_OP_minus, 8, DW_OP_stack_value, +; CHECK: ![[LOAD_EXPR]] = !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 0) +; CHECK: ![[BITCAST_EXPR]] = !DIExpression(DW_OP_plus_uconst, 0) +; CHECK: ![[GEP0_EXPR]] = !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_plus_uconst, 0, DW_OP_stack_value) +; CHECK: ![[GEP1_EXPR]] = !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_stack_value, ; CHECK-SAME: DW_OP_LLVM_fragment, 0, 32) -; CHECK: ![[GEP2_EXPR]] = !DIExpression(DW_OP_minus, 8, DW_OP_stack_value) +; CHECK: ![[GEP2_EXPR]] = !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_stack_value) ; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 @@ -130,7 +130,7 @@ attributes #1 = { nounwind readnone } !17 = !{!18} !18 = !DILocalVariable(name: "entry", scope: !14, file: !1, line: 6, type: !4) !19 = !DILocation(line: 6, column: 17, scope: !14) -!20 = !DIExpression(DW_OP_plus, 0) +!20 = !DIExpression(DW_OP_plus_uconst, 0) !21 = !DILocation(line: 11, column: 1, scope: !14) !22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !17) !23 = !DILocation(line: 6, column: 17, scope: !22) diff --git a/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll b/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll index 107440f10a5a2..230ac1796671f 100644 --- a/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll +++ b/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll @@ -1,10 +1,11 @@ ; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s +; Temporarily an expected failure until inst combine is updated in the next patch target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -; Test basic unfolding -define void @test1(i8* %Src, i8* %Dst) { -; CHECK-LABEL: test1 -; CHECK-NOT: llvm.memcpy.element.atomic +; Test basic unfolding -- unordered load & store +define void @test1a(i8* %Src, i8* %Dst) { +; CHECK-LABEL: test1a +; CHECK-NOT: llvm.memcpy.element.unordered.atomic ; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32* ; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32* @@ -21,7 +22,7 @@ define void @test1(i8* %Src, i8* %Dst) { ; CHECK-DAG: [[VAL4:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4 ; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4 entry: - call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 8 %Src, i64 4, i32 4) + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %Dst, i8* align 4 %Src, i32 16, i32 4) ret void } @@ -31,9 +32,9 @@ define void @test2(i8* %Src, i8* %Dst) { ; CHECK-NOT: load ; CHECK-NOT: store -; CHECK: llvm.memcpy.element.atomic +; CHECK: llvm.memcpy.element.unordered.atomic entry: - call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 1000, i32 4) + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %Dst, i8* align 4 %Src, i32 256, i32 4) ret void } @@ -43,16 +44,16 @@ define void @test3(i8* %Src, i8* 
%Dst) { ; CHECK-NOT: load ; CHECK-NOT: store -; CHECK: llvm.memcpy.element.atomic +; CHECK: llvm.memcpy.element.unordered.atomic entry: - call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 4, i32 64) + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 64, i32 64) ret void } ; Test that we will eliminate redundant bitcasts define void @test4(i64* %Src, i64* %Dst) { ; CHECK-LABEL: test4 -; CHECK-NOT: llvm.memcpy.element.atomic +; CHECK-NOT: llvm.memcpy.element.unordered.atomic ; CHECK-NOT: bitcast @@ -76,17 +77,18 @@ define void @test4(i64* %Src, i64* %Dst) { entry: %Src.casted = bitcast i64* %Src to i8* %Dst.casted = bitcast i64* %Dst to i8* - call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i64 4, i32 8) + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i32 32, i32 8) ret void } +; Test that 0-length unordered atomic memcpy gets removed. define void @test5(i8* %Src, i8* %Dst) { ; CHECK-LABEL: test5 -; CHECK-NOT: llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64) +; CHECK-NOT: llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 0, i32 8) entry: - call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64) + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 0, i32 8) ret void } -declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32) +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind diff --git a/test/Transforms/InstCombine/ffs-1.ll b/test/Transforms/InstCombine/ffs-1.ll index d27fb5d89f092..af4ee85216ef2 100644 --- a/test/Transforms/InstCombine/ffs-1.ll +++ b/test/Transforms/InstCombine/ffs-1.ll @@ -1,12 +1,12 @@ ; Test that the ffs* library call simplifier works correctly. 
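; (ffs(x) returns one plus the index of the least significant set bit of x,
; or 0 when x is 0; ffsl/ffsll are the long and long long variants.)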
; -; RUN: opt < %s -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple i386-pc-linux -instcombine -S | FileCheck %s -check-prefix=CHECK-FFS -; RUN: opt -instcombine -mtriple=arm64-apple-ios9.0 -S %s | FileCheck --check-prefix=CHECK-FFS %s -; RUN: opt -instcombine -mtriple=arm64-apple-tvos9.0 -S %s | FileCheck --check-prefix=CHECK-FFS %s -; RUN: opt -instcombine -mtriple=thumbv7k-apple-watchos2.0 -S %s | FileCheck --check-prefix=CHECK-FFS %s -; RUN: opt -instcombine -mtriple=x86_64-apple-macosx10.11 -S %s | FileCheck --check-prefix=CHECK-FFS %s -; RUN: opt -instcombine -mtriple=x86_64-freebsd-gnu -S %s | FileCheck --check-prefix=CHECK-FFS %s +; RUN: opt < %s -instcombine -S | FileCheck %s --check-prefix=ALL --check-prefix=GENERIC +; RUN: opt < %s -instcombine -mtriple i386-pc-linux -S | FileCheck %s --check-prefix=ALL --check-prefix=TARGET +; RUN: opt < %s -instcombine -mtriple=arm64-apple-ios9.0 -S | FileCheck %s --check-prefix=ALL --check-prefix=TARGET +; RUN: opt < %s -instcombine -mtriple=arm64-apple-tvos9.0 -S | FileCheck %s --check-prefix=ALL --check-prefix=TARGET +; RUN: opt < %s -instcombine -mtriple=thumbv7k-apple-watchos2.0 -S | FileCheck %s --check-prefix=ALL --check-prefix=TARGET +; RUN: opt < %s -instcombine -mtriple=x86_64-apple-macosx10.11 -S | FileCheck %s --check-prefix=ALL --check-prefix=TARGET +; RUN: opt < %s -instcombine -mtriple=x86_64-freebsd-gnu -S | FileCheck %s --check-prefix=ALL --check-prefix=TARGET declare i32 @ffs(i32) declare i32 @ffsl(i32) @@ -15,123 +15,179 @@ declare i32 @ffsll(i64) ; Check ffs(0) -> 0. define i32 @test_simplify1() { -; CHECK-LABEL: @test_simplify1( +; ALL-LABEL: @test_simplify1( +; ALL-NEXT: ret i32 0 +; %ret = call i32 @ffs(i32 0) ret i32 %ret -; CHECK-NEXT: ret i32 0 } define i32 @test_simplify2() { -; CHECK-FFS-LABEL: @test_simplify2( +; GENERIC-LABEL: @test_simplify2( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsl(i32 0) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify2( +; TARGET-NEXT: ret i32 0 +; %ret = call i32 @ffsl(i32 0) ret i32 %ret -; CHECK-FFS-NEXT: ret i32 0 } define i32 @test_simplify3() { -; CHECK-FFS-LABEL: @test_simplify3( +; GENERIC-LABEL: @test_simplify3( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsll(i64 0) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify3( +; TARGET-NEXT: ret i32 0 +; %ret = call i32 @ffsll(i64 0) ret i32 %ret -; CHECK-FFS-NEXT: ret i32 0 } ; Check ffs(c) -> cttz(c) + 1, where 'c' is a constant. 
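; For example, 2048 == 1 << 11, so cttz(2048) == 11 and ffs(2048) == 12,
; as @test_simplify5 below checks.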
define i32 @test_simplify4() { -; CHECK-LABEL: @test_simplify4( +; ALL-LABEL: @test_simplify4( +; ALL-NEXT: ret i32 1 +; %ret = call i32 @ffs(i32 1) ret i32 %ret -; CHECK-NEXT: ret i32 1 } define i32 @test_simplify5() { -; CHECK-LABEL: @test_simplify5( +; ALL-LABEL: @test_simplify5( +; ALL-NEXT: ret i32 12 +; %ret = call i32 @ffs(i32 2048) ret i32 %ret -; CHECK-NEXT: ret i32 12 } define i32 @test_simplify6() { -; CHECK-LABEL: @test_simplify6( +; ALL-LABEL: @test_simplify6( +; ALL-NEXT: ret i32 17 +; %ret = call i32 @ffs(i32 65536) ret i32 %ret -; CHECK-NEXT: ret i32 17 } define i32 @test_simplify7() { -; CHECK-FFS-LABEL: @test_simplify7( +; GENERIC-LABEL: @test_simplify7( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsl(i32 65536) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify7( +; TARGET-NEXT: ret i32 17 +; %ret = call i32 @ffsl(i32 65536) ret i32 %ret -; CHECK-FFS-NEXT: ret i32 17 } define i32 @test_simplify8() { -; CHECK-FFS-LABEL: @test_simplify8( +; GENERIC-LABEL: @test_simplify8( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsll(i64 1024) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify8( +; TARGET-NEXT: ret i32 11 +; %ret = call i32 @ffsll(i64 1024) ret i32 %ret -; CHECK-FFS-NEXT: ret i32 11 } define i32 @test_simplify9() { -; CHECK-FFS-LABEL: @test_simplify9( +; GENERIC-LABEL: @test_simplify9( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsll(i64 65536) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify9( +; TARGET-NEXT: ret i32 17 +; %ret = call i32 @ffsll(i64 65536) ret i32 %ret -; CHECK-FFS-NEXT: ret i32 17 } define i32 @test_simplify10() { -; CHECK-FFS-LABEL: @test_simplify10( +; GENERIC-LABEL: @test_simplify10( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsll(i64 17179869184) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify10( +; TARGET-NEXT: ret i32 35 +; %ret = call i32 @ffsll(i64 17179869184) ret i32 %ret -; CHECK-FFS-NEXT: ret i32 35 } define i32 @test_simplify11() { -; CHECK-FFS-LABEL: @test_simplify11( +; GENERIC-LABEL: @test_simplify11( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsll(i64 281474976710656) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify11( +; TARGET-NEXT: ret i32 49 +; %ret = call i32 @ffsll(i64 281474976710656) ret i32 %ret -; CHECK-FFS-NEXT: ret i32 49 } define i32 @test_simplify12() { -; CHECK-FFS-LABEL: @test_simplify12( +; GENERIC-LABEL: @test_simplify12( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsll(i64 1152921504606846976) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify12( +; TARGET-NEXT: ret i32 61 +; %ret = call i32 @ffsll(i64 1152921504606846976) ret i32 %ret -; CHECK-FFS-NEXT: ret i32 61 } ; Check ffs(x) -> x != 0 ? (i32)llvm.cttz(x) + 1 : 0. 
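; The compare-and-select guard is needed because the transform emits
; llvm.cttz with is_zero_undef (the trailing 'i1 true'), whose result is
; undefined for a zero input.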
define i32 @test_simplify13(i32 %x) { -; CHECK-LABEL: @test_simplify13( +; ALL-LABEL: @test_simplify13( +; ALL-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; ALL-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[CTTZ]], 1 +; ALL-NEXT: [[TMP2:%.*]] = icmp ne i32 %x, 0 +; ALL-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 0 +; ALL-NEXT: ret i32 [[TMP3]] +; %ret = call i32 @ffs(i32 %x) -; CHECK-NEXT: [[CTTZ:%[a-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %x, i1 true) -; CHECK-NEXT: [[INC:%[a-z0-9]+]] = add nuw nsw i32 [[CTTZ]], 1 -; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i32 %x, 0 -; CHECK-NEXT: [[RET:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[INC]], i32 0 ret i32 %ret -; CHECK-NEXT: ret i32 [[RET]] } define i32 @test_simplify14(i32 %x) { -; CHECK-FFS-LABEL: @test_simplify14( +; GENERIC-LABEL: @test_simplify14( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsl(i32 %x) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify14( +; TARGET-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; TARGET-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[CTTZ]], 1 +; TARGET-NEXT: [[TMP2:%.*]] = icmp ne i32 %x, 0 +; TARGET-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 0 +; TARGET-NEXT: ret i32 [[TMP3]] +; %ret = call i32 @ffsl(i32 %x) -; CHECK-FFS-NEXT: [[CTTZ:%[a-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %x, i1 true) -; CHECK-FFS-NEXT: [[INC:%[a-z0-9]+]] = add nuw nsw i32 [[CTTZ]], 1 -; CHECK-FFS-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i32 %x, 0 -; CHECK-FFS-NEXT: [[RET:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[INC]], i32 0 ret i32 %ret -; CHECK-FFS-NEXT: ret i32 [[RET]] } define i32 @test_simplify15(i64 %x) { -; CHECK-FFS-LABEL: @test_simplify15( +; GENERIC-LABEL: @test_simplify15( +; GENERIC-NEXT: [[RET:%.*]] = call i32 @ffsll(i64 %x) +; GENERIC-NEXT: ret i32 [[RET]] +; +; TARGET-LABEL: @test_simplify15( +; TARGET-NEXT: [[CTTZ:%.*]] = call i64 @llvm.cttz.i64(i64 %x, i1 true) +; TARGET-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[CTTZ]], 1 +; TARGET-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; TARGET-NEXT: [[TMP3:%.*]] = icmp ne i64 %x, 0 +; TARGET-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 0 +; TARGET-NEXT: ret i32 [[TMP4]] +; %ret = call i32 @ffsll(i64 %x) -; CHECK-FFS-NEXT: [[CTTZ:%[a-z0-9]+]] = call i64 @llvm.cttz.i64(i64 %x, i1 true) -; CHECK-FFS-NEXT: [[INC:%[a-z0-9]+]] = add nuw nsw i64 [[CTTZ]], 1 -; CHECK-FFS-NEXT: [[TRUNC:%[a-z0-9]+]] = trunc i64 [[INC]] to i32 -; CHECK-FFS-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i64 %x, 0 -; CHECK-FFS-NEXT: [[RET:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[TRUNC]], i32 0 ret i32 %ret -; CHECK-FFS-NEXT: ret i32 [[RET]] } + diff --git a/test/Transforms/InstCombine/lshr.ll b/test/Transforms/InstCombine/lshr.ll index 71b25177162b9..4cdcb98f730c1 100644 --- a/test/Transforms/InstCombine/lshr.ll +++ b/test/Transforms/InstCombine/lshr.ll @@ -122,10 +122,19 @@ define <2 x i8> @bool_zext_splat(<2 x i1> %x) { ret <2 x i8> %hibit } -; FIXME: The replicated sign bits are all that's left. This could be ashr+zext. 
- -define i16 @smear_sign_and_widen(i4 %x) { +define i32 @smear_sign_and_widen(i8 %x) { ; CHECK-LABEL: @smear_sign_and_widen( +; CHECK-NEXT: [[TMP1:%.*]] = ashr i8 %x, 7 +; CHECK-NEXT: [[HIBIT:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[HIBIT]] +; + %sext = sext i8 %x to i32 + %hibit = lshr i32 %sext, 24 + ret i32 %hibit +} + +define i16 @smear_sign_and_widen_should_not_change_type(i4 %x) { +; CHECK-LABEL: @smear_sign_and_widen_should_not_change_type( ; CHECK-NEXT: [[SEXT:%.*]] = sext i4 %x to i16 ; CHECK-NEXT: [[HIBIT:%.*]] = lshr i16 [[SEXT]], 12 ; CHECK-NEXT: ret i16 [[HIBIT]] @@ -137,8 +146,8 @@ define i16 @smear_sign_and_widen(i4 %x) { define <2 x i8> @smear_sign_and_widen_splat(<2 x i6> %x) { ; CHECK-LABEL: @smear_sign_and_widen_splat( -; CHECK-NEXT: [[SEXT:%.*]] = sext <2 x i6> %x to <2 x i8> -; CHECK-NEXT: [[HIBIT:%.*]] = lshr <2 x i8> [[SEXT]], <i8 2, i8 2> +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i6> %x, <i6 2, i6 2> +; CHECK-NEXT: [[HIBIT:%.*]] = zext <2 x i6> [[TMP1]] to <2 x i8> ; CHECK-NEXT: ret <2 x i8> [[HIBIT]] ; %sext = sext <2 x i6> %x to <2 x i8> diff --git a/test/Transforms/InstCombine/onehot_merge.ll b/test/Transforms/InstCombine/onehot_merge.ll index 496d847b5321e..47a4ca4b628bf 100644 --- a/test/Transforms/InstCombine/onehot_merge.ll +++ b/test/Transforms/InstCombine/onehot_merge.ll @@ -33,3 +33,79 @@ bb: ret i1 %or } +; Same as above but with operands commuted one of the ands, but not the other. +define i1 @foo1_and_commuted(i32 %k, i32 %c1, i32 %c2) { +; CHECK-LABEL: @foo1_and_commuted( +; CHECK-NEXT: [[K2:%.*]] = mul i32 [[K:%.*]], [[K]] +; CHECK-NEXT: [[TMP:%.*]] = shl i32 1, [[C1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 -2147483648, [[C2:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[TMP]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[K2]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: ret i1 [[TMP2]] +; + %k2 = mul i32 %k, %k ; to trick the complexity sorting + %tmp = shl i32 1, %c1 + %tmp4 = lshr i32 -2147483648, %c2 + %tmp1 = and i32 %k2, %tmp + %tmp2 = icmp eq i32 %tmp1, 0 + %tmp5 = and i32 %tmp4, %k2 + %tmp6 = icmp eq i32 %tmp5, 0 + %or = or i1 %tmp2, %tmp6 + ret i1 %or +} + +define i1 @or_consts(i32 %k, i32 %c1, i32 %c2) { +; CHECK-LABEL: @or_consts( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[K:%.*]], 12 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 12 +; CHECK-NEXT: ret i1 [[TMP2]] +; + %tmp1 = and i32 4, %k + %tmp2 = icmp ne i32 %tmp1, 0 + %tmp5 = and i32 8, %k + %tmp6 = icmp ne i32 %tmp5, 0 + %or = and i1 %tmp2, %tmp6 + ret i1 %or +} + +define i1 @foo1_or(i32 %k, i32 %c1, i32 %c2) { +; CHECK-LABEL: @foo1_or( +; CHECK-NEXT: [[TMP:%.*]] = shl i32 1, [[C1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 -2147483648, [[C2:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[TMP]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: ret i1 [[TMP3]] +; + %tmp = shl i32 1, %c1 + %tmp4 = lshr i32 -2147483648, %c2 + %tmp1 = and i32 %tmp, %k + %tmp2 = icmp ne i32 %tmp1, 0 + %tmp5 = and i32 %tmp4, %k + %tmp6 = icmp ne i32 %tmp5, 0 + %or = and i1 %tmp2, %tmp6 + ret i1 %or +} + +; Same as above but with operands commuted one of the ors, but not the other. 
+define i1 @foo1_or_commuted(i32 %k, i32 %c1, i32 %c2) { +; CHECK-LABEL: @foo1_or_commuted( +; CHECK-NEXT: [[K2:%.*]] = mul i32 [[K:%.*]], [[K]] +; CHECK-NEXT: [[TMP:%.*]] = shl i32 1, [[C1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 -2147483648, [[C2:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[TMP]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: ret i1 [[TMP3]] +; + %k2 = mul i32 %k, %k ; to trick the complexity sorting + %tmp = shl i32 1, %c1 + %tmp4 = lshr i32 -2147483648, %c2 + %tmp1 = and i32 %k2, %tmp + %tmp2 = icmp ne i32 %tmp1, 0 + %tmp5 = and i32 %tmp4, %k2 + %tmp6 = icmp ne i32 %tmp5, 0 + %or = and i1 %tmp2, %tmp6 + ret i1 %or +} diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll index f2bc290d79a45..485f9612376ad 100644 --- a/test/Transforms/InstCombine/or-xor.ll +++ b/test/Transforms/InstCombine/or-xor.ll @@ -114,6 +114,17 @@ define i32 @test10(i32 %A, i32 %B) { ret i32 %or } +define i32 @test10_commuted(i32 %A, i32 %B) { +; CHECK-LABEL: @test10_commuted( +; CHECK-NEXT: ret i32 -1 +; + %xor1 = xor i32 %B, %A + %not = xor i32 %A, -1 + %xor2 = xor i32 %not, %B + %or = or i32 %xor2, %xor1 + ret i32 %or +} + ; (x | y) & ((~x) ^ y) -> (x & y) define i32 @test11(i32 %x, i32 %y) { ; CHECK-LABEL: @test11( @@ -300,3 +311,36 @@ define i8 @or_xor_or(i8 %x) { ret i8 %or2 } +define i8 @test17(i8 %A, i8 %B) { +; CHECK-LABEL: @test17( +; CHECK-NEXT: [[XOR1:%.*]] = xor i8 [[B:%.*]], [[A:%.*]] +; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A]], 33 +; CHECK-NEXT: [[XOR2:%.*]] = xor i8 [[NOT]], [[B]] +; CHECK-NEXT: [[OR:%.*]] = or i8 [[XOR1]], [[XOR2]] +; CHECK-NEXT: [[RES:%.*]] = mul i8 [[OR]], [[XOR2]] +; CHECK-NEXT: ret i8 [[RES]] +; + %xor1 = xor i8 %B, %A + %not = xor i8 %A, 33 + %xor2 = xor i8 %not, %B + %or = or i8 %xor1, %xor2 + %res = mul i8 %or, %xor2 ; to increase the use count for the xor + ret i8 %res +} + +define i8 @test18(i8 %A, i8 %B) { +; CHECK-LABEL: @test18( +; CHECK-NEXT: [[XOR1:%.*]] = xor i8 [[B:%.*]], [[A:%.*]] +; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A]], 33 +; CHECK-NEXT: [[XOR2:%.*]] = xor i8 [[NOT]], [[B]] +; CHECK-NEXT: [[OR:%.*]] = or i8 [[XOR2]], [[XOR1]] +; CHECK-NEXT: [[RES:%.*]] = mul i8 [[OR]], [[XOR2]] +; CHECK-NEXT: ret i8 [[RES]] +; + %xor1 = xor i8 %B, %A + %not = xor i8 %A, 33 + %xor2 = xor i8 %not, %B + %or = or i8 %xor2, %xor1 + %res = mul i8 %or, %xor2 ; to increase the use count for the xor + ret i8 %res +} diff --git a/test/Transforms/InstCombine/select-with-bitwise-ops.ll b/test/Transforms/InstCombine/select-with-bitwise-ops.ll index 68b73af21a8d6..faeb4e046aca8 100644 --- a/test/Transforms/InstCombine/select-with-bitwise-ops.ll +++ b/test/Transforms/InstCombine/select-with-bitwise-ops.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s +target datalayout = "n8:16:32:64" + define i32 @select_icmp_eq_and_1_0_or_2(i32 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2( ; CHECK-NEXT: [[AND:%.*]] = shl i32 %x, 1 @@ -295,3 +297,269 @@ define i32 @test67(i16 %x) { ret i32 %3 } +define i32 @test68(i32 %x, i32 %y) { +; CHECK-LABEL: @test68( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[TMP1]], -1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[Y]], i32 [[OR]] +; CHECK-NEXT: ret i32 [[SELECT]] +; + %and = and i32 %x, 128 + %cmp = icmp eq 
i32 %and, 0 + %or = or i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + ret i32 %select +} + +define i32 @test69(i32 %x, i32 %y) { +; CHECK-LABEL: @test69( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[Y]], i32 [[OR]] +; CHECK-NEXT: ret i32 [[SELECT]] +; + %and = and i32 %x, 128 + %cmp = icmp ne i32 %and, 0 + %or = or i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + ret i32 %select +} + +define i32 @shift_no_xor_multiuse_or(i32 %x, i32 %y) { +; CHECK-LABEL: @shift_no_xor_multiuse_or( +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 2 +; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP2]], [[OR]] +; CHECK-NEXT: ret i32 [[RES]] +; + %and = and i32 %x, 1 + %cmp = icmp eq i32 %and, 0 + %or = or i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + %res = mul i32 %select, %or ; to bump up use count of the Or + ret i32 %res +} + +define i32 @no_shift_no_xor_multiuse_or(i32 %x, i32 %y) { +; CHECK-LABEL: @no_shift_no_xor_multiuse_or( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 4096 +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[AND]], [[Y]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP1]], [[OR]] +; CHECK-NEXT: ret i32 [[RES]] +; + %and = and i32 %x, 4096 + %cmp = icmp eq i32 %and, 0 + %or = or i32 %y, 4096 + %select = select i1 %cmp, i32 %y, i32 %or + %res = mul i32 %select, %or ; to bump up use count of the Or + ret i32 %res +} + +define i32 @no_shift_xor_multiuse_or(i32 %x, i32 %y) { +; CHECK-LABEL: @no_shift_xor_multiuse_or( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 4096 +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND]], 4096 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP2]], [[OR]] +; CHECK-NEXT: ret i32 [[RES]] +; + %and = and i32 %x, 4096 + %cmp = icmp ne i32 0, %and + %or = or i32 %y, 4096 + %select = select i1 %cmp, i32 %y, i32 %or + %res = mul i32 %select, %or ; to bump up use count of the Or + ret i32 %res +} + +; TODO this increased the number of instructions +define i32 @shift_xor_multiuse_or(i32 %x, i32 %y) { +; CHECK-LABEL: @shift_xor_multiuse_or( +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 2048 +; CHECK-NEXT: [[AND:%.*]] = lshr i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2048 +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 2048 +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], [[Y]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP3]], [[OR]] +; CHECK-NEXT: ret i32 [[RES]] +; + %and = and i32 %x, 4096 + %cmp = icmp ne i32 0, %and + %or = or i32 %y, 2048 + %select = select i1 %cmp, i32 %y, i32 %or + %res = mul i32 %select, %or ; to bump up use count of the Or + ret i32 %res +} + +define i32 @shift_no_xor_multiuse_cmp(i32 %x, i32 %y, i32 %z, i32 %w) { +; CHECK-LABEL: @shift_no_xor_multiuse_cmp( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[AND]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP2]], [[SELECT2]] +; CHECK-NEXT: ret i32 [[RES]] +; + %and = and i32 %x, 1 + %cmp = 
icmp eq i32 %and, 0 + %or = or i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + %select2 = select i1 %cmp, i32 %z, i32 %w ; to bump up use count of the cmp + %res = mul i32 %select, %select2 + ret i32 %res +} + +define i32 @no_shift_no_xor_multiuse_cmp(i32 %x, i32 %y, i32 %z, i32 %w) { +; CHECK-LABEL: @no_shift_no_xor_multiuse_cmp( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[AND]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP1]], [[SELECT2]] +; CHECK-NEXT: ret i32 [[RES]] +; + %and = and i32 %x, 4096 + %cmp = icmp eq i32 %and, 0 + %or = or i32 %y, 4096 + %select = select i1 %cmp, i32 %y, i32 %or + %select2 = select i1 %cmp, i32 %z, i32 %w ; to bump up use count of the cmp + %res = mul i32 %select, %select2 + ret i32 %res +} + +define i32 @no_shift_xor_multiuse_cmp(i32 %x, i32 %y, i32 %z, i32 %w) { +; CHECK-LABEL: @no_shift_xor_multiuse_cmp( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND]], 4096 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP2]], [[SELECT2]] +; CHECK-NEXT: ret i32 [[RES]] +; + %and = and i32 %x, 4096 + %cmp = icmp ne i32 0, %and + %or = or i32 %y, 4096 + %select = select i1 %cmp, i32 %y, i32 %or + %select2 = select i1 %cmp, i32 %z, i32 %w ; to bump up use count of the cmp + %res = mul i32 %select, %select2 + ret i32 %res +} + +; TODO this increased the number of instructions +define i32 @shift_xor_multiuse_cmp(i32 %x, i32 %y, i32 %z, i32 %w) { +; CHECK-LABEL: @shift_xor_multiuse_cmp( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[AND]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 2048 +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP3]], [[SELECT2]] +; CHECK-NEXT: ret i32 [[RES]] +; + %and = and i32 %x, 4096 + %cmp = icmp ne i32 0, %and + %or = or i32 %y, 2048 + %select = select i1 %cmp, i32 %y, i32 %or + %select2 = select i1 %cmp, i32 %z, i32 %w ; to bump up use count of the cmp + %res = mul i32 %select, %select2 + ret i32 %res +} + +; TODO this increased the number of instructions +define i32 @shift_no_xor_multiuse_cmp_or(i32 %x, i32 %y, i32 %z, i32 %w) { +; CHECK-LABEL: @shift_no_xor_multiuse_cmp_or( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[AND]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP2]], [[SELECT2]] +; CHECK-NEXT: [[RES2:%.*]] = mul i32 [[RES]], [[OR]] +; CHECK-NEXT: ret i32 [[RES2]] +; + %and = and i32 %x, 1 + %cmp = icmp eq i32 %and, 0 + %or = or i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + %select2 = select i1 %cmp, i32 %z, i32 %w ; to bump up use count of the cmp + %res = mul i32 %select, %select2 + %res2 = mul i32 %res, %or ; to bump up the use count of the or + ret i32 %res2 +} + +define i32 
@no_shift_no_xor_multiuse_cmp_or(i32 %x, i32 %y, i32 %z, i32 %w) { +; CHECK-LABEL: @no_shift_no_xor_multiuse_cmp_or( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 4096 +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[AND]], [[Y]] +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP1]], [[SELECT2]] +; CHECK-NEXT: [[RES2:%.*]] = mul i32 [[RES]], [[OR]] +; CHECK-NEXT: ret i32 [[RES2]] +; + %and = and i32 %x, 4096 + %cmp = icmp eq i32 %and, 0 + %or = or i32 %y, 4096 + %select = select i1 %cmp, i32 %y, i32 %or + %select2 = select i1 %cmp, i32 %z, i32 %w ; to bump up use count of the cmp + %res = mul i32 %select, %select2 + %res2 = mul i32 %res, %or ; to bump up the use count of the or + ret i32 %res2 +} + +; TODO this increased the number of instructions +define i32 @no_shift_xor_multiuse_cmp_or(i32 %x, i32 %y, i32 %z, i32 %w) { +; CHECK-LABEL: @no_shift_xor_multiuse_cmp_or( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 4096 +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND]], 4096 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP2]], [[SELECT2]] +; CHECK-NEXT: [[RES2:%.*]] = mul i32 [[RES]], [[OR]] +; CHECK-NEXT: ret i32 [[RES2]] +; + %and = and i32 %x, 4096 + %cmp = icmp ne i32 0, %and + %or = or i32 %y, 4096 + %select = select i1 %cmp, i32 %y, i32 %or + %select2 = select i1 %cmp, i32 %z, i32 %w ; to bump up use count of the cmp + %res = mul i32 %select, %select2 + %res2 = mul i32 %res, %or ; to bump up the use count of the or + ret i32 %res2 +} + +; TODO this increased the number of instructions +define i32 @shift_xor_multiuse_cmp_or(i32 %x, i32 %y, i32 %z, i32 %w) { +; CHECK-LABEL: @shift_xor_multiuse_cmp_or( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 2048 +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[AND]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 2048 +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], [[Y]] +; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] +; CHECK-NEXT: [[RES:%.*]] = mul i32 [[TMP3]], [[SELECT2]] +; CHECK-NEXT: [[RES2:%.*]] = mul i32 [[RES]], [[OR]] +; CHECK-NEXT: ret i32 [[RES2]] +; + %and = and i32 %x, 4096 + %cmp = icmp ne i32 0, %and + %or = or i32 %y, 2048 + %select = select i1 %cmp, i32 %y, i32 %or + %select2 = select i1 %cmp, i32 %z, i32 %w ; to bump up use count of the cmp + %res = mul i32 %select, %select2 + %res2 = mul i32 %res, %or ; to bump up the use count of the or + ret i32 %res2 +} diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll index ce8e2fcd38b9b..68bbf35d1e65a 100644 --- a/test/Transforms/InstCombine/shift.ll +++ b/test/Transforms/InstCombine/shift.ll @@ -1306,3 +1306,13 @@ define <2 x i8> @lshr_demanded_bits_splat(<2 x i8> %x) { ret <2 x i8> %shr } +; Make sure known bits works correctly with non power of 2 bit widths. +define i7 @test65(i7 %a, i7 %b) { +; CHECK-LABEL: @test65( +; CHECK-NEXT: ret i7 0 +; + %shiftamt = and i7 %b, 6 ; this ensures the shift amount is even and less than the bit width. 
+ %x = lshr i7 42, %shiftamt ; 42 has a zero in every even numbered bit and a one in every odd bit. + %y = and i7 %x, 1 ; this extracts the lsb which should be 0 because we shifted an even number of bits and all even bits of the shift input are 0. + ret i7 %y +} diff --git a/test/Transforms/InstCombine/xor2.ll b/test/Transforms/InstCombine/xor2.ll index 3afbf632f6e19..49e6b999fbce2 100644 --- a/test/Transforms/InstCombine/xor2.ll +++ b/test/Transforms/InstCombine/xor2.ll @@ -325,3 +325,36 @@ define i32 @test14(i32 %a, i32 %b, i32 %c) { ret i32 %xor } +define i8 @test15(i8 %A, i8 %B) { +; CHECK-LABEL: @test15( +; CHECK-NEXT: [[XOR1:%.*]] = xor i8 [[B:%.*]], [[A:%.*]] +; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A]], 33 +; CHECK-NEXT: [[XOR2:%.*]] = xor i8 [[NOT]], [[B]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[XOR1]], [[XOR2]] +; CHECK-NEXT: [[RES:%.*]] = mul i8 [[AND]], [[XOR2]] +; CHECK-NEXT: ret i8 [[RES]] +; + %xor1 = xor i8 %B, %A + %not = xor i8 %A, 33 + %xor2 = xor i8 %not, %B + %and = and i8 %xor1, %xor2 + %res = mul i8 %and, %xor2 ; to increase the use count for the xor + ret i8 %res +} + +define i8 @test16(i8 %A, i8 %B) { +; CHECK-LABEL: @test16( +; CHECK-NEXT: [[XOR1:%.*]] = xor i8 [[B:%.*]], [[A:%.*]] +; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[A]], 33 +; CHECK-NEXT: [[XOR2:%.*]] = xor i8 [[NOT]], [[B]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[XOR2]], [[XOR1]] +; CHECK-NEXT: [[RES:%.*]] = mul i8 [[AND]], [[XOR2]] +; CHECK-NEXT: ret i8 [[RES]] +; + %xor1 = xor i8 %B, %A + %not = xor i8 %A, 33 + %xor2 = xor i8 %not, %B + %and = and i8 %xor2, %xor1 + %res = mul i8 %and, %xor2 ; to increase the use count for the xor + ret i8 %res +} diff --git a/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll b/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll index ec93847178b58..d52378b864ff9 100644 --- a/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll +++ b/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll @@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu" ;; memcpy.atomic formation (atomic load & store) define void @test1(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test1( -; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1) +; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1) ; CHECK-NOT: store ; CHECK: ret void bb.nph: @@ -30,7 +30,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation (atomic store, normal load) define void @test2(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test2( -; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1) +; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1) ; CHECK-NOT: store ; CHECK: ret void bb.nph: @@ -55,7 +55,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (atomic store, normal load w/ no align) define void @test2b(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test2b( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: @@ -80,7 +80,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (atomic store, normal load w/ bad align) define void @test2c(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test2c( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; 
CHECK: store ; CHECK: ret void bb.nph: @@ -105,7 +105,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (atomic store w/ bad align, normal load) define void @test2d(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test2d( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: @@ -131,7 +131,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation (normal store, atomic load) define void @test3(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test3( -; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1) +; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1) ; CHECK-NOT: store ; CHECK: ret void bb.nph: @@ -156,7 +156,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (normal store w/ no align, atomic load) define void @test3b(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test3b( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: @@ -181,7 +181,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (normal store, atomic load w/ bad align) define void @test3c(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test3c( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: @@ -206,7 +206,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (normal store w/ bad align, atomic load) define void @test3d(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test3d( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: @@ -232,7 +232,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (atomic load, ordered-atomic store) define void @test4(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test4( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: @@ -257,7 +257,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (ordered-atomic load, unordered-atomic store) define void @test5(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test5( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: @@ -282,7 +282,8 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation (atomic load & store) -- element size 2 define void @test6(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test6( -; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 2 %Dest{{[0-9]*}}, i8* align 2 %Base{{[0-9]*}}, i64 %Size, i32 2) +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 1 +; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 2 %Dest{{[0-9]*}}, i8* align 2 %Base{{[0-9]*}}, i64 [[Sz]], i32 2) ; CHECK-NOT: store ; CHECK: ret void bb.nph: @@ -307,7 +308,8 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation (atomic load & store) -- element size 4 define void @test7(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test7( -; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dest{{[0-9]*}}, i8* align 4 %Base{{[0-9]*}}, i64 %Size, i32 4) +; 
CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 2 +; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %Dest{{[0-9]*}}, i8* align 4 %Base{{[0-9]*}}, i64 [[Sz]], i32 4) ; CHECK-NOT: store ; CHECK: ret void bb.nph: @@ -332,7 +334,8 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation (atomic load & store) -- element size 8 define void @test8(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test8( -; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 8 %Dest{{[0-9]*}}, i8* align 8 %Base{{[0-9]*}}, i64 %Size, i32 8) +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 3 +; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %Dest{{[0-9]*}}, i8* align 8 %Base{{[0-9]*}}, i64 [[Sz]], i32 8) ; CHECK-NOT: store ; CHECK: ret void bb.nph: @@ -357,7 +360,8 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (atomic load & store) -- element size 16 define void @test9(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test9( -; CHECK: call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dest{{[0-9]*}}, i8* align 16 %Base{{[0-9]*}}, i64 %Size, i32 16) +; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 4 +; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 %Dest{{[0-9]*}}, i8* align 16 %Base{{[0-9]*}}, i64 [[Sz]], i32 16) ; CHECK-NOT: store ; CHECK: ret void bb.nph: @@ -382,7 +386,7 @@ for.end: ; preds = %for.body, %entry ;; memcpy.atomic formation rejection (atomic load & store) -- element size 32 define void @test10(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test10( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: diff --git a/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll b/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll index b2528f1c24577..341a7a0baebf0 100644 --- a/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll +++ b/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll @@ -5,7 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;; Will not create call due to a max element size of 0 define void @test1(i64 %Size) nounwind ssp { ; CHECK-LABEL: @test1( -; CHECK-NOT: call void @llvm.memcpy.element.atomic +; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic ; CHECK: store ; CHECK: ret void bb.nph: diff --git a/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml b/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml new file mode 100644 index 0000000000000..17b634acd0e1a --- /dev/null +++ b/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml @@ -0,0 +1,19 @@ +--- +TypeIdMap: + typeid1: + TTRes: + Kind: AllOnes + SizeM1BitWidth: 7 + typeid2: + TTRes: + Kind: Single + SizeM1BitWidth: 0 +WithGlobalValueDeadStripping: false +CfiFunctionDefs: + - local_a + - local_b + - does_not_exist +CfiFunctionDecls: + - external + - external_weak +... 
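+# As exercised by import-icall.ll below: local_a is turned into a hidden
+# local_a.cfi definition, the internal local_b is left alone (it is not the
+# external symbol named here), calls to external/external_weak are redirected
+# to hidden *.cfi_jt declarations, and does_not_exist has no definition in
+# the module and must be tolerated.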
diff --git a/test/Transforms/LowerTypeTests/export-icall.ll b/test/Transforms/LowerTypeTests/export-icall.ll new file mode 100644 index 0000000000000..ad36048993067 --- /dev/null +++ b/test/Transforms/LowerTypeTests/export-icall.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-typeid2.yaml -lowertypetests-write-summary=%t < %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @h(i8 %x) !type !2 { + ret void +} + +declare !type !8 void @f(i32 %x) + +!cfi.functions = !{!0, !1, !3, !4, !5, !6} + +; declaration of @h with a different type is ignored +!0 = !{!"h", i8 1, !7} + +; extern_weak declaration of @h with a different type is ignored as well +!1 = !{!"h", i8 2, !8} +!2 = !{i64 0, !"typeid1"} + +; definition of @f replaces types on the IR declaration above +!3 = !{!"f", i8 0, !2} +!4 = !{!"external", i8 1, !2} +!5 = !{!"external_weak", i8 2, !2} +!6 = !{!"g", i8 0, !7} +!7 = !{i64 0, !"typeid2"} +!8 = !{i64 0, !"typeid3"} + + +; CHECK-DAG: @__typeid_typeid1_global_addr = hidden alias i8, bitcast (void ()* [[JT1:.*]] to i8*) +; CHECK-DAG: @__typeid_typeid1_align = hidden alias i8, inttoptr (i8 3 to i8*) +; CHECK-DAG: @__typeid_typeid1_size_m1 = hidden alias i8, inttoptr (i64 3 to i8*) + +; CHECK-DAG: @h = alias void (i8), bitcast (void ()* [[JT1]] to void (i8)*) +; CHECK-DAG: @f = alias void (i32), {{.*}}getelementptr {{.*}}void ()* [[JT1]] +; CHECK-DAG: @external.cfi_jt = hidden alias void (), {{.*}}getelementptr {{.*}}void ()* [[JT1]] +; CHECK-DAG: @external_weak.cfi_jt = hidden alias void (), {{.*}}getelementptr {{.*}}void ()* [[JT1]] + +; CHECK-DAG: @__typeid_typeid2_global_addr = hidden alias i8, bitcast (void ()* [[JT2:.*]] to i8*) + +; CHECK-DAG: @g = alias void (), void ()* [[JT2]] + +; CHECK-DAG: define internal void @h.cfi(i8 {{.*}}) !type !{{.*}} +; CHECK-DAG: declare !type !{{.*}} void @external() +; CHECK-DAG: declare !type !{{.*}} void @external_weak() +; CHECK-DAG: declare !type !{{.*}} void @f.cfi(i32) +; CHECK-DAG: declare !type !{{.*}} void @g.cfi() + + +; SUMMARY: TypeIdMap: +; SUMMARY-NEXT: typeid1: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: AllOnes +; SUMMARY-NEXT: SizeM1BitWidth: 7 +; SUMMARY-NEXT: WPDRes: +; SUMMARY-NEXT: typeid2: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Single +; SUMMARY-NEXT: SizeM1BitWidth: 0 +; SUMMARY-NEXT: WPDRes: + +; SUMMARY: CfiFunctionDefs: +; SUMMARY-NEXT: - f +; SUMMARY-NEXT: - g +; SUMMARY-NEXT: - h +; SUMMARY-NEXT: CfiFunctionDecls: +; SUMMARY-NEXT: - external +; SUMMARY-NEXT: - external_weak +; SUMMARY-NEXT: ... 
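For context on how an importing module is expected to consume the symbols this export test creates: Kind: Single records that the type id has exactly one address point, so a type test against typeid2 can presumably lower to a plain pointer-equality compare with the exported address, while the AllOnes typeid1 additionally needs the _align/_size_m1 symbols for a range check. A rough sketch of the Single case only, with the function name invented for illustration:

@__typeid_typeid2_global_addr = external hidden global i8

define i1 @sketch_typeid2_test(i8* %p) {
  ; the set has a single member, so membership is address equality
  %ok = icmp eq i8* %p, @__typeid_typeid2_global_addr
  ret i1 %ok
}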
diff --git a/test/Transforms/LowerTypeTests/import-icall.ll b/test/Transforms/LowerTypeTests/import-icall.ll new file mode 100644 index 0000000000000..ddeb7fb5c9a2b --- /dev/null +++ b/test/Transforms/LowerTypeTests/import-icall.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%S/Inputs/import-icall.yaml < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i8 @local_a() { + call void @external() + call void @external_weak() + ret i8 1 +} + +define internal i8 @local_b() { + %x = call i8 @local_a() + ret i8 %x +} + +define i8 @use_b() { + %x = call i8 @local_b() + ret i8 %x +} + + +declare void @external() +declare extern_weak void @external_weak() + +; CHECK: define hidden i8 @local_a.cfi() { +; CHECK-NEXT: call void @external.cfi_jt() +; CHECK-NEXT: call void select (i1 icmp ne (void ()* @external_weak, void ()* null), void ()* @external_weak.cfi_jt, void ()* null)() +; CHECK-NEXT: ret i8 1 +; CHECK-NEXT: } + +; internal @local_b is not the same function as "local_b" in the summary. +; CHECK: define internal i8 @local_b() { +; CHECK-NEXT: call i8 @local_a() + +; CHECK: declare void @external() +; CHECK: declare extern_weak void @external_weak() +; CHECK: declare i8 @local_a() +; CHECK: declare hidden void @external.cfi_jt() +; CHECK: declare hidden void @external_weak.cfi_jt() diff --git a/test/Transforms/PGOProfile/memop_size_opt.ll b/test/Transforms/PGOProfile/memop_size_opt.ll index 19a2b7ed293b2..e11f235a48e76 100644 --- a/test/Transforms/PGOProfile/memop_size_opt.ll +++ b/test/Transforms/PGOProfile/memop_size_opt.ll @@ -38,7 +38,7 @@ for.body3: ; MEMOP_OPT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 1, i32 1, i1 false) ; MEMOP_OPT: br label %[[MERGE_LABEL:.*]] ; MEMOP_OPT: [[DEFAULT_LABEL]]: -; MEMOP_OPT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false){{[[:space:]]}} +; MEMOP_OPT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false), !prof [[NEWVP:![0-9]+]] ; MEMOP_OPT: br label %[[MERGE_LABEL]] ; MEMOP_OPT: [[MERGE_LABEL]]: ; MEMOP_OPT: switch i64 %conv, label %[[DEFAULT_LABEL2:.*]] [ @@ -48,11 +48,16 @@ for.body3: ; MEMOP_OPT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src2, i64 1, i32 1, i1 false) ; MEMOP_OPT: br label %[[MERGE_LABEL2:.*]] ; MEMOP_OPT: [[DEFAULT_LABEL2]]: -; MEMOP_OPT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src2, i64 %conv, i32 1, i1 false){{[[:space:]]}} +; MEMOP_OPT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src2, i64 %conv, i32 1, i1 false), !prof [[NEWVP]] ; MEMOP_OPT: br label %[[MERGE_LABEL2]] ; MEMOP_OPT: [[MERGE_LABEL2]]: ; MEMOP_OPT: br label %for.inc ; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99} +; Should be 457 total left (original total count 556, minus 99 from the specialized +; value 1, which is removed from the VP array). Also, we only end up with 5 total +; values, since the default max number of promotions is 5 and therefore +; the rest of the values are ignored when extracting the VP metadata.
+; MEMOP_OPT: [[NEWVP]] = !{!"VP", i32 1, i64 457, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66} for.inc: %inc = add nsw i32 %j.0, 1 diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll new file mode 100644 index 0000000000000..105afa9def5c1 --- /dev/null +++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll @@ -0,0 +1,92 @@ +; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s + +; This test checks that metadata that's invalid after RS4GC is dropped. +; We can miscompile if optimizations scheduled after RS4GC use the +; metadata that's in fact invalid: the collector may relocate objects at a +; statepoint, so facts established before it (invariant.load, noalias scopes, +; dereferenceable) need not hold afterwards. + +declare void @bar() + +declare void @baz(i32) +; Confirm that the loadedval instruction does not contain invariant.load metadata +; but does contain the range metadata. +; Since loadedval is no longer marked invariant, LICM is prevented from +; incorrectly sinking %loadedval and creating an unrelocated use of %baseaddr. +define void @test_invariant_load() gc "statepoint-example" { +; CHECK-LABEL: @test_invariant_load +; CHECK: %loadedval = load i32, i32 addrspace(1)* %baseaddr, align 8, !range !0 +bb: + br label %outerloopHdr + +outerloopHdr: ; preds = %bb6, %bb + %baseaddr = phi i32 addrspace(1)* [ undef, %bb ], [ %tmp4, %bb6 ] +; LICM may sink this load to the exit block after RS4GC because it's tagged invariant. + %loadedval = load i32, i32 addrspace(1)* %baseaddr, align 8, !range !0, !invariant.load !1 + br label %innerloopHdr + +innerloopHdr: ; preds = %innerlooplatch, %outerloopHdr + %tmp4 = phi i32 addrspace(1)* [ %baseaddr, %outerloopHdr ], [ %gep, %innerlooplatch ] + br label %innermostloophdr + +innermostloophdr: ; preds = %bb6, %innerloopHdr + br i1 undef, label %exitblock, label %bb6 + +bb6: ; preds = %innermostloophdr + switch i32 undef, label %innermostloophdr [ + i32 0, label %outerloopHdr + i32 1, label %innerlooplatch + ] + +innerlooplatch: ; preds = %bb6 + call void @bar() + %gep = getelementptr inbounds i32, i32 addrspace(1)* %tmp4, i64 8 + br label %innerloopHdr + +exitblock: ; preds = %innermostloophdr + %tmp13 = add i32 42, %loadedval + call void @baz(i32 %tmp13) + unreachable +} + +; Drop the noalias metadata. +define void @test_noalias(i32 %x, i32 addrspace(1)* %p, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_noalias +; CHECK: %y = load i32, i32 addrspace(1)* %q, align 16 +; CHECK: gc.statepoint +; CHECK: %p.relocated +; CHECK-NEXT: %p.relocated.casted = bitcast i8 addrspace(1)* %p.relocated to i32 addrspace(1)* +; CHECK-NEXT: store i32 %x, i32 addrspace(1)* %p.relocated.casted, align 16 entry: + %y = load i32, i32 addrspace(1)* %q, align 16, !noalias !3 + call void @baz(i32 %x) + store i32 %x, i32 addrspace(1)* %p, align 16, !noalias !4 + ret void +} + +; Drop the dereferenceable metadata. +define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_dereferenceable +; CHECK: %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p +; CHECK-NEXT: %v2 = load i32, i32 addrspace(1)* %v1 +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p, !dereferenceable !5 + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + store i32 %v2, i32 addrspace(1)* %q, align 16 + ret void +} + +declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...)
+ +; Function Attrs: nounwind readonly +declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) #0 + +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) + +attributes #0 = { nounwind readonly } + +!0 = !{i32 0, i32 2147483647} +!1 = !{} +!2 = !{i32 10, i32 1} +!3 = !{!3} +!4 = !{!4} +!5 = !{i64 8} diff --git a/test/Transforms/SLPVectorizer/X86/arith-add.ll b/test/Transforms/SLPVectorizer/X86/arith-add.ll index 0266758b27d23..22b2c7422933b 100644 --- a/test/Transforms/SLPVectorizer/X86/arith-add.ll +++ b/test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F @@ -38,6 +39,25 @@ define void @add_v8i64() { ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; +; SLM-LABEL: @add_v8i64( +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], 
<2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: ret void +; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -111,6 +131,25 @@ define void @add_v16i32() { ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; +; SLM-LABEL: @add_v16i32( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: ret void +; ; AVX-LABEL: @add_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 @@ -216,6 +255,25 @@ define void @add_v32i16() { ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; +; SLM-LABEL: @add_v32i16( +; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = load <8 x 
i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: ret void +; ; AVX-LABEL: @add_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 diff --git a/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/test/Transforms/SLPVectorizer/X86/arith-fp.ll index e00ed849ee4b5..119cf594c905d 100644 --- a/test/Transforms/SLPVectorizer/X86/arith-fp.ll +++ b/test/Transforms/SLPVectorizer/X86/arith-fp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX @@ -69,13 +70,32 @@ define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) { } define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) { -; CHECK-LABEL: @buildvector_div_2f64( -; CHECK-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; 
CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R1]] +; SSE-LABEL: @buildvector_div_2f64( +; SSE-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 +; SSE-NEXT: ret <2 x double> [[R1]] +; +; SLM-LABEL: @buildvector_div_2f64( +; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1 +; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1 +; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]] +; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]] +; SLM-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1 +; SLM-NEXT: ret <2 x double> [[R1]] +; +; AVX-LABEL: @buildvector_div_2f64( +; AVX-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 +; AVX-NEXT: ret <2 x double> [[R1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -317,17 +337,48 @@ define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) { } define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @buildvector_div_4f64( -; CHECK-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x double> [[R3]] +; SSE-LABEL: @buildvector_div_4f64( +; SSE-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 +; SSE-NEXT: ret <4 x double> [[R3]] +; +; SLM-LABEL: @buildvector_div_4f64( +; SLM-NEXT: [[A0:%.*]] = extractelement <4 
x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]] +; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]] +; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]] +; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]] +; SLM-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[C0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[C1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[C2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[C3]], i32 3 +; SLM-NEXT: ret <4 x double> [[R3]] +; +; AVX-LABEL: @buildvector_div_4f64( +; AVX-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 +; AVX-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 +; AVX-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 +; AVX-NEXT: ret <4 x double> [[R3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -745,25 +796,80 @@ define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) { } define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) { -; CHECK-LABEL: @buildvector_div_8f64( -; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x double> [[R7]] +; SSE-LABEL: 
@buildvector_div_8f64( +; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 +; SSE-NEXT: ret <8 x double> [[R7]] +; +; SLM-LABEL: @buildvector_div_8f64( +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3 +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7 +; SLM-NEXT: [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 +; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]] +; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]] +; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]] +; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]] +; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]] +; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]] +; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]] +; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], 
double [[C7]], i32 7 +; SLM-NEXT: ret <8 x double> [[R7]] +; +; AVX-LABEL: @buildvector_div_8f64( +; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 +; AVX-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 +; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 +; AVX-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 +; AVX-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 +; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 +; AVX-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 +; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 +; AVX-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 +; AVX-NEXT: ret <8 x double> [[R7]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 diff --git a/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/test/Transforms/SLPVectorizer/X86/arith-mul.ll index 95875d7f01fd1..4763a9a2bf12b 100644 --- a/test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ b/test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F @@ -54,6 +55,41 @@ define void @mul_v8i64() { ; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SSE-NEXT: ret void ; +; SLM-LABEL: @mul_v8i64( +; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SLM-NEXT: [[A6:%.*]] = load 
i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SLM-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] +; SLM-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] +; SLM-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] +; SLM-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] +; SLM-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] +; SLM-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] +; SLM-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] +; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SLM-NEXT: ret void +; ; AVX1-LABEL: @mul_v8i64( ; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 ; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 @@ -162,6 +198,25 @@ define void @mul_v16i32() { ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; +; SLM-LABEL: @mul_v16i32( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: 
[[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: ret void +; ; AVX-LABEL: @mul_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 @@ -267,6 +322,25 @@ define void @mul_v32i16() { ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; +; SLM-LABEL: @mul_v32i16( +; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast 
(i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: ret void +; ; AVX-LABEL: @mul_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 diff --git a/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/test/Transforms/SLPVectorizer/X86/arith-sub.ll index 85838369e2266..2bbaaca02d88b 100644 --- a/test/Transforms/SLPVectorizer/X86/arith-sub.ll +++ b/test/Transforms/SLPVectorizer/X86/arith-sub.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F @@ -38,6 +39,25 @@ define void @sub_v8i64() { ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; +; SLM-LABEL: @sub_v8i64( +; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to 
<2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: ret void +; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 @@ -111,6 +131,25 @@ define void @sub_v16i32() { ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; +; SLM-LABEL: @sub_v16i32( +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: ret void +; ; AVX-LABEL: @sub_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 @@ -216,6 +255,25 @@ define void @sub_v32i16() { ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x 
i16>*), align 2 ; SSE-NEXT: ret void ; +; SLM-LABEL: @sub_v32i16( +; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]] +; SLM-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]] +; SLM-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]] +; SLM-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: ret void +; ; AVX-LABEL: @sub_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 diff --git a/test/Transforms/SafeStack/X86/debug-loc.ll b/test/Transforms/SafeStack/X86/debug-loc.ll index 88cda693b2932..d6b217142bfef 100644 --- a/test/Transforms/SafeStack/X86/debug-loc.ll +++ b/test/Transforms/SafeStack/X86/debug-loc.ll @@ -37,10 +37,10 @@ entry: ; CHECK-DAG: ![[VAR_ARG]] = !DILocalVariable(name: "zzz" ; 100 aligned up to 8 -; CHECK-DAG: ![[EXPR_ARG]] = !DIExpression(DW_OP_minus, 104) +; CHECK-DAG: ![[EXPR_ARG]] = !DIExpression(DW_OP_constu, 104, DW_OP_minus ; CHECK-DAG: ![[VAR_LOCAL]] = !DILocalVariable(name: "xxx" -; CHECK-DAG: ![[EXPR_LOCAL]] = !DIExpression(DW_OP_minus, 208) +; CHECK-DAG: ![[EXPR_LOCAL]] = !DIExpression(DW_OP_constu, 208, DW_OP_minus ; Function Attrs: nounwind readnone declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 diff --git a/test/Transforms/SafeStack/X86/debug-loc2.ll b/test/Transforms/SafeStack/X86/debug-loc2.ll index 8059a722fd45c..731516c3c65ed 100644 --- a/test/Transforms/SafeStack/X86/debug-loc2.ll +++ b/test/Transforms/SafeStack/X86/debug-loc2.ll @@ -84,8 +84,8 @@ attributes #4 = { nounwind } !13 = !DILocation(line: 5, column: 3, scope: !6) !14 = !DILocation(line: 6, column: 3, 
scope: !6) -; CHECK-DAG: ![[X1_EXPR]] = !DIExpression(DW_OP_deref, DW_OP_minus, 4) -; CHECK-DAG: ![[X2_EXPR]] = !DIExpression(DW_OP_deref, DW_OP_minus, 8) +; CHECK-DAG: ![[X1_EXPR]] = !DIExpression(DW_OP_deref, DW_OP_constu, 4, DW_OP_minus) +; CHECK-DAG: ![[X2_EXPR]] = !DIExpression(DW_OP_deref, DW_OP_constu, 8, DW_OP_minus) !15 = !DIExpression(DW_OP_deref) !16 = !DILocation(line: 5, column: 7, scope: !6) !17 = !DILocation(line: 8, column: 3, scope: !6) @@ -95,4 +95,4 @@ attributes #4 = { nounwind } !21 = !DILocation(line: 10, column: 1, scope: !22) !22 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 1) !23 = !DIExpression() -!24 = !DIExpression(DW_OP_minus, 42) +!24 = !DIExpression(DW_OP_constu, 42, DW_OP_minus) diff --git a/test/Transforms/Util/PredicateInfo/pr33456.ll b/test/Transforms/Util/PredicateInfo/pr33456.ll new file mode 100644 index 0000000000000..f1cc83a071b96 --- /dev/null +++ b/test/Transforms/Util/PredicateInfo/pr33456.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -print-predicateinfo -analyze < %s 2>&1 | FileCheck %s +; Don't insert predicate info for conditions with a single target. +@a = global i32 1, align 4 +@d = common global i32 0, align 4 +@c = common global i32 0, align 4 +@b = common global i32 0, align 4 +@e = common global i32 0, align 4 + +define i32 @main() { +; CHECK-LABEL: @main( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @d, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP13:%.*]] +; CHECK: [[TMP4:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* @c, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[TMP5]], 1 +; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]] +; CHECK: [[TMP8:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9]], label [[TMP9]] +; CHECK: [[DOT0:%.*]] = phi i32 [ [[TMP4]], [[TMP7]] ], [ [[TMP4]], [[TMP7]] ], [ [[DOT1:%.*]], [[TMP13]] ], [ [[TMP4]], [[TMP3]] ] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* @b, align 4 +; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[DOT0]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[TMP13]], label [[TMP13]] +; CHECK: [[DOT1]] = phi i32 [ [[DOT0]], [[TMP9]] ], [ [[DOT0]], [[TMP9]] ], [ undef, [[TMP0:%.*]] ] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* @e, align 4 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label [[TMP16:%.*]], label [[TMP9]] +; CHECK: ret i32 0 +; + %1 = load i32, i32* @d, align 4 + %2 = icmp eq i32 %1, 0 + br i1 %2, label %3, label %13 + +; <label>:3: ; preds = %0 + %4 = load i32, i32* @a, align 4 + %5 = load i32, i32* @c, align 4 + %6 = icmp slt i32 %5, 1 + br i1 %6, label %7, label %9 + +; <label>:7: ; preds = %3 + %8 = icmp eq i32 %4, 0 + br i1 %8, label %9, label %9 + +; <label>:9: ; preds = %13, %7, %7, %3 + %.0 = phi i32 [ %4, %7 ], [ %4, %7 ], [ %.1, %13 ], [ %4, %3 ] + %10 = load i32, i32* @b, align 4 + %11 = sdiv i32 %10, %.0 + %12 = icmp eq i32 %11, 0 + br i1 %12, label %13, label %13 + +; <label>:13: ; preds = %9, %9, %0 + %.1 = phi i32 [ %.0, %9 ], [ %.0, %9 ], [ undef, %0 ] + %14 = load i32, i32* @e, align 4 + %15 = icmp eq i32 %14, 0 + br i1 %15, label %16, label %9 + +; <label>:16: ; preds = %13 + ret i32 0 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) + +; Function Attrs: argmemonly nounwind +declare 
void @llvm.lifetime.end.p0i8(i64, i8* nocapture) + diff --git a/test/Transforms/Util/PredicateInfo/pr33457.ll b/test/Transforms/Util/PredicateInfo/pr33457.ll new file mode 100644 index 0000000000000..b975ade9321db --- /dev/null +++ b/test/Transforms/Util/PredicateInfo/pr33457.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -print-predicateinfo -analyze < %s 2>&1 | FileCheck %s +; Don't insert predicate info for conditions with a single target. +@a = global i32 6, align 4 +@c = global i32 -1, align 4 +@e = common global i32 0, align 4 +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 +@d = common global i32 0, align 4 +@b = common global [6 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind ssp uwtable +define i32 @main() { +; CHECK-LABEL: @main( +; CHECK-NEXT: store i32 6, i32* @e, align 4 +; CHECK-NEXT: br label [[TMP1:%.*]] +; CHECK: [[TMP2:%.*]] = load i32, i32* @d, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [6 x i32], [6 x i32]* @b, i64 0, i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %thread-pre-split, label [[TMP9:%.*]] +; CHECK: [[TMP10:%.*]] = load i32, i32* @e, align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP12]] +; CHECK: thread-pre-split: +; CHECK-NEXT: [[DOTPR:%.*]] = load i32, i32* @e, align 4 +; CHECK-NEXT: br label [[TMP12]] +; CHECK: [[TMP13:%.*]] = phi i32 [ [[DOTPR]], %thread-pre-split ], [ [[TMP10]], [[TMP9]] ], [ [[TMP10]], [[TMP9]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP15]] +; CHECK: br i1 [[TMP14]], label [[TMP16:%.*]], label [[TMP17:%.*]] +; CHECK: br label [[TMP17]] +; CHECK: [[DOT0:%.*]] = phi i32 [ 1, [[TMP16]] ], [ -1, [[TMP15]] ] +; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[DOT0]], 8693 +; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* @c, align 4 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = xor i32 [[TMP20]], -1 +; CHECK-NEXT: store i32 [[TMP21]], i32* @d, align 4 +; CHECK-NEXT: [[TMP22:%.*]] = icmp slt i32 [[TMP20]], -2 +; CHECK-NEXT: br i1 [[TMP22]], label [[TMP1]], label [[TMP23:%.*]] +; CHECK: ret i32 0 +; + store i32 6, i32* @e, align 4 + br label %1 + +; <label>:1: ; preds = %17, %0 + %2 = load i32, i32* @d, align 4 + %3 = sext i32 %2 to i64 + %4 = getelementptr inbounds [6 x i32], [6 x i32]* @b, i64 0, i64 %3 + %5 = load i32, i32* %4, align 4 + %6 = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %5) #2 + %7 = load i32, i32* @a, align 4 + %8 = icmp eq i32 %7, 0 + br i1 %8, label %thread-pre-split, label %9 + +; <label>:9: ; preds = %1 + %10 = load i32, i32* @e, align 4 + %11 = icmp eq i32 %10, 0 + br i1 %11, label %12, label %12 + +thread-pre-split: ; preds = %1 + %.pr = load i32, i32* @e, align 4 + br label %12 + +; <label>:12: ; preds = %thread-pre-split, %9, %9 + %13 = phi i32 [ %.pr, %thread-pre-split ], [ %10, %9 ], [ %10, %9 ] + %14 = icmp ne i32 %13, 0 + br i1 %14, label %15, label %15 + +; <label>:15: ; preds = %12, %12 + br i1 %14, label %16, label %17 + +; <label>:16: ; preds = %15 + br label %17 + +; <label>:17: ; preds = %16, %15 + %.0 = phi i32 [ 1, %16 ], [ -1, %15 ] + %18 = and i32 %.0, 8693 + %19 = load i32, i32* @c, align 4 + %20 = xor i32 %18, %19 + %21 = xor i32 %20, -1 + store i32 %21, i32* @d, align 4 + %22 = icmp slt i32 %20, -2 + br i1 %22, label %1, label %23 + +; <label>:23: ; preds = %17 + ret i32 0 +} + +declare i32 @printf(i8*, ...) +
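A closing observation on the two PredicateInfo tests above: both hinge on the rule stated in their header comment, namely that a br i1 whose true and false labels name the same block predicates nothing, so no predicateinfo copies should be inserted for its condition there, while the same condition on a branch with two distinct targets does get its uses renamed. A tiny illustrative function (name invented, not from the tests) showing the two situations side by side:

define i32 @sketch(i32 %v) {
entry:
  %c = icmp eq i32 %v, 0
  ; single target: effectively unconditional, no predicate info for %c
  br i1 %c, label %merge, label %merge

merge:
  ; two distinct targets: uses of %c dominated by each edge are renamed
  br i1 %c, label %t, label %f

t:
  ret i32 0

f:
  ret i32 1
}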