summaryrefslogtreecommitdiff
path: root/test/Transforms
diff options
context:
space:
mode:
Diffstat (limited to 'test/Transforms')
-rw-r--r--test/Transforms/CorrelatedValuePropagation/select.ll22
-rw-r--r--test/Transforms/GVN/unreachable_block_infinite_loop.ll29
-rw-r--r--test/Transforms/IndVarSimplify/exit_value_test2.ll52
-rw-r--r--test/Transforms/IndVarSimplify/exit_value_test3.ll24
-rw-r--r--test/Transforms/IndVarSimplify/lcssa-preservation.ll3
-rw-r--r--test/Transforms/InstCombine/fpcast.ll8
-rw-r--r--test/Transforms/InstCombine/load-bitcast32.ll79
-rw-r--r--test/Transforms/InstCombine/load-bitcast64.ll78
-rw-r--r--test/Transforms/InstCombine/pr23751.ll13
-rw-r--r--test/Transforms/InstCombine/select.ll13
-rw-r--r--test/Transforms/LoopUnroll/full-unroll-bad-cost.ll58
-rw-r--r--test/Transforms/LoopUnroll/full-unroll-bad-geps.ll2
-rw-r--r--test/Transforms/LoopUnroll/full-unroll-heuristics.ll12
-rw-r--r--test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll37
-rw-r--r--test/Transforms/LoopVectorize/interleaved-accesses.ll467
-rw-r--r--test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll27
-rw-r--r--test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll8
-rw-r--r--test/Transforms/MergeFunc/linkonce_odr.ll30
-rw-r--r--test/Transforms/NaryReassociate/NVPTX/nary-gep.ll31
-rw-r--r--test/Transforms/Reassociate/basictest.ll4
-rw-r--r--test/Transforms/Reassociate/canonicalize-neg-const.ll22
-rw-r--r--test/Transforms/RewriteStatepointsForGC/deref-pointers.ll77
-rw-r--r--test/Transforms/SeparateConstOffsetFromGEP/R600/lit.local.cfg3
-rw-r--r--test/Transforms/SeparateConstOffsetFromGEP/R600/split-gep-and-gvn-addrspace-addressing-modes.ll94
-rw-r--r--test/Transforms/Sink/convergent.ll24
25 files changed, 1161 insertions, 56 deletions
diff --git a/test/Transforms/CorrelatedValuePropagation/select.ll b/test/Transforms/CorrelatedValuePropagation/select.ll
index 5501438f690f5..d88e3e462a205 100644
--- a/test/Transforms/CorrelatedValuePropagation/select.ll
+++ b/test/Transforms/CorrelatedValuePropagation/select.ll
@@ -51,3 +51,25 @@ else:
ret i8 %b
}
+@c = global i32 0, align 4
+@b = global i32 0, align 4
+
+; CHECK-LABEL: @PR23752(
+define i32 @PR23752() {
+entry:
+ br label %for.body
+
+for.body:
+ %phi = phi i32 [ 0, %entry ], [ %sel, %for.body ]
+ %sel = select i1 icmp sgt (i32* @b, i32* @c), i32 %phi, i32 1
+ %cmp = icmp ne i32 %sel, 1
+ br i1 %cmp, label %for.body, label %if.end
+
+; CHECK: %[[sel:.*]] = select i1 icmp sgt (i32* @b, i32* @c), i32 0, i32 1
+; CHECK-NEXT: %[[cmp:.*]] = icmp ne i32 %[[sel]], 1
+; CHECK-NEXT: br i1 %[[cmp]]
+
+if.end:
+ ret i32 %sel
+; CHECK: ret i32 %[[sel]]
+}
diff --git a/test/Transforms/GVN/unreachable_block_infinite_loop.ll b/test/Transforms/GVN/unreachable_block_infinite_loop.ll
index fca5a28b38dd6..a47e9e4c3a044 100644
--- a/test/Transforms/GVN/unreachable_block_infinite_loop.ll
+++ b/test/Transforms/GVN/unreachable_block_infinite_loop.ll
@@ -12,3 +12,32 @@ unreachable_block:
ret i32 %a
}
+define i32 @pr23096_test0() {
+entry:
+ br label %bb0
+
+bb1:
+ %ptr1 = ptrtoint i32* %ptr2 to i64
+ %ptr2 = inttoptr i64 %ptr1 to i32*
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %phi = phi i32* [ undef, %entry ], [ %ptr2, %bb1 ]
+ %load = load i32, i32* %phi
+ ret i32 %load
+}
+
+define i32 @pr23096_test1() {
+entry:
+ br label %bb0
+
+bb1:
+ %ptr1 = getelementptr i32, i32* %ptr2, i32 0
+ %ptr2 = getelementptr i32, i32* %ptr1, i32 0
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %phi = phi i32* [ undef, %entry ], [ %ptr2, %bb1 ]
+ %load = load i32, i32* %phi
+ ret i32 %load
+}
diff --git a/test/Transforms/IndVarSimplify/exit_value_test2.ll b/test/Transforms/IndVarSimplify/exit_value_test2.ll
new file mode 100644
index 0000000000000..24e3e95a89182
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/exit_value_test2.ll
@@ -0,0 +1,52 @@
+; PR23538
+; RUN: opt < %s -indvars -loop-deletion -S | FileCheck %s
+
+; Check that IndVarSimplify should not replace the exit value, because otherwise
+; a udiv will be introduced during SCEV expansion and the cost will be high.
+;
+; CHECK-LABEL: @_Z3fooPKcjj(
+; CHECK-NOT: udiv
+
+declare void @_Z3mixRjj(i32* dereferenceable(4), i32)
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+define i32 @_Z3fooPKcjj(i8* nocapture readonly %s, i32 %len, i32 %c) {
+entry:
+ %a = alloca i32, align 4
+ %tmp = bitcast i32* %a to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %tmp)
+ store i32 -1640531527, i32* %a, align 4
+ %cmp8 = icmp ugt i32 %len, 11
+ br i1 %cmp8, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body, %while.body.lr.ph
+ %keylen.010 = phi i32 [ %len, %while.body.lr.ph ], [ %sub, %while.body ]
+ %s.addr.09 = phi i8* [ %s, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %tmp1 = bitcast i8* %s.addr.09 to i32*
+ %tmp2 = load i32, i32* %tmp1, align 4
+ %shl.i = shl i32 %tmp2, 1
+ %and.i = and i32 %shl.i, 16843008
+ %tmp3 = load i32, i32* %a, align 4
+ %sub.i = add i32 %tmp3, %tmp2
+ %add = sub i32 %sub.i, %and.i
+ store i32 %add, i32* %a, align 4
+ %add.ptr = getelementptr inbounds i8, i8* %s.addr.09, i64 12
+ %sub = add i32 %keylen.010, -12
+ %cmp = icmp ugt i32 %sub, 11
+ br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
+
+while.cond.while.end_crit_edge: ; preds = %while.body
+ %sub.lcssa = phi i32 [ %sub, %while.body ]
+ br label %while.end
+
+while.end: ; preds = %while.cond.while.end_crit_edge, %entry
+ %keylen.0.lcssa = phi i32 [ %sub.lcssa, %while.cond.while.end_crit_edge ], [ %len, %entry ]
+ call void @_Z3mixRjj(i32* dereferenceable(4) %a, i32 %keylen.0.lcssa)
+ %tmp4 = load i32, i32* %a, align 4
+ call void @llvm.lifetime.end(i64 4, i8* %tmp)
+ ret i32 %tmp4
+}
diff --git a/test/Transforms/IndVarSimplify/exit_value_test3.ll b/test/Transforms/IndVarSimplify/exit_value_test3.ll
new file mode 100644
index 0000000000000..2051d2a46b20a
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/exit_value_test3.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -indvars -loop-deletion -S | FileCheck %s
+
+; Check IndVarSimplify should replace exit value even if the expansion cost
+; is high because the loop can be deleted after the exit value rewrite.
+;
+; CHECK-LABEL: @_Z3fooPKcjj(
+; CHECK: udiv
+; CHECK: [[LABEL:^[a-zA-Z0-9_.]+]]:
+; CHECK-NOT: br {{.*}} [[LABEL]]
+
+define i32 @_Z3fooPKcjj(i8* nocapture readnone %s, i32 %len, i32 %c) #0 {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+ %klen.0 = phi i32 [ %len, %entry ], [ %sub, %while.cond ]
+ %cmp = icmp ugt i32 %klen.0, 11
+ %sub = add i32 %klen.0, -12
+ br i1 %cmp, label %while.cond, label %while.end
+
+while.end: ; preds = %while.cond
+ %klen.0.lcssa = phi i32 [ %klen.0, %while.cond ]
+ ret i32 %klen.0.lcssa
+}
diff --git a/test/Transforms/IndVarSimplify/lcssa-preservation.ll b/test/Transforms/IndVarSimplify/lcssa-preservation.ll
index f69c96ce02105..5d502f3b6eafc 100644
--- a/test/Transforms/IndVarSimplify/lcssa-preservation.ll
+++ b/test/Transforms/IndVarSimplify/lcssa-preservation.ll
@@ -1,5 +1,4 @@
-; RUN: opt < %s -indvars -S | FileCheck %s
-;
+; RUN: opt < %s -indvars -replexitval=always -S | FileCheck %s
; Make sure IndVars preserves LCSSA form, especially across loop nests.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll
index 8319624b87c9d..93a64e6b49bd4 100644
--- a/test/Transforms/InstCombine/fpcast.ll
+++ b/test/Transforms/InstCombine/fpcast.ll
@@ -85,3 +85,11 @@ define float @test8(float %V) {
; CHECK-NEXT: %[[trunc:.*]] = fptrunc double %frem to float
; CHECK-NEXT: ret float %trunc
}
+
+; CHECK-LABEL: @test_fptrunc_fptrunc
+; CHECK-NOT: fptrunc double {{.*}} to half
+define half @test_fptrunc_fptrunc(double %V) {
+ %t1 = fptrunc double %V to float
+ %t2 = fptrunc float %t1 to half
+ ret half %t2
+}
diff --git a/test/Transforms/InstCombine/load-bitcast32.ll b/test/Transforms/InstCombine/load-bitcast32.ll
new file mode 100644
index 0000000000000..b1c78a8a314eb
--- /dev/null
+++ b/test/Transforms/InstCombine/load-bitcast32.ll
@@ -0,0 +1,79 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "p:32:32:32"
+
+
+define i64* @test1(i8* %x) {
+entry:
+; CHECK-LABEL: @test1(
+; CHECK: load i64, i64*
+; CHECK: ret
+ %a = bitcast i8* %x to i64*
+ %b = load i64, i64* %a
+ %c = inttoptr i64 %b to i64*
+
+ ret i64* %c
+}
+
+define i32* @test2(i8* %x) {
+entry:
+; CHECK-LABEL: @test2(
+; CHECK: load i32*, i32**
+; CHECK: ret
+ %a = bitcast i8* %x to i32*
+ %b = load i32, i32* %a
+ %c = inttoptr i32 %b to i32*
+
+ ret i32* %c
+}
+
+define i64* @test3(i8* %x) {
+entry:
+; CHECK-LABEL: @test3(
+; CHECK: load i64*, i64**
+; CHECK: ret
+ %a = bitcast i8* %x to i32*
+ %b = load i32, i32* %a
+ %c = inttoptr i32 %b to i64*
+
+ ret i64* %c
+}
+
+define i64 @test4(i8* %x) {
+entry:
+; CHECK-LABEL: @test4(
+; CHECK: load i32, i32*
+; CHECK: zext
+; CHECK: ret
+ %a = bitcast i8* %x to i64**
+ %b = load i64*, i64** %a
+ %c = ptrtoint i64* %b to i64
+
+ ret i64 %c
+}
+
+define i32 @test5(i8* %x) {
+entry:
+; CHECK-LABEL: @test5(
+; CHECK: load i32, i32*
+; CHECK: ret
+ %a = bitcast i8* %x to i32**
+ %b = load i32*, i32** %a
+ %c = ptrtoint i32* %b to i32
+
+ ret i32 %c
+}
+
+define i64 @test6(i8* %x) {
+entry:
+; CHECK-LABEL: @test6(
+; CHECK: load i32, i32*
+; CHECK: zext
+; CHECK: ret
+ %a = bitcast i8* %x to i32**
+ %b = load i32*, i32** %a
+ %c = ptrtoint i32* %b to i64
+
+ ret i64 %c
+}
+
diff --git a/test/Transforms/InstCombine/load-bitcast64.ll b/test/Transforms/InstCombine/load-bitcast64.ll
new file mode 100644
index 0000000000000..d14c686d83ea3
--- /dev/null
+++ b/test/Transforms/InstCombine/load-bitcast64.ll
@@ -0,0 +1,78 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "p:64:64:64"
+
+
+define i64* @test1(i8* %x) {
+entry:
+; CHECK-LABEL: @test1(
+; CHECK: load i64*, i64**
+; CHECK: ret
+ %a = bitcast i8* %x to i64*
+ %b = load i64, i64* %a
+ %c = inttoptr i64 %b to i64*
+
+ ret i64* %c
+}
+
+define i32* @test2(i8* %x) {
+entry:
+; CHECK-LABEL: @test2(
+; CHECK: load i32, i32*
+; CHECK: ret
+ %a = bitcast i8* %x to i32*
+ %b = load i32, i32* %a
+ %c = inttoptr i32 %b to i32*
+
+ ret i32* %c
+}
+
+define i64* @test3(i8* %x) {
+entry:
+; CHECK-LABEL: @test3(
+; CHECK: load i32, i32*
+; CHECK: ret
+ %a = bitcast i8* %x to i32*
+ %b = load i32, i32* %a
+ %c = inttoptr i32 %b to i64*
+
+ ret i64* %c
+}
+
+define i64 @test4(i8* %x) {
+entry:
+; CHECK-LABEL: @test4(
+; CHECK: load i64, i64*
+; CHECK: ret
+ %a = bitcast i8* %x to i64**
+ %b = load i64*, i64** %a
+ %c = ptrtoint i64* %b to i64
+
+ ret i64 %c
+}
+
+define i32 @test5(i8* %x) {
+entry:
+; CHECK-LABEL: @test5(
+; CHECK: load i64, i64*
+; CHECK: trunc
+; CHECK: ret
+ %a = bitcast i8* %x to i32**
+ %b = load i32*, i32** %a
+ %c = ptrtoint i32* %b to i32
+
+ ret i32 %c
+}
+
+define i64 @test6(i8* %x) {
+entry:
+; CHECK-LABEL: @test6(
+; CHECK: load i64, i64*
+; CHECK: ret
+ %a = bitcast i8* %x to i32**
+ %b = load i32*, i32** %a
+ %c = ptrtoint i32* %b to i64
+
+ ret i64 %c
+}
+
diff --git a/test/Transforms/InstCombine/pr23751.ll b/test/Transforms/InstCombine/pr23751.ll
new file mode 100644
index 0000000000000..d7840be2f8374
--- /dev/null
+++ b/test/Transforms/InstCombine/pr23751.ll
@@ -0,0 +1,13 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+@d = common global i32 0, align 4
+
+define i1 @f(i8 zeroext %p) #1 {
+; CHECK-NOT: ret i1 false
+ %1 = zext i8 %p to i32
+ %2 = load i32, i32* @d, align 4
+ %3 = or i32 %2, -2
+ %4 = add nsw i32 %3, %1
+ %5 = icmp ugt i32 %1, %4
+ ret i1 %5
+}
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index e4bc96cff1764..27e487b4815e5 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -1532,3 +1532,16 @@ define i32 @test_max_of_min(i32 %a) {
%s1 = select i1 %c1, i32 %s0, i32 -1
ret i32 %s1
}
+
+
+define i32 @PR23757(i32 %x) {
+; CHECK-LABEL: @PR23757
+; CHECK: %[[cmp:.*]] = icmp eq i32 %x, 2147483647
+; CHECK-NEXT: %[[add:.*]] = add nsw i32 %x, 1
+; CHECK-NEXT: %[[sel:.*]] = select i1 %[[cmp]], i32 -2147483648, i32 %[[add]]
+; CHECK-NEXT: ret i32 %[[sel]]
+ %cmp = icmp eq i32 %x, 2147483647
+ %add = add nsw i32 %x, 1
+ %sel = select i1 %cmp, i32 -2147483648, i32 %add
+ ret i32 %sel
+}
diff --git a/test/Transforms/LoopUnroll/full-unroll-bad-cost.ll b/test/Transforms/LoopUnroll/full-unroll-bad-cost.ll
new file mode 100644
index 0000000000000..e5694fbeb0ce5
--- /dev/null
+++ b/test/Transforms/LoopUnroll/full-unroll-bad-cost.ll
@@ -0,0 +1,58 @@
+; RUN: opt -S -loop-unroll < %s | FileCheck %s
+
+; LLVM should not try to fully unroll this loop.
+
+declare void @f()
+declare void @g()
+declare void @h()
+
+define void @trivial_loop() {
+; CHECK-LABEL: @trivial_loop(
+ entry:
+ br label %loop
+
+ loop:
+ %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+ %idx.inc = add i32 %idx, 1
+ call void @f()
+ call void @g()
+ call void @h()
+ call void @f()
+ call void @g()
+ call void @h()
+ call void @f()
+ call void @g()
+ call void @h()
+ call void @f()
+ call void @g()
+ call void @h()
+ call void @f()
+ call void @g()
+ call void @h()
+ %be = icmp slt i32 %idx, 268435456
+ br i1 %be, label %loop, label %exit
+
+; CHECK: loop:
+; CHECK-NEXT: %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT: %idx.inc = add i32 %idx, 1
+; CHECK-NEXT: call void @f()
+; CHECK-NEXT: call void @g()
+; CHECK-NEXT: call void @h()
+; CHECK-NEXT: call void @f()
+; CHECK-NEXT: call void @g()
+; CHECK-NEXT: call void @h()
+; CHECK-NEXT: call void @f()
+; CHECK-NEXT: call void @g()
+; CHECK-NEXT: call void @h()
+; CHECK-NEXT: call void @f()
+; CHECK-NEXT: call void @g()
+; CHECK-NEXT: call void @h()
+; CHECK-NEXT: call void @f()
+; CHECK-NEXT: call void @g()
+; CHECK-NEXT: call void @h()
+; CHECK-NEXT: %be = icmp slt i32 %idx, 268435456
+; CHECK-NEXT: br i1 %be, label %loop, label %exit
+
+ exit:
+ ret void
+}
diff --git a/test/Transforms/LoopUnroll/full-unroll-bad-geps.ll b/test/Transforms/LoopUnroll/full-unroll-bad-geps.ll
index 4c99bc73880ba..ac814526647e9 100644
--- a/test/Transforms/LoopUnroll/full-unroll-bad-geps.ll
+++ b/test/Transforms/LoopUnroll/full-unroll-bad-geps.ll
@@ -1,5 +1,5 @@
; Check that we don't crash on corner cases.
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=10 -unroll-threshold=10 -unroll-percent-of-optimized-for-complete-unroll=20 -o /dev/null
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=20 -o /dev/null
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @foo1() {
diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics.ll
index 2dab2fbf2e498..904a65a1bc0e3 100644
--- a/test/Transforms/LoopUnroll/full-unroll-heuristics.ll
+++ b/test/Transforms/LoopUnroll/full-unroll-heuristics.ll
@@ -1,8 +1,8 @@
; In this test we check how heuristics for complete unrolling work. We have
; three knobs:
; 1) -unroll-threshold
-; 2) -unroll-absolute-threshold and
-; 3) -unroll-percent-of-optimized-for-complete-unroll
+; 2) -unroll-percent-dynamic-cost-saved-threshold and
+; 3) -unroll-dynamic-cost-savings-discount
;
; They control loop-unrolling according to the following rules:
; * If size of unrolled loop exceeds the absoulte threshold, we don't unroll
@@ -17,10 +17,10 @@
; optimizations to remove ~55% of the instructions, the loop body size is 9,
; and unrolled size is 65.
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=10 -unroll-threshold=10 -unroll-percent-of-optimized-for-complete-unroll=20 | FileCheck %s -check-prefix=TEST1
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=100 -unroll-threshold=10 -unroll-percent-of-optimized-for-complete-unroll=20 | FileCheck %s -check-prefix=TEST2
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=100 -unroll-threshold=10 -unroll-percent-of-optimized-for-complete-unroll=80 | FileCheck %s -check-prefix=TEST3
-; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-absolute-threshold=100 -unroll-threshold=100 -unroll-percent-of-optimized-for-complete-unroll=80 | FileCheck %s -check-prefix=TEST4
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=20 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s -check-prefix=TEST1
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=20 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s -check-prefix=TEST2
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=80 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s -check-prefix=TEST3
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=100 -unroll-percent-dynamic-cost-saved-threshold=80 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s -check-prefix=TEST4
; If the absolute threshold is too low, or if we can't optimize away requested
; percent of instructions, we shouldn't unroll:
diff --git a/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
index 4cd703f64582a..f16ee4171da92 100644
--- a/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
+++ b/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s
-; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC
+; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses=true | FileCheck %s
+; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -enable-interleaved-mem-accesses=true | FileCheck %s --check-prefix=FORCE-VEC
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnueabi"
@@ -102,26 +102,23 @@ for.end: ; preds = %for.body
; }
; CHECK-LABEL: @ptr_ind_plus2(
-; CHECK: load i32, i32*
-; CHECK: load i32, i32*
-; CHECK: load i32, i32*
-; CHECK: load i32, i32*
-; CHECK: mul nsw i32
-; CHECK: mul nsw i32
-; CHECK: add nsw i32
-; CHECK: add nsw i32
-; CHECK: %index.next = add i64 %index, 2
-; CHECK: %21 = icmp eq i64 %index.next, 1024
+; CHECK: %[[V0:.*]] = load <8 x i32>
+; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %[[V1:.*]] = load <8 x i32>
+; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: mul nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: %index.next = add i64 %index, 8
+; CHECK: icmp eq i64 %index.next, 1024
; FORCE-VEC-LABEL: @ptr_ind_plus2(
-; FORCE-VEC: load i32, i32*
-; FORCE-VEC: insertelement <2 x i32>
-; FORCE-VEC: load i32, i32*
-; FORCE-VEC: insertelement <2 x i32>
-; FORCE-VEC: load i32, i32*
-; FORCE-VEC: insertelement <2 x i32>
-; FORCE-VEC: load i32, i32*
-; FORCE-VEC: insertelement <2 x i32>
+; FORCE-VEC: %[[V:.*]] = load <4 x i32>
+; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
; FORCE-VEC: mul nsw <2 x i32>
; FORCE-VEC: add nsw <2 x i32>
; FORCE-VEC: %index.next = add i64 %index, 2
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses.ll b/test/Transforms/LoopVectorize/interleaved-accesses.ll
new file mode 100644
index 0000000000000..d7237a5c27dc8
--- /dev/null
+++ b/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -0,0 +1,467 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Check vectorization on an interleaved load group of factor 2 and an interleaved
+; store group of factor 2.
+
+; int AB[1024];
+; int CD[1024];
+; void test_array_load2_store2(int C, int D) {
+; for (int i = 0; i < 1024; i+=2) {
+; int A = AB[i];
+; int B = AB[i+1];
+; CD[i] = A + C;
+; CD[i+1] = B * D;
+; }
+; }
+
+; CHECK-LABEL: @test_array_load2_store2(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: add nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4
+
+@AB = common global [1024 x i32] zeroinitializer, align 4
+@CD = common global [1024 x i32] zeroinitializer, align 4
+
+define void @test_array_load2_store2(i32 %C, i32 %D) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx0, align 4
+ %tmp1 = or i64 %indvars.iv, 1
+ %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
+ %tmp2 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %tmp, %C
+ %mul = mul nsw i32 %tmp2, %D
+ %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
+ store i32 %mul, i32* %arrayidx3, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp slt i64 %indvars.iv.next, 1024
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; int A[3072];
+; struct ST S[1024];
+; void test_struct_array_load3_store3() {
+; int *ptr = A;
+; for (int i = 0; i < 1024; i++) {
+; int X1 = *ptr++;
+; int X2 = *ptr++;
+; int X3 = *ptr++;
+; S[i].x = X1 + 1;
+; S[i].y = X2 + 2;
+; S[i].z = X3 + 3;
+; }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+%struct.ST3 = type { i32, i32, i32 }
+@A = common global [3072 x i32] zeroinitializer, align 4
+@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+ %tmp = load i32, i32* %ptr.016, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+ %tmp1 = load i32, i32* %incdec.ptr, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+ %tmp2 = load i32, i32* %incdec.ptr1, align 4
+ %add = add nsw i32 %tmp, 1
+ %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+ store i32 %add, i32* %x, align 4
+ %add3 = add nsw i32 %tmp1, 2
+ %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+ store i32 %add3, i32* %y, align 4
+ %add6 = add nsw i32 %tmp2, 3
+ %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+ store i32 %add6, i32* %z, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Check vectorization on an interleaved load group of factor 4.
+
+; struct ST4{
+; int x;
+; int y;
+; int z;
+; int w;
+; };
+; int test_struct_load4(struct ST4 *S) {
+; int r = 0;
+; for (int i = 0; i < 1024; i++) {
+; r += S[i].x;
+; r -= S[i].y;
+; r += S[i].z;
+; r -= S[i].w;
+; }
+; return r;
+; }
+
+; CHECK-LABEL: @test_struct_load4(
+; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+
+%struct.ST4 = type { i32, i32, i32, i32 }
+
+define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
+ %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
+ %tmp = load i32, i32* %x, align 4
+ %add = add nsw i32 %tmp, %r.022
+ %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
+ %tmp1 = load i32, i32* %y, align 4
+ %sub = sub i32 %add, %tmp1
+ %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
+ %tmp2 = load i32, i32* %z, align 4
+ %add5 = add nsw i32 %sub, %tmp2
+ %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
+ %tmp3 = load i32, i32* %w, align 4
+ %sub8 = sub i32 %add5, %tmp3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 %sub8
+}
+
+; Check vectorization on an interleaved store group of factor 4.
+
+; void test_struct_store4(int *A, struct ST4 *B) {
+; int *ptr = A;
+; for (int i = 0; i < 1024; i++) {
+; int X = *ptr++;
+; B[i].x = X + 1;
+; B[i].y = X * 2;
+; B[i].z = X + 3;
+; B[i].w = X + 4;
+; }
+; }
+
+; CHECK-LABEL: @test_struct_store4(
+; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4
+
+define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
+ %tmp = load i32, i32* %ptr.024, align 4
+ %add = add nsw i32 %tmp, 1
+ %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
+ store i32 %add, i32* %x, align 4
+ %mul = shl nsw i32 %tmp, 1
+ %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
+ store i32 %mul, i32* %y, align 4
+ %add3 = add nsw i32 %tmp, 3
+ %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
+ store i32 %add3, i32* %z, align 4
+ %add6 = add nsw i32 %tmp, 4
+ %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
+ store i32 %add6, i32* %w, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check vectorization on a reverse interleaved load group of factor 2 and
+; a reverse interleaved store group of factor 2.
+
+; struct ST2 {
+; int x;
+; int y;
+; };
+;
+; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
+; for (int i = 1023; i >= 0; i--) {
+; int a = A[i].x + i; // interleaved load of index 0
+; int b = A[i].y - i; // interleaved load of index 1
+; B[i].x = a; // interleaved store of index 0
+; B[i].y = b; // interleaved store of index 1
+; }
+; }
+
+; CHECK-LABEL: @test_reversed_load2_store2(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub nsw <4 x i32>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4
+
+%struct.ST2 = type { i32, i32 }
+
+define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
+ %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
+ %tmp = load i32, i32* %x, align 4
+ %tmp1 = trunc i64 %indvars.iv to i32
+ %add = add nsw i32 %tmp, %tmp1
+ %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
+ %tmp2 = load i32, i32* %y, align 4
+ %sub = sub nsw i32 %tmp2, %tmp1
+ %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
+ store i32 %add, i32* %x5, align 4
+ %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
+ store i32 %sub, i32* %y8, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ %cmp = icmp sgt i64 %indvars.iv, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on an interleaved load group of factor 2 with 1 gap
+; (missing the load of odd elements).
+
+; void even_load(int *A, int *B) {
+; for (unsigned i = 0; i < 1024; i+=2)
+; B[i/2] = A[i] * 2;
+; }
+
+; CHECK-LABEL: @even_load(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: shl nsw <4 x i32> %strided.vec, <i32 1, i32 1, i32 1, i32 1>
+
+define void @even_load(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %mul = shl nsw i32 %tmp, 1
+ %tmp1 = lshr exact i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+ store i32 %mul, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp ult i64 %indvars.iv.next, 1024
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on interleaved access groups identified from mixed
+; loads/stores.
+; void mixed_load2_store2(int *A, int *B) {
+; for (unsigned i = 0; i < 1024; i+=2) {
+; B[i] = A[i] * A[i+1];
+; B[i+1] = A[i] + A[i+1];
+; }
+; }
+
+; CHECK-LABEL: @mixed_load2_store2(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec
+
+define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %tmp1 = or i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
+ %tmp2 = load i32, i32* %arrayidx2, align 4
+ %mul = mul nsw i32 %tmp2, %tmp
+ %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx4, align 4
+ %tmp3 = load i32, i32* %arrayidx, align 4
+ %tmp4 = load i32, i32* %arrayidx2, align 4
+ %add10 = add nsw i32 %tmp4, %tmp3
+ %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+ store i32 %add10, i32* %arrayidx13, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp ult i64 %indvars.iv.next, 1024
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on interleaved access groups identified from mixed
+; loads/stores.
+; void mixed_load3_store3(int *A) {
+; for (unsigned i = 0; i < 1024; i++) {
+; *A++ += i;
+; *A++ += i;
+; *A++ += i;
+; }
+; }
+
+; CHECK-LABEL: @mixed_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4
+
+define void @mixed_load3_store3(i32* nocapture %A) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
+ %tmp = load i32, i32* %A.addr.012, align 4
+ %add = add i32 %tmp, %i.013
+ store i32 %add, i32* %A.addr.012, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
+ %tmp1 = load i32, i32* %incdec.ptr, align 4
+ %add2 = add i32 %tmp1, %i.013
+ store i32 %add2, i32* %incdec.ptr, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
+ %tmp2 = load i32, i32* %incdec.ptr1, align 4
+ %add4 = add i32 %tmp2, %i.013
+ store i32 %add4, i32* %incdec.ptr1, align 4
+ %inc = add nuw nsw i32 %i.013, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check vectorization on interleaved access groups whose members have
+; different types.
+
+; struct IntFloat {
+; int a;
+; float b;
+; };
+;
+; int SA;
+; float SB;
+;
+; void int_float_struct(struct IntFloat *A) {
+; int SumA;
+; float SumB;
+; for (unsigned i = 0; i < 1024; i++) {
+; SumA += A[i].a;
+; SumB += A[i].b;
+; }
+; SA = SumA;
+; SB = SumB;
+; }
+
+; CHECK-LABEL: @int_float_struct(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
+; CHECK: add nsw <4 x i32>
+; CHECK: fadd fast <4 x float>
+
+%struct.IntFloat = type { i32, float }
+
+@SA = common global i32 0, align 4
+@SB = common global float 0.000000e+00, align 4
+
+define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ store i32 %add, i32* @SA, align 4
+ store float %add3, float* @SB, align 4
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
+ %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
+ %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
+ %tmp = load i32, i32* %a, align 4
+ %add = add nsw i32 %tmp, %SumA.013
+ %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
+ %tmp1 = load float, float* %b, align 4
+ %add3 = fadd fast float %SumB.014, %tmp1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll b/test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll
new file mode 100644
index 0000000000000..8771dd26948b5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: @fn1
+define void @fn1() {
+entry-block:
+ br label %middle
+
+middle:
+ %0 = phi {}* [ %3, %middle ], [ inttoptr (i64 0 to {}*), %entry-block ]
+ %1 = bitcast {}* %0 to i8*
+ %2 = getelementptr i8, i8* %1, i64 1
+ %3 = bitcast i8* %2 to {}*
+ %4 = icmp eq i8* %2, undef
+ br i1 %4, label %exit, label %middle
+
+; CHECK: %[[phi:.*]] = phi {}* [ %3, %middle ], [ null, %entry-block ]
+; CHECK-NEXT: %[[bc1:.*]] = bitcast {}* %[[phi]] to i8*
+; CHECK-NEXT: %[[gep:.*]] = getelementptr i8, i8* %[[bc1]], i64 1
+; CHECK-NEXT: %[[bc2:.*]] = bitcast i8* %[[gep]] to {}*
+; CHECK-NEXT: %[[cmp:.*]] = icmp eq i8* %[[gep]], undef
+; CHECK-NEXT: br i1 %[[cmp]],
+
+exit:
+ ret void
+}
diff --git a/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll b/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
index b2083cb5c700e..99eba5e280944 100644
--- a/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
+++ b/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
@@ -63,16 +63,16 @@ lpad:
resume { i8*, i32 } zeroinitializer
}
-define i8 @call_same_range() {
-; CHECK-LABEL: @call_same_range
+define i8 @call_with_same_range() {
+; CHECK-LABEL: @call_with_same_range
; CHECK: tail call i8 @call_with_range
bitcast i8 0 to i8
%out = call i8 @dummy(), !range !0
ret i8 %out
}
-define i8 @invoke_same_range() {
-; CHECK-LABEL: @invoke_same_range()
+define i8 @invoke_with_same_range() {
+; CHECK-LABEL: @invoke_with_same_range()
; CHECK: tail call i8 @invoke_with_range()
%out = invoke i8 @dummy() to label %next unwind label %lpad, !range !0
diff --git a/test/Transforms/MergeFunc/linkonce_odr.ll b/test/Transforms/MergeFunc/linkonce_odr.ll
new file mode 100644
index 0000000000000..1ad0d727f83b8
--- /dev/null
+++ b/test/Transforms/MergeFunc/linkonce_odr.ll
@@ -0,0 +1,30 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+; Replacements should be totally ordered on the function name.
+; If we don't do this we can end up with one module defining a thunk for @funA
+; and another module defining a thunk for @funB.
+;
+; The problem with this is that the linker could then choose one of these two
+; stubs in each of the two modules, and we end up with two stubs calling each other.
+
+; CHECK-LABEL: define linkonce_odr i32 @funA
+; CHECK-NEXT: add
+; CHECK: ret
+
+; CHECK-LABEL: define linkonce_odr i32 @funB
+; CHECK-NEXT: tail call i32 @funA(i32 %0, i32 %1)
+; CHECK-NEXT: ret
+
+define linkonce_odr i32 @funB(i32 %x, i32 %y) {
+ %sum = add i32 %x, %y
+ %sum2 = add i32 %x, %sum
+ %sum3 = add i32 %x, %sum2
+ ret i32 %sum3
+}
+
+define linkonce_odr i32 @funA(i32 %x, i32 %y) {
+ %sum = add i32 %x, %y
+ %sum2 = add i32 %x, %sum
+ %sum3 = add i32 %x, %sum2
+ ret i32 %sum3
+}
diff --git a/test/Transforms/NaryReassociate/NVPTX/nary-gep.ll b/test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
index a620c98ec18f0..d08c6f60c041a 100644
--- a/test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
+++ b/test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -nary-reassociate -S | FileCheck %s
+; RUN: opt < %s -nary-reassociate -early-cse -S | FileCheck %s
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-unknown-unknown"
@@ -27,24 +27,37 @@ define void @reassociate_gep(float* %a, i64 %i, i64 %j) {
; foo(&a[sext(j)]);
; foo(&a[sext(i +nsw j)]);
+; foo(&a[sext((i +nsw j) +nsw i)]);
; =>
-; t = &a[sext(j)];
-; foo(t);
-; foo(t + sext(i));
+; t1 = &a[sext(j)];
+; foo(t1);
+; t2 = t1 + sext(i);
+; foo(t2);
+; t3 = t2 + sext(i); // sext(i) should be GVN'ed.
+; foo(t3);
define void @reassociate_gep_nsw(float* %a, i32 %i, i32 %j) {
; CHECK-LABEL: @reassociate_gep_nsw(
- %1 = add nsw i32 %i, %j
- %idxprom.1 = sext i32 %1 to i64
%idxprom.j = sext i32 %j to i64
- %2 = getelementptr float, float* %a, i64 %idxprom.j
+ %1 = getelementptr float, float* %a, i64 %idxprom.j
; CHECK: [[t1:[^ ]+]] = getelementptr float, float* %a, i64 %idxprom.j
- call void @foo(float* %2)
+ call void @foo(float* %1)
; CHECK: call void @foo(float* [[t1]])
- %3 = getelementptr float, float* %a, i64 %idxprom.1
+
+ %2 = add nsw i32 %i, %j
+ %idxprom.2 = sext i32 %2 to i64
+ %3 = getelementptr float, float* %a, i64 %idxprom.2
; CHECK: [[sexti:[^ ]+]] = sext i32 %i to i64
; CHECK: [[t2:[^ ]+]] = getelementptr float, float* [[t1]], i64 [[sexti]]
call void @foo(float* %3)
; CHECK: call void @foo(float* [[t2]])
+
+ %4 = add nsw i32 %2, %i
+ %idxprom.4 = sext i32 %4 to i64
+ %5 = getelementptr float, float* %a, i64 %idxprom.4
+; CHECK: [[t3:[^ ]+]] = getelementptr float, float* [[t2]], i64 [[sexti]]
+ call void @foo(float* %5)
+; CHECK: call void @foo(float* [[t3]])
+
ret void
}
diff --git a/test/Transforms/Reassociate/basictest.ll b/test/Transforms/Reassociate/basictest.ll
index 015d3b0bee9f2..caaf7726514de 100644
--- a/test/Transforms/Reassociate/basictest.ll
+++ b/test/Transforms/Reassociate/basictest.ll
@@ -202,8 +202,8 @@ define i32 @test14(i32 %X1, i32 %X2) {
ret i32 %D
; CHECK-LABEL: @test14
-; CHECK-NEXT: sub i32 %X1, %X2
-; CHECK-NEXT: mul i32 %B2, 47
+; CHECK-NEXT: %[[SUB:.*]] = sub i32 %X1, %X2
+; CHECK-NEXT: mul i32 %[[SUB]], 47
; CHECK-NEXT: ret i32
}
diff --git a/test/Transforms/Reassociate/canonicalize-neg-const.ll b/test/Transforms/Reassociate/canonicalize-neg-const.ll
index e85a963f6ddac..465460cb53b1b 100644
--- a/test/Transforms/Reassociate/canonicalize-neg-const.ll
+++ b/test/Transforms/Reassociate/canonicalize-neg-const.ll
@@ -49,18 +49,6 @@ define double @test3(double %x, double %y) {
ret double %mul3
}
-; Canonicalize (x - -1234 * y)
-define i64 @test4(i64 %x, i64 %y) {
-; CHECK-LABEL: @test4
-; CHECK-NEXT: mul i64 %y, 1234
-; CHECK-NEXT: add i64 %mul, %x
-; CHECK-NEXT: ret i64 %sub
-
- %mul = mul i64 %y, -1234
- %sub = sub i64 %x, %mul
- ret i64 %sub
-}
-
; Canonicalize (x - -0.1234 * y)
define double @test5(double %x, double %y) {
; CHECK-LABEL: @test5
@@ -156,3 +144,13 @@ define double @test12(double %x, double %y) {
%add = fadd double %div, %x
ret double %add
}
+
+; Don't create an NSW violation
+define i4 @test13(i4 %x) {
+; CHECK-LABEL: @test13
+; CHECK-NEXT: %[[mul:.*]] = mul nsw i4 %x, -2
+; CHECK-NEXT: %[[add:.*]] = add i4 %[[mul]], 3
+ %mul = mul nsw i4 %x, -2
+ %add = add i4 %mul, 3
+ ret i4 %add
+}
diff --git a/test/Transforms/RewriteStatepointsForGC/deref-pointers.ll b/test/Transforms/RewriteStatepointsForGC/deref-pointers.ll
new file mode 100644
index 0000000000000..5913db21fcf38
--- /dev/null
+++ b/test/Transforms/RewriteStatepointsForGC/deref-pointers.ll
@@ -0,0 +1,77 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+
+declare void @foo()
+declare i8 addrspace(1)* @some_function()
+declare void @some_function_consumer(i8 addrspace(1)*)
+declare dereferenceable(4) i8 addrspace(1)* @some_function_ret_deref()
+; CHECK: declare i8 addrspace(1)* @some_function_ret_deref()
+
+define i8 addrspace(1)* @test_deref_arg(i8 addrspace(1)* dereferenceable(4) %a) gc "statepoint-example" {
+; CHECK: define i8 addrspace(1)* @test_deref_arg(i8 addrspace(1)* %a)
+entry:
+ call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_deref_or_null_arg(i8 addrspace(1)* dereferenceable_or_null(4) %a) gc "statepoint-example" {
+; CHECK: define i8 addrspace(1)* @test_deref_or_null_arg(i8 addrspace(1)* %a)
+entry:
+ call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_deref_retval() gc "statepoint-example" {
+; CHECK-LABEL: @test_deref_retval(
+entry:
+ %a = call dereferenceable(4) i8 addrspace(1)* @some_function()
+; CHECK: %a = call i8 addrspace(1)* @some_function()
+ call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_deref_or_null_retval() gc "statepoint-example" {
+; CHECK-LABEL: @test_deref_or_null_retval(
+entry:
+ %a = call dereferenceable_or_null(4) i8 addrspace(1)* @some_function()
+; CHECK: %a = call i8 addrspace(1)* @some_function()
+ call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %a
+}
+
+define i8 @test_md(i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test_md(
+ entry:
+; CHECK: %tmp = load i8, i8 addrspace(1)* %ptr, !tbaa !0
+ %tmp = load i8, i8 addrspace(1)* %ptr, !tbaa !0
+ call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 %tmp
+}
+
+define i8 addrspace(1)* @test_decl_only_attribute(i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test_decl_only_attribute(
+entry:
+; No change here, but the prototype of some_function_ret_deref should have changed.
+; CHECK: call i8 addrspace(1)* @some_function_ret_deref()
+ %a = call i8 addrspace(1)* @some_function_ret_deref()
+ call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_callsite_arg_attribute(i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test_callsite_arg_attribute(
+entry:
+; CHECK: call void @some_function_consumer(i8 addrspace(1)* %ptr)
+ call void @some_function_consumer(i8 addrspace(1)* dereferenceable(4) %ptr)
+ call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+ ret i8 addrspace(1)* %ptr
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+
+!0 = !{!1, !1, i64 0, i64 1}
+!1 = !{!"red", !2}
+!2 = !{!"blue"}
+
+; CHECK: !0 = !{!1, !1, i64 0}
+; CHECK: !1 = !{!"red", !2}
+; CHECK: !2 = !{!"blue"}
diff --git a/test/Transforms/SeparateConstOffsetFromGEP/R600/lit.local.cfg b/test/Transforms/SeparateConstOffsetFromGEP/R600/lit.local.cfg
new file mode 100644
index 0000000000000..4086e8d681c38
--- /dev/null
+++ b/test/Transforms/SeparateConstOffsetFromGEP/R600/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'R600' in config.root.targets:
+ config.unsupported = True
+
diff --git a/test/Transforms/SeparateConstOffsetFromGEP/R600/split-gep-and-gvn-addrspace-addressing-modes.ll b/test/Transforms/SeparateConstOffsetFromGEP/R600/split-gep-and-gvn-addrspace-addressing-modes.ll
new file mode 100644
index 0000000000000..527634db0f5b8
--- /dev/null
+++ b/test/Transforms/SeparateConstOffsetFromGEP/R600/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -0,0 +1,94 @@
+; RUN: opt -mtriple=amdgcn-- -S -separate-const-offset-from-gep -reassociate-geps-verify-no-dead-code -gvn < %s | FileCheck -check-prefix=IR %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+@array = internal addrspace(2) constant [4096 x [32 x float]] zeroinitializer, align 4
+
+; IR-LABEL: @sum_of_array(
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr float, float addrspace(2)* [[BASE_PTR]], i64 1
+; IR: getelementptr float, float addrspace(2)* [[BASE_PTR]], i64 32
+; IR: getelementptr float, float addrspace(2)* [[BASE_PTR]], i64 33
+define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+ %tmp = sext i32 %y to i64
+ %tmp1 = sext i32 %x to i64
+ %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp
+ %tmp4 = load float, float addrspace(2)* %tmp2, align 4
+ %tmp5 = fadd float %tmp4, 0.000000e+00
+ %tmp6 = add i32 %y, 1
+ %tmp7 = sext i32 %tmp6 to i64
+ %tmp8 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp7
+ %tmp10 = load float, float addrspace(2)* %tmp8, align 4
+ %tmp11 = fadd float %tmp5, %tmp10
+ %tmp12 = add i32 %x, 1
+ %tmp13 = sext i32 %tmp12 to i64
+ %tmp14 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp13, i64 %tmp
+ %tmp16 = load float, float addrspace(2)* %tmp14, align 4
+ %tmp17 = fadd float %tmp11, %tmp16
+ %tmp18 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp13, i64 %tmp7
+ %tmp20 = load float, float addrspace(2)* %tmp18, align 4
+ %tmp21 = fadd float %tmp17, %tmp20
+ store float %tmp21, float addrspace(1)* %output, align 4
+ ret void
+}
+
+@array2 = internal addrspace(2) constant [4096 x [4 x float]] zeroinitializer, align 4
+
+; Some of the indices go over the maximum mubuf offset, so don't split them.
+
+; IR-LABEL: @sum_of_array_over_max_mubuf_offset(
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr float, float addrspace(2)* [[BASE_PTR]], i64 255
+; IR: add i32 %x, 256
+; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+ %tmp = sext i32 %y to i64
+ %tmp1 = sext i32 %x to i64
+ %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp
+ %tmp4 = load float, float addrspace(2)* %tmp2, align 4
+ %tmp5 = fadd float %tmp4, 0.000000e+00
+ %tmp6 = add i32 %y, 255
+ %tmp7 = sext i32 %tmp6 to i64
+ %tmp8 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp7
+ %tmp10 = load float, float addrspace(2)* %tmp8, align 4
+ %tmp11 = fadd float %tmp5, %tmp10
+ %tmp12 = add i32 %x, 256
+ %tmp13 = sext i32 %tmp12 to i64
+ %tmp14 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp13, i64 %tmp
+ %tmp16 = load float, float addrspace(2)* %tmp14, align 4
+ %tmp17 = fadd float %tmp11, %tmp16
+ %tmp18 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp13, i64 %tmp7
+ %tmp20 = load float, float addrspace(2)* %tmp18, align 4
+ %tmp21 = fadd float %tmp17, %tmp20
+ store float %tmp21, float addrspace(1)* %output, align 4
+ ret void
+}
+
+
+@lds_array = internal addrspace(3) global [4096 x [4 x float]] undef, align 4
+
+; DS instructions have a larger immediate offset, so make sure these are OK.
+; IR-LABEL: @sum_of_lds_array_over_max_mubuf_offset(
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %{{[a-zA-Z0-9]+}}, i32 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i32 255
+; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i32 16128
+; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i32 16383
+define void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+ %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y
+ %tmp4 = load float, float addrspace(3)* %tmp2, align 4
+ %tmp5 = fadd float %tmp4, 0.000000e+00
+ %tmp6 = add i32 %y, 255
+ %tmp8 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %tmp6
+ %tmp10 = load float, float addrspace(3)* %tmp8, align 4
+ %tmp11 = fadd float %tmp5, %tmp10
+ %tmp12 = add i32 %x, 4032
+ %tmp14 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %tmp12, i32 %y
+ %tmp16 = load float, float addrspace(3)* %tmp14, align 4
+ %tmp17 = fadd float %tmp11, %tmp16
+ %tmp18 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %tmp12, i32 %tmp6
+ %tmp20 = load float, float addrspace(3)* %tmp18, align 4
+ %tmp21 = fadd float %tmp17, %tmp20
+ store float %tmp21, float addrspace(1)* %output, align 4
+ ret void
+}
diff --git a/test/Transforms/Sink/convergent.ll b/test/Transforms/Sink/convergent.ll
new file mode 100644
index 0000000000000..49207dbc99276
--- /dev/null
+++ b/test/Transforms/Sink/convergent.ll
@@ -0,0 +1,24 @@
+; RUN: opt -sink -S < %s | FileCheck %s
+
+; Verify that IR sinking does not move convergent operations to
+; blocks that are not control equivalent.
+
+; CHECK: define i32 @foo
+; CHECK: entry
+; CHECK-NEXT: call i32 @bar
+; CHECK-NEXT: br i1 %arg
+
+define i32 @foo(i1 %arg) {
+entry:
+ %c = call i32 @bar() readonly convergent
+ br i1 %arg, label %then, label %end
+
+then:
+ ret i32 %c
+
+end:
+ ret i32 0
+}
+
+declare i32 @bar() readonly convergent
+