47 files changed, 3418 insertions, 17 deletions
diff --git a/test/Transforms/SLPVectorizer/X86/barriercall.ll b/test/Transforms/SLPVectorizer/X86/barriercall.ll
index 04eb8f919bc72..bba285526a4bc 100644
--- a/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-;CHECK: @foo
+;CHECK-LABEL: @foo(
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n) {
diff --git a/test/Transforms/SLPVectorizer/X86/cast.ll b/test/Transforms/SLPVectorizer/X86/cast.ll
index 344dbbca2c572..e340fba351a38 100644
--- a/test/Transforms/SLPVectorizer/X86/cast.ll
+++ b/test/Transforms/SLPVectorizer/X86/cast.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ;     A[2] = B[2];
 ;     A[3] = B[3];
 ; }
-;CHECK: @foo
+;CHECK-LABEL: @foo(
 ;CHECK: load <4 x i8>
 ;CHECK: sext
 ;CHECK: store <4 x i32>
diff --git a/test/Transforms/SLPVectorizer/X86/cmp_sel.ll b/test/Transforms/SLPVectorizer/X86/cmp_sel.ll
new file mode 100644
index 0000000000000..0c124a75d4175
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/cmp_sel.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; int foo(double * restrict A, double * restrict B, double G) {
+;   A[0] = (B[10] ? G : 1);
+;   A[1] = (B[11] ? G : 1);
+; }
+
+;CHECK-LABEL: @foo(
+;CHECK: load <2 x double>
+;CHECK: fcmp une <2 x double>
+;CHECK: select <2 x i1>
+;CHECK: store <2 x double>
+;CHECK: ret i32 undef
+define i32 @foo(double* noalias nocapture %A, double* noalias nocapture %B, double %G) { ; IR for the C snippet in the comment above
+entry:
+  %arrayidx = getelementptr inbounds double* %B, i64 10 ; &B[10]
+  %0 = load double* %arrayidx, align 8
+  %tobool = fcmp une double %0, 0.000000e+00 ; B[10] != 0.0
+  %cond = select i1 %tobool, double %G, double 1.000000e+00 ; B[10] ? G : 1
+  store double %cond, double* %A, align 8 ; A[0] = ...
+  %arrayidx2 = getelementptr inbounds double* %B, i64 11 ; &B[11], the element right after B[10]
+  %1 = load double* %arrayidx2, align 8
+  %tobool3 = fcmp une double %1, 0.000000e+00 ; B[11] != 0.0
+  %cond7 = select i1 %tobool3, double %G, double 1.000000e+00 ; B[11] ? G : 1
+  %arrayidx8 = getelementptr inbounds double* %A, i64 1 ; &A[1]
+  store double %cond7, double* %arrayidx8, align 8 ; A[1] = ...; second lane of the 2-wide chain the CHECK lines expect
+  ret i32 undef ; the C source falls off the end without returning a value
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
index 05f8e616bb8e2..9653d18db5667 100644
--- a/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
+++ b/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.7.0"
 
 @.str = private unnamed_addr constant [6 x i8] c"bingo\00", align 1
 
-;CHECK: @reduce_compare
+;CHECK-LABEL: @reduce_compare(
 ;CHECK: load <2 x double>
 ;CHECK: fmul <2 x double>
 ;CHECK: fmul <2 x double>
diff --git a/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
new file mode 100644
index 0000000000000..51b1c08fb36f8
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334 = type { %struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333, i16*, i8*, i8*, i32, i32, i64, i64, i32, i32, i32, [4 x i32], i32, i32, i32, i32, i32, [20 x i8] }
+%struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333 = type { i32, i32, i32, i32 }
+
+define fastcc void @LzmaDec_DecodeReal2(%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p) { ; crash reproducer: the RUN line has no FileCheck, so only a clean opt run is required
+entry:
+  %range20.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 4 ; address of i32 field 4 of %p
+  %code21.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 5 ; address of i32 field 5, the next field
+  br label %do.body66.i
+
+do.body66.i:                                      ; preds = %do.cond.i, %entry
+  %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ undef, %entry ] ; loop-carried value fed back from do.cond.i
+  %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ] ; second loop-carried value with the same shape
+  %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i
+  %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i
+  br i1 undef, label %do.cond.i, label %if.else.i
+
+if.else.i:                                        ; preds = %do.body66.i
+  %sub91.i = sub i32 %.range.2.i, undef
+  %sub92.i = sub i32 %.code.2.i, undef
+  br label %do.cond.i
+
+do.cond.i:                                        ; preds = %if.else.i, %do.body66.i
+  %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ] ; incoming from do.body66.i is undef here ...
+  %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ] ; ... but a real value here, so the two phis are not symmetric
+  br i1 undef, label %do.body66.i, label %do.end1006.i
+
+do.end1006.i:                                     ; preds = %do.cond.i
+  %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i
+  %.code.4.i = select i1 undef, i32 undef, i32 %code.4.i
+  store i32 %.range.4.i, i32* %range20.i, align 4 ; store to field 4
+  store i32 %.code.4.i, i32* %code21.i, align 4 ; store to field 5; with the previous store, a pair of stores to consecutive i32 fields
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
new file mode 100644
index 0000000000000..389892115cedc
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960" = type { i32, i32 }
+
+define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* nocapture %info) { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  br i1 undef, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  ret void
+
+if.else:                                          ; preds = %entry
+  %m_numConstraintRows4 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 0 ; address of field 0 of the { i32, i32 } struct
+  %nub5 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 1 ; address of field 1
+  br i1 undef, label %land.lhs.true.i.1, label %if.then7.1
+
+land.lhs.true.i.1:                                ; preds = %if.else
+  br i1 undef, label %for.inc.1, label %if.then7.1
+
+if.then7.1:                                       ; preds = %land.lhs.true.i.1, %if.else
+  %inc.1 = add nsw i32 0, 1 ; both operands are constants
+  store i32 %inc.1, i32* %m_numConstraintRows4, align 4 ; first store pair: field 0 ...
+  %dec.1 = add nsw i32 6, -1 ; both operands are constants
+  store i32 %dec.1, i32* %nub5, align 4 ; ... and field 1
+  br label %for.inc.1
+
+for.inc.1:                                        ; preds = %if.then7.1, %land.lhs.true.i.1
+  %0 = phi i32 [ %dec.1, %if.then7.1 ], [ 6, %land.lhs.true.i.1 ] ; merges the decremented value with the constant 6
+  %1 = phi i32 [ %inc.1, %if.then7.1 ], [ 0, %land.lhs.true.i.1 ] ; merges the incremented value with the constant 0
+  %inc.2 = add nsw i32 %1, 1
+  store i32 %inc.2, i32* %m_numConstraintRows4, align 4 ; second store pair: field 0 ...
+  %dec.2 = add nsw i32 %0, -1
+  store i32 %dec.2, i32* %nub5, align 4 ; ... and field 1, fed by the phis above
+  unreachable
+}
+
+%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332 = type { float, [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, [4 x float], float, float, [4 x float], float, float, [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330] }
+%class.btVector3.5.30.65.90.115.140.175.185.260.280.330 = type { [4 x float] }
+%class.btVector4.7.32.67.92.117.142.177.187.262.282.331 = type { %class.btVector3.5.30.65.90.115.140.175.185.260.280.330 }
+
+define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this) { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  %arrayidx26 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 1 ; float element 1 of the first btVector3 in field 2
+  %arrayidx36 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 2 ; float element 2 of the same vector
+  %0 = load float* %arrayidx36, align 4
+  %add587 = fadd float undef, undef
+  %sub600 = fsub float %add587, undef
+  store float %sub600, float* undef, align 4 ; store through an undef pointer
+  %sub613 = fsub float %add587, %sub600
+  store float %sub613, float* %arrayidx26, align 4 ; store to element 1
+  %add626 = fadd float %0, undef
+  %sub639 = fsub float %add626, undef
+  %sub652 = fsub float %add626, %sub639
+  store float %sub652, float* %arrayidx36, align 4 ; store to element 2, the same address %0 was loaded from
+  br i1 undef, label %if.else1609, label %if.then1595
+
+if.then1595:                                      ; preds = %entry
+  br i1 undef, label %return, label %for.body.lr.ph.i.i1702
+
+for.body.lr.ph.i.i1702:                           ; preds = %if.then1595
+  unreachable
+
+if.else1609:                                      ; preds = %entry
+  unreachable
+
+return:                                           ; preds = %if.then1595
+  ret void
+}
+
+define void @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36btDiscreteCollisionDetectorInterface6ResultE() { ; crash reproducer; this function contains no loads or stores, only float arithmetic and branches
+entry:
+  %add8.i2343 = fadd float undef, undef ; defined in entry, used far below in if.end304
+  %add8.i2381 = fadd float undef, undef ; defined in entry, used in if.end304 and if.end361
+  br i1 undef, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %return, label %if.end111
+
+if.end111:                                        ; preds = %if.end
+  br i1 undef, label %return, label %if.end136
+
+if.end136:                                        ; preds = %if.end111
+  br i1 undef, label %return, label %if.end162
+
+if.end162:                                        ; preds = %if.end136
+  br i1 undef, label %return, label %if.end189
+
+if.end189:                                        ; preds = %if.end162
+  br i1 undef, label %return, label %if.end216
+
+if.end216:                                        ; preds = %if.end189
+  br i1 undef, label %if.then218, label %if.end225
+
+if.then218:                                       ; preds = %if.end216
+  br label %if.end225
+
+if.end225:                                        ; preds = %if.then218, %if.end216
+  br i1 undef, label %return, label %if.end248
+
+if.end248:                                        ; preds = %if.end225
+  br i1 undef, label %return, label %if.end304
+
+if.end304:                                        ; preds = %if.end248
+  %mul341 = fmul float undef, %add8.i2343 ; two fmuls of the same shape ...
+  %mul344 = fmul float undef, %add8.i2381
+  %sub345 = fsub float %mul341, %mul344 ; ... combined here; the result is used in another block (if.then374)
+  br i1 undef, label %return, label %if.end361
+
+if.end361:                                        ; preds = %if.end304
+  %mul364 = fmul float %add8.i2381, %add8.i2381 ; result has no users in this function
+  br i1 undef, label %if.then370, label %if.end395
+
+if.then370:                                       ; preds = %if.end361
+  br i1 undef, label %if.then374, label %if.end395
+
+if.then374:                                       ; preds = %if.then370
+  %cmp392 = fcmp olt float %sub345, 0.000000e+00 ; compare result has no users in this function
+  br label %if.end395
+
+if.end395:                                        ; preds = %if.then374, %if.then370, %if.end361
+  unreachable
+
+return:                                           ; preds = %if.end304, %if.end248, %if.end225, %if.end189, %if.end162, %if.end136, %if.end111, %if.end, %entry
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
new file mode 100644
index 0000000000000..25c65457946b8
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
@@ -0,0 +1,84 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113 = type { [4 x float] }
+
+; Function Attrs: ssp uwtable
+define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(%class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices) #0 align 2 { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  br i1 undef, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %if.end22.2, %if.end
+  br i1 undef, label %if.then17.1, label %if.end22.1
+
+for.end36:                                        ; preds = %if.end22.2
+  br label %for.body144
+
+for.body144:                                      ; preds = %for.body144, %for.end36
+  br i1 undef, label %for.end227, label %for.body144
+
+for.end227:                                       ; preds = %for.body144
+  br i1 undef, label %for.end271, label %for.body233
+
+for.body233:                                      ; preds = %for.body233, %for.end227
+  br i1 undef, label %for.body233, label %for.end271
+
+for.end271:                                       ; preds = %for.body233, %for.end227
+  %0 = phi float [ 0x47EFFFFFE0000000, %for.end227 ], [ undef, %for.body233 ] ; two phis with identical incoming values
+  %1 = phi float [ 0x47EFFFFFE0000000, %for.end227 ], [ undef, %for.body233 ]
+  %sub275 = fsub float undef, %1 ; first of two parallel fsub/fmul/fadd chains
+  %sub279 = fsub float undef, %0 ; second chain
+  br i1 undef, label %if.then291, label %return
+
+if.then291:                                       ; preds = %for.end271
+  %mul292 = fmul float %sub275, 5.000000e-01
+  %add294 = fadd float %1, %mul292
+  %mul295 = fmul float %sub279, 5.000000e-01
+  %add297 = fadd float %0, %mul295
+  br i1 undef, label %if.end332, label %if.else319
+
+if.else319:                                       ; preds = %if.then291
+  br i1 undef, label %if.then325, label %if.end327
+
+if.then325:                                       ; preds = %if.else319
+  br label %if.end327
+
+if.end327:                                        ; preds = %if.then325, %if.else319
+  br i1 undef, label %if.then329, label %if.end332
+
+if.then329:                                       ; preds = %if.end327
+  br label %if.end332
+
+if.end332:                                        ; preds = %if.then329, %if.end327, %if.then291
+  %dx272.1 = phi float [ %sub275, %if.then329 ], [ %sub275, %if.end327 ], [ 0x3F847AE140000000, %if.then291 ] ; takes %sub275 on two edges ...
+  %dy276.1 = phi float [ undef, %if.then329 ], [ undef, %if.end327 ], [ 0x3F847AE140000000, %if.then291 ] ; ... while this phi takes undef on the same edges
+  %sub334 = fsub float %add294, %dx272.1
+  %sub338 = fsub float %add297, %dy276.1
+  %arrayidx.i.i606 = getelementptr inbounds %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices, i64 0, i32 0, i64 0 ; float element 0 of *%vertices
+  store float %sub334, float* %arrayidx.i.i606, align 4
+  %arrayidx3.i607 = getelementptr inbounds %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices, i64 0, i32 0, i64 1 ; float element 1, the next element
+  store float %sub338, float* %arrayidx3.i607, align 4 ; with the store above, a pair of stores to consecutive floats
+  br label %return
+
+return:                                           ; preds = %if.end332, %for.end271, %entry
+  ret void
+
+if.then17.1:                                      ; preds = %for.body
+  br label %if.end22.1
+
+if.end22.1:                                       ; preds = %if.then17.1, %for.body
+  br i1 undef, label %if.then17.2, label %if.end22.2
+
+if.then17.2:                                      ; preds = %if.end22.1
+  br label %if.end22.2
+
+if.end22.2:                                       ; preds = %if.then17.2, %if.end22.1
+  br i1 undef, label %for.end36, label %for.body
+}
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll
new file mode 100644
index 0000000000000..ce0159071c60a
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731" = type { double*, double*, double*, double** }
+
+; Function Attrs: nounwind ssp uwtable (stale: the definition below carries no function attributes)
+define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* nocapture %__last) { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  %_M_cur2.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 0 ; address of field 0 (double*) of %__first
+  %0 = load double** %_M_cur2.i.i, align 8
+  %_M_first3.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 1 ; address of field 1 (double*) of %__first
+  %_M_cur2.i.i81 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 0 ; address of field 0 of %__last
+  %1 = load double** %_M_cur2.i.i81, align 8
+  %_M_first3.i.i83 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 1 ; address of field 1 of %__last
+  %2 = load double** %_M_first3.i.i83, align 8
+  br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i.preheader
+
+while.cond.i.preheader:                           ; preds = %entry
+  br label %while.cond.i
+
+while.cond.i:                                     ; preds = %while.body.i, %while.cond.i.preheader
+  br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.body.i
+
+while.body.i:                                     ; preds = %while.cond.i
+  br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i
+
+_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; preds = %while.body.i, %while.cond.i, %entry
+  %3 = phi double* [ %2, %entry ], [ %2, %while.cond.i ], [ undef, %while.body.i ] ; pointer-typed phi: %2 on two edges, undef on the third
+  %4 = phi double* [ %0, %entry ], [ %1, %while.cond.i ], [ undef, %while.body.i ] ; pointer-typed phi with a different value on each edge
+  store double* %4, double** %_M_cur2.i.i, align 8 ; store to field 0 of %__first
+  store double* %3, double** %_M_first3.i.i, align 8 ; store to field 1; with the store above, a pair of stores to consecutive pointer fields
+  br i1 undef, label %if.then.i55, label %while.cond
+
+if.then.i55:                                      ; preds = %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %if.then.i55, %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit
+  br label %while.cond ; unconditional self-loop; this function has no ret
+}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
new file mode 100644
index 0000000000000..e11be488f795b
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @main() #0 { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  br i1 undef, label %while.body, label %while.end
+
+while.body:                                       ; preds = %entry
+  unreachable
+
+while.end:                                        ; preds = %entry
+  br i1 undef, label %for.end80, label %for.body75.lr.ph
+
+for.body75.lr.ph:                                 ; preds = %while.end
+  br label %for.body75
+
+for.body75:                                       ; preds = %for.body75, %for.body75.lr.ph
+  br label %for.body75 ; unconditional self-loop
+
+for.end80:                                        ; preds = %while.end
+  br i1 undef, label %for.end300, label %for.body267.lr.ph
+
+for.body267.lr.ph:                                ; preds = %for.end80
+  br label %for.body267
+
+for.body267:                                      ; preds = %for.body267, %for.body267.lr.ph
+  %s.71010 = phi double [ 0.000000e+00, %for.body267.lr.ph ], [ %add297, %for.body267 ] ; loop-carried sum, starts at 0.0
+  %mul269 = fmul double undef, undef
+  %mul270 = fmul double %mul269, %mul269 ; square of %mul269
+  %add282 = fadd double undef, undef
+  %mul283 = fmul double %mul269, %add282 ; numerator of the division below
+  %add293 = fadd double undef, undef
+  %mul294 = fmul double %mul270, %add293
+  %add295 = fadd double undef, %mul294 ; denominator of the division below
+  %div296 = fdiv double %mul283, %add295
+  %add297 = fadd double %s.71010, %div296 ; accumulates into the phi above; no stores anywhere in the function
+  br i1 undef, label %for.body267, label %for.end300
+
+for.end300:                                       ; preds = %for.body267, %for.end80
+  unreachable
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
new file mode 100644
index 0000000000000..c02e1fa607d50
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
@@ -0,0 +1,91 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind ssp uwtable (stale: the definition below carries no function attributes)
+define void @RCModelEstimator() { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  br i1 undef, label %for.body.lr.ph, label %for.end.thread
+
+for.end.thread:                                   ; preds = %entry
+  unreachable
+
+for.body.lr.ph:                                   ; preds = %entry
+  br i1 undef, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %for.body.lr.ph
+  br i1 undef, label %for.body3, label %if.end103
+
+for.cond14.preheader:                             ; preds = %for.inc11
+  br i1 undef, label %for.body16.lr.ph, label %if.end103
+
+for.body16.lr.ph:                                 ; preds = %for.cond14.preheader
+  br label %for.body16
+
+for.body3:                                        ; preds = %for.inc11, %for.end
+  br i1 undef, label %if.then7, label %for.inc11
+
+if.then7:                                         ; preds = %for.body3
+  br label %for.inc11
+
+for.inc11:                                        ; preds = %if.then7, %for.body3
+  br i1 false, label %for.cond14.preheader, label %for.body3 ; constant-false condition: control always returns to for.body3
+
+for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
+  br i1 undef, label %for.end39, label %for.body16
+
+for.end39:                                        ; preds = %for.body16
+  br i1 undef, label %if.end103, label %for.cond45.preheader
+
+for.cond45.preheader:                             ; preds = %for.end39
+  br i1 undef, label %if.then88, label %if.else
+
+if.then88:                                        ; preds = %for.cond45.preheader
+  %mul89 = fmul double 0.000000e+00, 0.000000e+00 ; first fmul/fmul/fsub/fdiv chain, all-constant inputs
+  %mul90 = fmul double 0.000000e+00, 0.000000e+00
+  %sub91 = fsub double %mul89, %mul90
+  %div92 = fdiv double %sub91, undef
+  %mul94 = fmul double 0.000000e+00, 0.000000e+00 ; second chain with the same shape as the first
+  %mul95 = fmul double 0.000000e+00, 0.000000e+00
+  %sub96 = fsub double %mul94, %mul95
+  %div97 = fdiv double %sub96, undef
+  br label %if.end103
+
+if.else:                                          ; preds = %for.cond45.preheader
+  br label %if.end103
+
+if.end103:                                        ; preds = %if.else, %if.then88, %for.end39, %for.cond14.preheader, %for.end
+  %0 = phi double [ 0.000000e+00, %for.end39 ], [ %div97, %if.then88 ], [ 0.000000e+00, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ] ; consumes the second chain; the phi itself has no users
+  %1 = phi double [ undef, %for.end39 ], [ %div92, %if.then88 ], [ undef, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ] ; consumes the first chain; mixes undef and 0.0 on the other edges
+  ret void
+}
+
+
+define void @intrapred_luma() { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  %conv153 = trunc i32 undef to i16 ; the single value written by all three stores
+  %arrayidx154 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 12 ; element 12 (last) of an array at an undef base
+  store i16 %conv153, i16* %arrayidx154, align 8
+  %arrayidx155 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 11 ; element 11
+  store i16 %conv153, i16* %arrayidx155, align 2
+  %arrayidx156 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 10 ; element 10
+  store i16 %conv153, i16* %arrayidx156, align 4 ; three stores in descending index order, each with a different alignment
+  ret void
+}
+
+define fastcc void @dct36(double* %inbuf) { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  %arrayidx41 = getelementptr inbounds double* %inbuf, i64 2 ; &inbuf[2]
+  %arrayidx44 = getelementptr inbounds double* %inbuf, i64 1 ; &inbuf[1]
+  %0 = load double* %arrayidx44, align 8 ; inbuf[1]
+  %add46 = fadd double %0, undef
+  store double %add46, double* %arrayidx41, align 8 ; inbuf[2] = inbuf[1] + undef
+  %1 = load double* %inbuf, align 8 ; inbuf[0], loaded after the store to inbuf[2]
+  %add49 = fadd double %1, %0
+  store double %add49, double* %arrayidx44, align 8 ; inbuf[1] = inbuf[0] + old inbuf[1]; overwrites the location %0 was loaded from
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
new file mode 100644
index 0000000000000..d6915e2dc5d68
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
@@ -0,0 +1,107 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @main() { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.end44, %entry
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %if.then25, %for.body
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.inc21, %for.cond4.preheader
+  br label %for.body12
+
+for.body12:                                       ; preds = %if.end, %for.body6
+  %fZImg.069 = phi double [ undef, %for.body6 ], [ %add19, %if.end ] ; loop-carried value updated in if.end
+  %fZReal.068 = phi double [ undef, %for.body6 ], [ %add20, %if.end ] ; second loop-carried value updated in if.end
+  %mul13 = fmul double %fZReal.068, %fZReal.068 ; square of %fZReal.068
+  %mul14 = fmul double %fZImg.069, %fZImg.069 ; square of %fZImg.069
+  %add15 = fadd double %mul13, %mul14 ; sum of the two squares
+  %cmp16 = fcmp ogt double %add15, 4.000000e+00 ; leave the inner loop once the sum exceeds 4.0
+  br i1 %cmp16, label %for.inc21, label %if.end
+
+if.end:                                           ; preds = %for.body12
+  %mul18 = fmul double undef, %fZImg.069
+  %add19 = fadd double undef, %mul18 ; next value of %fZImg.069
+  %sub = fsub double %mul13, %mul14 ; difference of the two squares computed in for.body12
+  %add20 = fadd double undef, %sub ; next value of %fZReal.068
+  br i1 undef, label %for.body12, label %for.inc21
+
+for.inc21:                                        ; preds = %if.end, %for.body12
+  br i1 undef, label %for.end23, label %for.body6
+
+for.end23:                                        ; preds = %for.inc21
+  br i1 undef, label %if.then25, label %if.then26
+
+if.then25:                                        ; preds = %for.end23
+  br i1 undef, label %for.end44, label %for.cond4.preheader
+
+if.then26:                                        ; preds = %for.end23
+  unreachable
+
+for.end44:                                        ; preds = %if.then25
+  br i1 undef, label %for.end48, label %for.body
+
+for.end48:                                        ; preds = %for.end44
+  ret void
+}
+
+%struct.hoge = type { double, double, double}
+
+define void @zot(%struct.hoge* %arg) { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+bb:
+  %tmp = load double* undef, align 8 ; load through an undef pointer
+  %tmp1 = fsub double %tmp, undef
+  %tmp2 = load double* undef, align 8 ; second load through an undef pointer
+  %tmp3 = fsub double %tmp2, undef
+  %tmp4 = fmul double %tmp3, undef ; result has no users in this function
+  %tmp5 = fmul double %tmp3, undef
+  %tmp6 = fsub double %tmp5, undef
+  %tmp7 = getelementptr inbounds %struct.hoge* %arg, i64 0, i32 1 ; address of field 1 of the { double, double, double } struct
+  store double %tmp6, double* %tmp7, align 8
+  %tmp8 = fmul double %tmp1, undef
+  %tmp9 = fsub double %tmp8, undef
+  %tmp10 = getelementptr inbounds %struct.hoge* %arg, i64 0, i32 2 ; address of field 2, the next field
+  store double %tmp9, double* %tmp10, align 8 ; with the store above, a pair of stores to consecutive double fields
+  br i1 undef, label %bb11, label %bb12
+
+bb11:                                             ; preds = %bb
+  br label %bb14
+
+bb12:                                             ; preds = %bb
+  %tmp13 = fmul double undef, %tmp2 ; use of the loaded value %tmp2 outside its defining block
+  br label %bb14
+
+bb14:                                             ; preds = %bb12, %bb11
+  ret void
+}
+
+
+%struct.rc4_state.0.24 = type { i32, i32, [256 x i32] }
+
+define void @rc4_crypt(%struct.rc4_state.0.24* nocapture %s) { ; crash reproducer: no FileCheck on the RUN line, opt only has to finish
+entry:
+  %x1 = getelementptr inbounds %struct.rc4_state.0.24* %s, i64 0, i32 0 ; address of i32 field 0 of %s
+  %y2 = getelementptr inbounds %struct.rc4_state.0.24* %s, i64 0, i32 1 ; address of i32 field 1 of %s
+  br i1 undef, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.045 = phi i32 [ %conv4, %for.body ], [ undef, %entry ] ; loop-carried phi; it has no users in this function
+  %conv4 = and i32 undef, 255 ; keep the low 8 bits
+  %conv7 = and i32 undef, 255 ; keep the low 8 bits
+  %idxprom842 = zext i32 %conv7 to i64 ; extra user of %conv7 besides the phi in for.end
+  br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %x.0.lcssa = phi i32 [ undef, %entry ], [ %conv4, %for.body ]
+  %y.0.lcssa = phi i32 [ undef, %entry ], [ %conv7, %for.body ]
+  store i32 %x.0.lcssa, i32* %x1, align 4 ; store to field 0
+  store i32 %y.0.lcssa, i32* %y2, align 4 ; store to field 1; with the store above, a pair of stores to consecutive i32 fields fed by phis
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
new file mode 100644
index 0000000000000..8da3c34a0279c
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.DState = type { i32, i32 }
+
+@b = common global %struct.DState zeroinitializer, align 4
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+@e = common global i32 0, align 4
+
+define i32 @fn1() {  ; reduced input: loads both i32 fields of @b and stores them back through phis
+entry:
+  %0 = load i32* getelementptr inbounds (%struct.DState* @b, i32 0, i32 0), align 4  ; @b field 0
+  %1 = load i32* getelementptr inbounds (%struct.DState* @b, i32 0, i32 1), align 4  ; @b field 1
+  %2 = load i32* @d, align 4
+  %cond = icmp eq i32 %2, 0
+  br i1 %cond, label %sw.bb, label %save_state_and_return
+
+sw.bb:                                            ; preds = %entry
+  %3 = load i32* @c, align 4
+  %and = and i32 %3, 7
+  store i32 %and, i32* @a, align 4
+  switch i32 %and, label %if.end [  ; both cases share one target, so that block has %sw.bb as a predecessor twice
+    i32 7, label %save_state_and_return
+    i32 0, label %save_state_and_return
+  ]
+
+if.end:                                           ; preds = %sw.bb
+  br label %save_state_and_return
+
+save_state_and_return:                            ; preds = %sw.bb, %sw.bb, %if.end, %entry
+  %t.0 = phi i32 [ 0, %if.end ], [ %0, %entry ], [ %0, %sw.bb ], [ %0, %sw.bb ]  ; duplicate %sw.bb entries, one per switch edge
+  %f.0 = phi i32 [ 0, %if.end ], [ %1, %entry ], [ 0, %sw.bb ], [ 0, %sw.bb ]
+  store i32 %t.0, i32* getelementptr inbounds (%struct.DState* @b, i32 0, i32 0), align 4  ; back to @b field 0
+  store i32 %f.0, i32* getelementptr inbounds (%struct.DState* @b, i32 0, i32 1), align 4  ; back to @b field 1
+  ret i32 undef
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
new file mode 100644
index 0000000000000..05415456cf0fd
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
@@ -0,0 +1,113 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171 = type { i32, i32, i32, i32, i32, i32, [8 x i8] }
+
+define void @SIM4() {  ; reduced CFG; the only stores are the pair in %if.then157
+entry:
+  br i1 undef, label %return, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  br i1 undef, label %return, label %if.end
+
+if.end:                                           ; preds = %lor.lhs.false
+  br i1 undef, label %for.end605, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %if.end
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc603, %for.body.lr.ph
+  br i1 undef, label %for.inc603, label %if.end12
+
+if.end12:                                         ; preds = %for.body
+  br i1 undef, label %land.lhs.true, label %land.lhs.true167
+
+land.lhs.true:                                    ; preds = %if.end12
+  br i1 undef, label %if.then17, label %land.lhs.true167
+
+if.then17:                                        ; preds = %land.lhs.true
+  br i1 undef, label %if.end98, label %land.rhs.lr.ph
+
+land.rhs.lr.ph:                                   ; preds = %if.then17
+  unreachable
+
+if.end98:                                         ; preds = %if.then17
+  %from299 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1  ; field 1 of an undef struct pointer
+  br i1 undef, label %land.lhs.true167, label %if.then103
+
+if.then103:                                       ; preds = %if.end98
+  %.sub100 = select i1 undef, i32 250, i32 undef
+  %mul114 = shl nsw i32 %.sub100, 2
+  %from1115 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0  ; field 0 of an undef struct pointer
+  %cond125 = select i1 undef, i32 undef, i32 %mul114
+  br label %for.cond.i
+
+for.cond.i:                                       ; preds = %land.rhs.i874, %if.then103
+  %row.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %.sub100, %if.then103 ]
+  %col.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %cond125, %if.then103 ]
+  br i1 undef, label %land.rhs.i874, label %for.end.i
+
+land.rhs.i874:                                    ; preds = %for.cond.i
+  br i1 undef, label %for.cond.i, label %for.end.i
+
+for.end.i:                                        ; preds = %land.rhs.i874, %for.cond.i
+  br i1 undef, label %if.then.i, label %if.end.i
+
+if.then.i:                                        ; preds = %for.end.i
+  %add14.i = add nsw i32 %row.0.i, undef
+  %add15.i = add nsw i32 %col.0.i, undef
+  br label %extend_bw.exit
+
+if.end.i:                                         ; preds = %for.end.i
+  %add16.i = add i32 %cond125, %.sub100
+  %cmp26514.i = icmp slt i32 %add16.i, 0
+  br i1 %cmp26514.i, label %for.end33.i, label %for.body28.lr.ph.i
+
+for.body28.lr.ph.i:                               ; preds = %if.end.i
+  br label %for.end33.i
+
+for.end33.i:                                      ; preds = %for.body28.lr.ph.i, %if.end.i
+  br i1 undef, label %for.end58.i, label %for.body52.lr.ph.i
+
+for.body52.lr.ph.i:                               ; preds = %for.end33.i
+  br label %for.end58.i
+
+for.end58.i:                                      ; preds = %for.body52.lr.ph.i, %for.end33.i
+  br label %while.cond260.i
+
+while.cond260.i:                                  ; preds = %land.rhs263.i, %for.end58.i
+  br i1 undef, label %land.rhs263.i, label %while.end275.i
+
+land.rhs263.i:                                    ; preds = %while.cond260.i
+  br i1 undef, label %while.cond260.i, label %while.end275.i
+
+while.end275.i:                                   ; preds = %land.rhs263.i, %while.cond260.i
+  br label %extend_bw.exit
+
+extend_bw.exit:                                   ; preds = %while.end275.i, %if.then.i
+  %add14.i1262 = phi i32 [ %add14.i, %if.then.i ], [ undef, %while.end275.i ]
+  %add15.i1261 = phi i32 [ %add15.i, %if.then.i ], [ undef, %while.end275.i ]
+  br i1 false, label %if.then157, label %land.lhs.true167  ; constant-false condition: the %if.then157 edge is never taken
+
+if.then157:                                       ; preds = %extend_bw.exit
+  %add158 = add nsw i32 %add14.i1262, 1
+  store i32 %add158, i32* %from299, align 4  ; writes field 1
+  %add160 = add nsw i32 %add15.i1261, 1
+  store i32 %add160, i32* %from1115, align 4  ; writes field 0
+  br label %land.lhs.true167
+
+land.lhs.true167:                                 ; preds = %if.then157, %extend_bw.exit, %if.end98, %land.lhs.true, %if.end12
+  unreachable
+
+for.inc603:                                       ; preds = %for.body
+  br i1 undef, label %for.body, label %for.end605
+
+for.end605:                                       ; preds = %for.inc603, %if.end
+  unreachable
+
+return:                                           ; preds = %lor.lhs.false, %entry
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
new file mode 100644
index 0000000000000..915c41bb9c593
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -0,0 +1,105 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.Ray.5.11.53.113.119.137.149.185.329.389.416 = type { %struct.Vec.0.6.48.108.114.132.144.180.324.384.414, %struct.Vec.0.6.48.108.114.132.144.180.324.384.414 }
+%struct.Vec.0.6.48.108.114.132.144.180.324.384.414 = type { double, double, double }
+
+; Function Attrs: ssp uwtable
+define void @main() #0 {  ; reduced input: four double stores into an undef Ray pointer, all in %cond.false66.us
+entry:
+  br i1 undef, label %cond.true, label %cond.end
+
+cond.true:                                        ; preds = %entry
+  unreachable
+
+cond.end:                                         ; preds = %entry
+  br label %invoke.cont
+
+invoke.cont:                                      ; preds = %invoke.cont, %cond.end
+  br i1 undef, label %arrayctor.cont, label %invoke.cont
+
+arrayctor.cont:                                   ; preds = %invoke.cont
+  %agg.tmp99208.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 0, i32 0  ; first Vec, element 0
+  %agg.tmp99208.sroa.1.8.idx388 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 0, i32 1  ; first Vec, element 1
+  %agg.tmp101211.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 1, i32 0  ; second Vec, element 0
+  %agg.tmp101211.sroa.1.8.idx390 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 1, i32 1  ; second Vec, element 1
+  br label %for.cond36.preheader
+
+for.cond36.preheader:                             ; preds = %_Z5clampd.exit.1, %arrayctor.cont
+  br i1 undef, label %for.body42.lr.ph.us, label %_Z5clampd.exit.1
+
+cond.false51.us:                                  ; preds = %for.body42.lr.ph.us
+  unreachable
+
+cond.true48.us:                                   ; preds = %for.body42.lr.ph.us
+  br i1 undef, label %cond.true63.us, label %cond.false66.us
+
+cond.false66.us:                                  ; preds = %cond.true48.us
+  %add.i276.us = fadd double 0.000000e+00, undef
+  %add.i264.us = fadd double %add.i276.us, 0.000000e+00
+  %add4.i267.us = fadd double undef, 0xBFA5CC2D1960285F
+  %mul.i254.us = fmul double %add.i264.us, 1.400000e+02
+  %mul2.i256.us = fmul double %add4.i267.us, 1.400000e+02
+  %add.i243.us = fadd double %mul.i254.us, 5.000000e+01
+  %add4.i246.us = fadd double %mul2.i256.us, 5.200000e+01
+  %mul.i.i.us = fmul double undef, %add.i264.us  ; %add.i264.us also feeds %mul.i254.us above
+  %mul2.i.i.us = fmul double undef, %add4.i267.us  ; %add4.i267.us also feeds %mul2.i256.us above
+  store double %add.i243.us, double* %agg.tmp99208.sroa.0.0.idx, align 8
+  store double %add4.i246.us, double* %agg.tmp99208.sroa.1.8.idx388, align 8
+  store double %mul.i.i.us, double* %agg.tmp101211.sroa.0.0.idx, align 8
+  store double %mul2.i.i.us, double* %agg.tmp101211.sroa.1.8.idx390, align 8
+  unreachable
+
+cond.true63.us:                                   ; preds = %cond.true48.us
+  unreachable
+
+for.body42.lr.ph.us:                              ; preds = %for.cond36.preheader
+  br i1 undef, label %cond.true48.us, label %cond.false51.us  ; both successors are defined earlier in the file than this block
+
+_Z5clampd.exit.1:                                 ; preds = %for.cond36.preheader
+  br label %for.cond36.preheader
+}
+
+
+%struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601 = type { %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600, %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 }
+%struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 = type { double, double, double }
+
+define void @_Z8radianceRK3RayiPt() #0 {  ; reduced input: two parallel fmul/fsub/fadd chains feeding two stores in %if.then38
+entry:
+  br i1 undef, label %if.then78, label %if.then38
+
+if.then38:                                        ; preds = %entry
+  %mul.i.i790 = fmul double undef, undef
+  %mul3.i.i792 = fmul double undef, undef  ; used by both chains (%mul.i764 and %mul9.i772)
+  %mul.i764 = fmul double undef, %mul3.i.i792
+  %mul4.i767 = fmul double undef, undef
+  %sub.i768 = fsub double %mul.i764, %mul4.i767
+  %mul6.i770 = fmul double undef, %mul.i.i790
+  %mul9.i772 = fmul double undef, %mul3.i.i792
+  %sub10.i773 = fsub double %mul6.i770, %mul9.i772
+  %mul.i736 = fmul double undef, %sub.i768
+  %mul2.i738 = fmul double undef, %sub10.i773
+  %mul.i727 = fmul double undef, %mul.i736
+  %mul2.i729 = fmul double undef, %mul2.i738
+  %add.i716 = fadd double undef, %mul.i727
+  %add4.i719 = fadd double undef, %mul2.i729
+  %add.i695 = fadd double undef, %add.i716
+  %add4.i698 = fadd double undef, %add4.i719
+  %mul.i.i679 = fmul double undef, %add.i695
+  %mul2.i.i680 = fmul double undef, %add4.i698
+  %agg.tmp74663.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0  ; second Vec, element 0
+  store double %mul.i.i679, double* %agg.tmp74663.sroa.0.0.idx, align 8
+  %agg.tmp74663.sroa.1.8.idx943 = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 1  ; second Vec, element 1
+  store double %mul2.i.i680, double* %agg.tmp74663.sroa.1.8.idx943, align 8
+  br label %return
+
+if.then78:                                        ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %if.then78, %if.then38
+  ret void
+}
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll b/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
new file mode 100644
index 0000000000000..06c4b524ee958
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; int foo(double *A, float *B, int g) {
+;   float B0 = B[0];
+;   float B1 = B[1]; <----- BasicBlock #1 (%entry: loads and float adds)
+;   B0 += 5;
+;   B1 += 8;
+;
+;   if (g) bar();    <----- conditional call in its own block (%if.then)
+;
+;   A[0] += B0;     <------- BasicBlock #3 (%if.end: fpext, double adds, stores)
+;   A[1] += B1;
+; }
+
+
+;CHECK-LABEL: @foo(
+;CHECK: load <2 x float>
+;CHECK: fadd <2 x float>
+;CHECK: call i32
+;CHECK: load <2 x double>
+;CHECK: fadd <2 x double>
+;CHECK: store <2 x double>
+;CHECK: ret
+define i32 @foo(double* nocapture %A, float* nocapture %B, i32 %g) {
+entry:
+  %0 = load float* %B, align 4
+  %arrayidx1 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx1, align 4
+  %add = fadd float %0, 5.000000e+00
+  %add2 = fadd float %1, 8.000000e+00
+  %tobool = icmp eq i32 %g, 0
+  br i1 %tobool, label %if.end, label %if.then  ; g == 0 skips the call
+
+if.then:                                          ; preds = %entry
+  %call = tail call i32 (...)* @bar()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %conv = fpext float %add to double  ; %add is defined in %entry (cross-block operand)
+  %2 = load double* %A, align 8
+  %add4 = fadd double %conv, %2
+  store double %add4, double* %A, align 8
+  %conv5 = fpext float %add2 to double  ; %add2 is defined in %entry (cross-block operand)
+  %arrayidx6 = getelementptr inbounds double* %A, i64 1
+  %3 = load double* %arrayidx6, align 8
+  %add7 = fadd double %conv5, %3
+  store double %add7, double* %arrayidx6, align 8
+  ret i32 undef
+}
+
+declare i32 @bar(...)
diff --git a/test/Transforms/SLPVectorizer/X86/cse.ll b/test/Transforms/SLPVectorizer/X86/cse.ll
new file mode 100644
index 0000000000000..bbfd6f28ea974
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -0,0 +1,219 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+;int test(double *G) {
+;  G[0] = 1+G[5]*4;
+;  G[1] = 6+G[6]*3;
+;  G[2] = 7+G[5]*4;
+;  G[3] = 8+G[6]*4;
+;}
+
+;CHECK-LABEL: @test(
+;CHECK: load <2 x double>
+;CHECK: fadd <2 x double>
+;CHECK: store <2 x double>
+;CHECK: insertelement <2 x double>
+;CHECK: fadd <2 x double>
+;CHECK: store <2 x double>
+;CHECK: ret i32
+
+define i32 @test(double* nocapture %G) {
+entry:
+  %arrayidx = getelementptr inbounds double* %G, i64 5
+  %0 = load double* %arrayidx, align 8
+  %mul = fmul double %0, 4.000000e+00  ; G[5]*4, used by both %add (G[0]) and %add8 (G[2])
+  %add = fadd double %mul, 1.000000e+00
+  store double %add, double* %G, align 8
+  %arrayidx2 = getelementptr inbounds double* %G, i64 6
+  %1 = load double* %arrayidx2, align 8
+  %mul3 = fmul double %1, 3.000000e+00  ; G[6]*3, used only for G[1]
+  %add4 = fadd double %mul3, 6.000000e+00
+  %arrayidx5 = getelementptr inbounds double* %G, i64 1
+  store double %add4, double* %arrayidx5, align 8
+  %add8 = fadd double %mul, 7.000000e+00
+  %arrayidx9 = getelementptr inbounds double* %G, i64 2
+  store double %add8, double* %arrayidx9, align 8
+  %mul11 = fmul double %1, 4.000000e+00  ; G[6]*4: same load as %mul3, different constant
+  %add12 = fadd double %mul11, 8.000000e+00
+  %arrayidx13 = getelementptr inbounds double* %G, i64 3
+  store double %add12, double* %arrayidx13, align 8
+  ret i32 undef
+}
+
+;int foo(double *A, int n) {
+;  A[0] = A[0] * 7.9 * n + 6.0;
+;  A[1] = A[1] * 7.7 * n + 2.0;
+;  A[2] = A[2] * 7.6 * n + 3.0;
+;  A[3] = A[3] * 7.4 * n + 4.0;
+;}
+;CHECK-LABEL: @foo(
+;CHECK: insertelement <2 x double>
+;CHECK: insertelement <2 x double>
+;CHECK-NOT: insertelement <2 x double>
+;CHECK: ret
+define i32 @foo(double* nocapture %A, i32 %n) {
+entry:
+  %0 = load double* %A, align 8
+  %mul = fmul double %0, 7.900000e+00
+  %conv = sitofp i32 %n to double  ; (double)n, the one scalar shared by all four statements
+  %mul1 = fmul double %conv, %mul
+  %add = fadd double %mul1, 6.000000e+00
+  store double %add, double* %A, align 8
+  %arrayidx3 = getelementptr inbounds double* %A, i64 1
+  %1 = load double* %arrayidx3, align 8
+  %mul4 = fmul double %1, 7.700000e+00
+  %mul6 = fmul double %conv, %mul4
+  %add7 = fadd double %mul6, 2.000000e+00
+  store double %add7, double* %arrayidx3, align 8
+  %arrayidx9 = getelementptr inbounds double* %A, i64 2
+  %2 = load double* %arrayidx9, align 8
+  %mul10 = fmul double %2, 7.600000e+00
+  %mul12 = fmul double %conv, %mul10
+  %add13 = fadd double %mul12, 3.000000e+00
+  store double %add13, double* %arrayidx9, align 8
+  %arrayidx15 = getelementptr inbounds double* %A, i64 3
+  %3 = load double* %arrayidx15, align 8
+  %mul16 = fmul double %3, 7.400000e+00
+  %mul18 = fmul double %conv, %mul16
+  %add19 = fadd double %mul18, 4.000000e+00
+  store double %add19, double* %arrayidx15, align 8
+  ret i32 undef
+}
+
+; int test2(double *G, int k) {
+;   if (k) {
+;     G[0] = 1+G[5]*4;
+;     G[1] = 6+G[6]*3;
+;   } else {
+;     G[2] = 7+G[5]*4;
+;     G[3] = 8+G[6]*3;
+;   }
+; }
+
+; We can't merge the gather sequences because one does not dominate the other.
+; CHECK-LABEL: @test2(
+; CHECK: insertelement
+; CHECK: insertelement
+; CHECK: insertelement
+; CHECK: insertelement
+; CHECK: ret
+define i32 @test2(double* nocapture %G, i32 %k) {
+  %1 = icmp eq i32 %k, 0
+  %2 = getelementptr inbounds double* %G, i64 5
+  %3 = load double* %2, align 8
+  %4 = fmul double %3, 4.000000e+00  ; G[5]*4, computed once in the entry block and used by both branches
+  br i1 %1, label %12, label %5  ; k == 0 takes the else side (%12)
+
+; <label>:5                                       ; preds = %0
+  %6 = fadd double %4, 1.000000e+00
+  store double %6, double* %G, align 8
+  %7 = getelementptr inbounds double* %G, i64 6
+  %8 = load double* %7, align 8
+  %9 = fmul double %8, 3.000000e+00  ; G[6]*3 for the if side
+  %10 = fadd double %9, 6.000000e+00
+  %11 = getelementptr inbounds double* %G, i64 1
+  store double %10, double* %11, align 8
+  br label %20
+
+; <label>:12                                      ; preds = %0
+  %13 = fadd double %4, 7.000000e+00
+  %14 = getelementptr inbounds double* %G, i64 2
+  store double %13, double* %14, align 8
+  %15 = getelementptr inbounds double* %G, i64 6
+  %16 = load double* %15, align 8
+  %17 = fmul double %16, 3.000000e+00  ; G[6]*3 again, reloaded on the else side
+  %18 = fadd double %17, 8.000000e+00
+  %19 = getelementptr inbounds double* %G, i64 3
+  store double %18, double* %19, align 8
+  br label %20
+
+; <label>:20                                      ; preds = %12, %5
+  ret i32 undef
+}
+
+
+;int foo4(double *A, int n) {
+;  A[0] = A[0] * 7.9 * n + 6.0;
+;  A[1] = A[1] * 7.9 * n + 6.0;
+;  A[2] = A[2] * 7.9 * n + 6.0;
+;  A[3] = A[3] * 7.9 * n + 6.0;
+;}
+;CHECK-LABEL: @foo4(
+;CHECK: insertelement <2 x double>
+;CHECK: insertelement <2 x double>
+;CHECK-NOT: insertelement <2 x double>
+;CHECK: ret
+define i32 @foo4(double* nocapture %A, i32 %n) {
+entry:
+  %0 = load double* %A, align 8
+  %mul = fmul double %0, 7.900000e+00
+  %conv = sitofp i32 %n to double  ; (double)n, the one scalar shared by all four statements
+  %mul1 = fmul double %conv, %mul
+  %add = fadd double %mul1, 6.000000e+00
+  store double %add, double* %A, align 8
+  %arrayidx3 = getelementptr inbounds double* %A, i64 1
+  %1 = load double* %arrayidx3, align 8
+  %mul4 = fmul double %1, 7.900000e+00
+  %mul6 = fmul double %conv, %mul4
+  %add7 = fadd double %mul6, 6.000000e+00
+  store double %add7, double* %arrayidx3, align 8
+  %arrayidx9 = getelementptr inbounds double* %A, i64 2
+  %2 = load double* %arrayidx9, align 8
+  %mul10 = fmul double %2, 7.900000e+00
+  %mul12 = fmul double %conv, %mul10
+  %add13 = fadd double %mul12, 6.000000e+00
+  store double %add13, double* %arrayidx9, align 8
+  %arrayidx15 = getelementptr inbounds double* %A, i64 3
+  %3 = load double* %arrayidx15, align 8
+  %mul16 = fmul double %3, 7.900000e+00
+  %mul18 = fmul double %conv, %mul16
+  %add19 = fadd double %mul18, 6.000000e+00
+  store double %add19, double* %arrayidx15, align 8
+  ret i32 undef
+}
+
+;int partial_mrg(double *A, int n) {
+;  A[0] = A[0] * n;
+;  A[1] = A[1] * n;
+;  if (n < 4) return 0;
+;  A[2] = A[2] * n;
+;  A[3] = A[3] * (n+4);
+;}
+;CHECK-LABEL: @partial_mrg(
+;CHECK: insertelement <2 x double>
+;CHECK: insertelement <2 x double>
+;CHECK: insertelement <2 x double>
+;CHECK-NOT: insertelement <2 x double>
+;CHECK: ret
+define i32 @partial_mrg(double* nocapture %A, i32 %n) {
+entry:
+  %0 = load double* %A, align 8
+  %conv = sitofp i32 %n to double  ; (double)n, used in %entry and again in %if.end
+  %mul = fmul double %conv, %0
+  store double %mul, double* %A, align 8
+  %arrayidx2 = getelementptr inbounds double* %A, i64 1
+  %1 = load double* %arrayidx2, align 8
+  %mul4 = fmul double %conv, %1
+  store double %mul4, double* %arrayidx2, align 8
+  %cmp = icmp slt i32 %n, 4
+  br i1 %cmp, label %return, label %if.end  ; n < 4 returns before A[2] and A[3] are touched
+
+if.end:                                           ; preds = %entry
+  %arrayidx7 = getelementptr inbounds double* %A, i64 2
+  %2 = load double* %arrayidx7, align 8
+  %mul9 = fmul double %conv, %2
+  store double %mul9, double* %arrayidx7, align 8
+  %arrayidx11 = getelementptr inbounds double* %A, i64 3
+  %3 = load double* %arrayidx11, align 8
+  %add = add nsw i32 %n, 4
+  %conv12 = sitofp i32 %add to double  ; (double)(n+4): the A[3] multiplier differs from the A[2] one
+  %mul13 = fmul double %conv12, %3
+  store double %mul13, double* %arrayidx11, align 8
+  br label %return
+
+return:                                           ; preds = %entry, %if.end
+  ret i32 0
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/cycle_dup.ll b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
new file mode 100644
index 0000000000000..fba35499fb7d5
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
@@ -0,0 +1,64 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; int foo(int *A) {
+;   int r = A[0], g = A[1], b = A[2], a = A[3];
+;   for (int i=0; i < A[13]; i++) {
+;     r*=18; g*=19; b*=12; a *=9;
+;   }
+;   A[0] = r; A[1] = g; A[2] = b; A[3] = a;
+; }
+
+;CHECK-LABEL: @foo(
+;CHECK: bitcast i32* %A to <4 x i32>*
+;CHECK-NEXT: load <4 x i32>
+;CHECK: phi <4 x i32>
+;CHECK-NEXT: mul <4 x i32>
+;CHECK-NOT: mul
+;CHECK: phi <4 x i32>
+;CHECK: bitcast i32* %A to <4 x i32>*
+;CHECK-NEXT: store <4 x i32>
+;CHECK-NEXT: ret i32 undef
+define i32 @foo(i32* nocapture %A) #0 {
+entry:
+  %0 = load i32* %A, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i64 1
+  %1 = load i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 2
+  %2 = load i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 3
+  %3 = load i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 13
+  %4 = load i32* %arrayidx4, align 4  ; A[13], the loop trip bound
+  %cmp24 = icmp sgt i32 %4, 0
+  br i1 %cmp24, label %for.body, label %for.end  ; skip the loop when A[13] <= 0
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.029 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %a.028 = phi i32 [ %mul7, %for.body ], [ %3, %entry ]
+  %b.027 = phi i32 [ %mul6, %for.body ], [ %2, %entry ]
+  %g.026 = phi i32 [ %mul5, %for.body ], [ %1, %entry ]
+  %r.025 = phi i32 [ %mul, %for.body ], [ %0, %entry ]
+  %mul = mul nsw i32 %r.025, 18  ; each multiply feeds its own phi above, forming a cycle
+  %mul5 = mul nsw i32 %g.026, 19
+  %mul6 = mul nsw i32 %b.027, 12
+  %mul7 = mul nsw i32 %a.028, 9
+  %inc = add nsw i32 %i.029, 1
+  %cmp = icmp slt i32 %inc, %4
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  %a.0.lcssa = phi i32 [ %3, %entry ], [ %mul7, %for.body ]
+  %b.0.lcssa = phi i32 [ %2, %entry ], [ %mul6, %for.body ]
+  %g.0.lcssa = phi i32 [ %1, %entry ], [ %mul5, %for.body ]
+  %r.0.lcssa = phi i32 [ %0, %entry ], [ %mul, %for.body ]
+  store i32 %r.0.lcssa, i32* %A, align 4
+  store i32 %g.0.lcssa, i32* %arrayidx1, align 4
+  store i32 %b.0.lcssa, i32* %arrayidx2, align 4
+  store i32 %a.0.lcssa, i32* %arrayidx3, align 4
+  ret i32 undef
+}
+
+
diff --git a/test/Transforms/SLPVectorizer/X86/debug_info.ll b/test/Transforms/SLPVectorizer/X86/debug_info.ll
new file mode 100644
index 0000000000000..f4e68f217f254
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/debug_info.ll
@@ -0,0 +1,89 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; int depth(double *A, int m) {
+;   double y0 = 0; double y1 = 1;
+;   for (int i=0; i < m; i++) {
+;     y0 = A[4];
+;     y1 = A[5];
+;   }
+;   A[8] = y0; A[8+1] = y1;
+; }
+
+;CHECK-LABEL: @depth(
+;CHECK: getelementptr inbounds {{.*}}, !dbg ![[LOC:[0-9]+]]
+;CHECK: bitcast double* {{.*}}, !dbg ![[LOC]]
+;CHECK: load <2 x double>* {{.*}}, !dbg ![[LOC]]
+;CHECK: store <2 x double> {{.*}}, !dbg ![[LOC2:[0-9]+]]
+;CHECK: ret
+;CHECK: ![[LOC]] = metadata !{i32 4, i32 0,
+;CHECK: ![[LOC2]] = metadata !{i32 7, i32 0,
+
+define i32 @depth(double* nocapture %A, i32 %m) #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{double* %A}, i64 0, metadata !12), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i32 %m}, i64 0, metadata !13), !dbg !19
+  tail call void @llvm.dbg.value(metadata !20, i64 0, metadata !14), !dbg !21
+  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !15), !dbg !21
+  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !16), !dbg !23
+  %cmp8 = icmp sgt i32 %m, 0, !dbg !23
+  br i1 %cmp8, label %for.body.lr.ph, label %for.end, !dbg !23
+
+for.body.lr.ph:                                   ; preds = %entry
+  %arrayidx = getelementptr inbounds double* %A, i64 4, !dbg !24  ; !24 is source line 4
+  %0 = load double* %arrayidx, align 8, !dbg !24
+  %arrayidx1 = getelementptr inbounds double* %A, i64 5, !dbg !29  ; !29 is source line 5
+  %1 = load double* %arrayidx1, align 8, !dbg !29
+  br label %for.end, !dbg !23
+
+for.end:                                          ; preds = %for.body.lr.ph, %entry
+  %y1.0.lcssa = phi double [ %1, %for.body.lr.ph ], [ 1.000000e+00, %entry ]
+  %y0.0.lcssa = phi double [ %0, %for.body.lr.ph ], [ 0.000000e+00, %entry ]
+  %arrayidx2 = getelementptr inbounds double* %A, i64 8, !dbg !30  ; !30 is source line 7
+  store double %y0.0.lcssa, double* %arrayidx2, align 8, !dbg !30
+  %arrayidx3 = getelementptr inbounds double* %A, i64 9, !dbg !30
+  store double %y1.0.lcssa, double* %arrayidx3, align 8, !dbg !30
+  ret i32 undef, !dbg !31
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18, !32}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 187335) (llvm/trunk 187335:187340M)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/nadav/file.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"file.c", metadata !"/Users/nadav"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"depth", metadata !"depth", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (double*, i32)* @depth, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [depth]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/nadav/file.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !9, metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
+!10 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!11 = metadata !{metadata !12, metadata !13, metadata !14, metadata !15, metadata !16}
+!12 = metadata !{i32 786689, metadata !4, metadata !"A", metadata !5, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [A] [line 1]
+!13 = metadata !{i32 786689, metadata !4, metadata !"m", metadata !5, i32 33554433, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [m] [line 1]
+!14 = metadata !{i32 786688, metadata !4, metadata !"y0", metadata !5, i32 2, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [y0] [line 2]
+!15 = metadata !{i32 786688, metadata !4, metadata !"y1", metadata !5, i32 2, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [y1] [line 2]
+!16 = metadata !{i32 786688, metadata !17, metadata !"i", metadata !5, i32 3, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3]
+!17 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
+!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!19 = metadata !{i32 1, i32 0, metadata !4, null}
+!20 = metadata !{double 0.000000e+00}
+!21 = metadata !{i32 2, i32 0, metadata !4, null}
+!22 = metadata !{double 1.000000e+00}
+!23 = metadata !{i32 3, i32 0, metadata !17, null}
+!24 = metadata !{i32 4, i32 0, metadata !25, null}
+!25 = metadata !{i32 786443, metadata !1, metadata !17, i32 3, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
+!29 = metadata !{i32 5, i32 0, metadata !25, null}
+!30 = metadata !{i32 7, i32 0, metadata !4, null}
+!31 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/SLPVectorizer/X86/diamond.ll b/test/Transforms/SLPVectorizer/X86/diamond.ll
index 8e85cb6c9b8f8..5135a92a7bdbd 100644
--- a/test/Transforms/SLPVectorizer/X86/diamond.ll
+++ b/test/Transforms/SLPVectorizer/X86/diamond.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;   return 0;
 ; }
 
-; CHECK: @foo
+; CHECK-LABEL: @foo(
 ; CHECK: load <4 x i32>
 ; CHECK: mul <4 x i32>
 ; CHECK: store <4 x i32>
@@ -41,7 +41,7 @@ entry:
 }
 
 
-; int foo_fail(int * restrict B,  int * restrict A, int n, int m) {
+; int extr_user(int * restrict B,  int * restrict A, int n, int m) {
 ;   B[0] = n * A[0] + m * A[0];
 ;   B[1] = n * A[1] + m * A[1];
 ;   B[2] = n * A[2] + m * A[2];
@@ -49,10 +49,12 @@ entry:
 ;   return A[0];
 ; }
 
-; CHECK: @foo_fail
-; CHECK-NOT: load <4 x i32>
-; CHECK: ret
-define i32 @foo_fail(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
+; CHECK-LABEL: @extr_user(
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK-NEXT: ret
+define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
 entry:
   %0 = load i32* %A, align 4
   %mul238 = add i32 %m, %n
@@ -73,6 +75,35 @@ entry:
   %add20 = mul i32 %3, %mul238
   %arrayidx21 = getelementptr inbounds i32* %B, i64 3
   store i32 %add20, i32* %arrayidx21, align 4
-  ret i32 %0  ;<--------- This value has multiple users and can't be vectorized.
+  ret i32 %0  ;<--------- This value has multiple users
 }
 
+; In this example we have an external user that is not the first element in the vector.
+; CHECK-LABEL: @extr_user1(
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: extractelement <4 x i32>
+; CHECK-NEXT: ret
+define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
+entry:
+  %0 = load i32* %A, align 4
+  %mul238 = add i32 %m, %n
+  %add = mul i32 %0, %mul238
+  store i32 %add, i32* %B, align 4
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 1
+  %1 = load i32* %arrayidx4, align 4              ; A[1]; also returned at the end of the function
+  %add8 = mul i32 %1, %mul238
+  %arrayidx9 = getelementptr inbounds i32* %B, i64 1
+  store i32 %add8, i32* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32* %A, i64 2
+  %2 = load i32* %arrayidx10, align 4
+  %add14 = mul i32 %2, %mul238
+  %arrayidx15 = getelementptr inbounds i32* %B, i64 2
+  store i32 %add14, i32* %arrayidx15, align 4
+  %arrayidx16 = getelementptr inbounds i32* %A, i64 3
+  %3 = load i32* %arrayidx16, align 4
+  %add20 = mul i32 %3, %mul238
+  %arrayidx21 = getelementptr inbounds i32* %B, i64 3
+  store i32 %add20, i32* %arrayidx21, align 4
+  ret i32 %1  ;<--------- %1 (loaded from A[1]) has multiple users: %add8 and this ret
+}
diff --git a/test/Transforms/SLPVectorizer/X86/external_user.ll b/test/Transforms/SLPVectorizer/X86/external_user.ll
new file mode 100644
index 0000000000000..6d09aa61bf350
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/external_user.ll
@@ -0,0 +1,96 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; double foo(double * restrict b,  double * restrict a, int n, int m) {
+;   double r=a[1];
+;   double g=a[0];
+;   double x;
+;   for (int i=0; i < 100; i++) {
+;     r += 10;
+;     g += 10;
+;     r *= 4;
+;     g *= 4;
+;     x = g; <----- external user!
+;     r += 4;
+;     g += 4;
+;   }
+;   b[0] = g;
+;   b[1] = r;
+;
+;   return x; <-- must extract here!
+; }
+
+;CHECK-LABEL: @ext_user(
+;CHECK: phi <2 x double>
+;CHECK: fadd <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: br
+;CHECK: store <2 x double>
+;CHECK: extractelement <2 x double>
+;CHECK: ret double
+; The label check anchors the checks above to the body of @ext_user.
+define double @ext_user(double* noalias nocapture %B, double* noalias nocapture %A, i32 %n, i32 %m) {
+entry:
+  %arrayidx = getelementptr inbounds double* %A, i64 1
+  %0 = load double* %arrayidx, align 8
+  %1 = load double* %A, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.020 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %G.019 = phi double [ %1, %entry ], [ %add5, %for.body ]
+  %R.018 = phi double [ %0, %entry ], [ %add4, %for.body ]
+  %add = fadd double %R.018, 1.000000e+01
+  %add2 = fadd double %G.019, 1.000000e+01
+  %mul = fmul double %add, 4.000000e+00
+  %mul3 = fmul double %add2, 4.000000e+00         ; also used by the ret in %for.end
+  %add4 = fadd double %mul, 4.000000e+00
+  %add5 = fadd double %mul3, 4.000000e+00
+  %inc = add nsw i32 %i.020, 1
+  %exitcond = icmp eq i32 %inc, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  store double %add5, double* %B, align 8
+  %arrayidx7 = getelementptr inbounds double* %B, i64 1
+  store double %add4, double* %arrayidx7, align 8
+  ret double %mul3                                ; use of %mul3 outside the loop that defines it
+}
+
+; A need-to-gather entry cannot be an external use of the scalar element.
+; Instead the insertelement instructions of the need-to-gather entry are the
+; external users.
+; This test would assert because we would keep the scalar fpext and fadd alive.
+; PR18129
+; Only the function label is checked below; no other output is verified.
+; CHECK-LABEL: needtogather
+define i32 @needtogather(double *noalias %a, i32 *noalias %b,  float * noalias %c,
+                i32 * noalias %d) {
+entry:
+  %0 = load i32* %d, align 4
+  %conv = sitofp i32 %0 to float
+  %1 = load float* %c
+  %sub = fsub float 0.000000e+00, %1
+  %mul = fmul float %sub, 0.000000e+00
+  %add = fadd float %conv, %mul
+  %conv1 = fpext float %add to double
+  %sub3 = fsub float 1.000000e+00, %1
+  %mul4 = fmul float %sub3, 0.000000e+00
+  %add5 = fadd float %conv, %mul4
+  %conv6 = fpext float %add5 to double
+  %tobool = fcmp une float %add, 0.000000e+00     ; scalar %add has a second user besides %conv1
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+  br label %if.end
+
+if.end:
+  %storemerge = phi double [ %conv6, %if.then ], [ %conv1, %entry ]  ; %conv6 / %conv1
+  %e.0 = phi double [ %conv1, %if.then ], [ %conv6, %entry ]         ; same pair, opposite order
+  store double %storemerge, double* %a, align 8
+  %conv7 = fptosi double %e.0 to i32
+  store i32 %conv7, i32* %b, align 4
+  ret i32 undef
+}
diff --git a/test/Transforms/SLPVectorizer/X86/extract.ll b/test/Transforms/SLPVectorizer/X86/extract.ll
new file mode 100644
index 0000000000000..f611fd4ec24f9
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/extract.ll
@@ -0,0 +1,59 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK-LABEL: @fextr(
+;CHECK-NOT: insertelement
+;CHECK-NOT: extractelement
+;CHECK: fadd <2 x double>
+;CHECK: ret void
+define void @fextr(double* %ptr) {
+entry:
+  %LD = load <2 x double>* undef
+  %V0 = extractelement <2 x double> %LD, i32 0    ; lane 0
+  %V1 = extractelement <2 x double> %LD, i32 1    ; lane 1
+  %P0 = getelementptr inbounds double* %ptr, i64 0
+  %P1 = getelementptr inbounds double* %ptr, i64 1
+  %A0 = fadd double %V0, 0.0
+  %A1 = fadd double %V1, 1.1
+  store double %A0, double* %P0, align 4          ; lane 0 result goes to ptr[0]
+  store double %A1, double* %P1, align 4          ; lane 1 result goes to ptr[1]
+  ret void
+}
+
+;CHECK-LABEL: @fextr1(
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: ret void
+define void @fextr1(double* %ptr) {
+entry:
+  %LD = load <2 x double>* undef
+  %V0 = extractelement <2 x double> %LD, i32 0
+  %V1 = extractelement <2 x double> %LD, i32 1
+  %P0 = getelementptr inbounds double* %ptr, i64 1  ; <--- incorrect order
+  %P1 = getelementptr inbounds double* %ptr, i64 0
+  %A0 = fadd double %V0, 1.2
+  %A1 = fadd double %V1, 3.4
+  store double %A0, double* %P0, align 4          ; lane 0 result goes to ptr[1]
+  store double %A1, double* %P1, align 4          ; lane 1 result goes to ptr[0]
+  ret void
+}
+
+;CHECK-LABEL: @fextr2(
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: ret void
+define void @fextr2(double* %ptr) {
+entry:
+  %LD = load <4 x double>* undef
+  %V0 = extractelement <4 x double> %LD, i32 0  ; <--- invalid size.
+  %V1 = extractelement <4 x double> %LD, i32 1    ; only lanes 0 and 1 of the 4-wide load are used
+  %P0 = getelementptr inbounds double* %ptr, i64 0
+  %P1 = getelementptr inbounds double* %ptr, i64 1
+  %A0 = fadd double %V0, 5.5
+  %A1 = fadd double %V1, 6.6
+  store double %A0, double* %P0, align 4
+  store double %A1, double* %P1, align 4
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
new file mode 100644
index 0000000000000..8f919512ff8da
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -0,0 +1,417 @@
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; #include <stdint.h>
+;
+; int foo(float *A, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += 7*A[i*4  ] +
+;            7*A[i*4+1] +
+;            7*A[i*4+2] +
+;            7*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; NOSTORE-LABEL: add_red
+; NOSTORE: fmul <4 x float>
+; NOSTORE: shufflevector <4 x float>
+; Checked with the NOSTORE prefix, i.e. by the opt invocation without -slp-vectorize-hor-store.
+define i32 @add_red(float* %A, i32 %n) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
+  %mul = shl nsw i64 %i.033, 2                    ; i*4
+  %arrayidx = getelementptr inbounds float* %A, i64 %mul
+  %1 = load float* %arrayidx, align 4
+  %mul2 = fmul float %1, 7.000000e+00
+  %add28 = or i64 %mul, 1                         ; i*4+1 (the low two bits of %mul are zero)
+  %arrayidx4 = getelementptr inbounds float* %A, i64 %add28
+  %2 = load float* %arrayidx4, align 4
+  %mul5 = fmul float %2, 7.000000e+00
+  %add6 = fadd fast float %mul2, %mul5
+  %add829 = or i64 %mul, 2                        ; i*4+2
+  %arrayidx9 = getelementptr inbounds float* %A, i64 %add829
+  %3 = load float* %arrayidx9, align 4
+  %mul10 = fmul float %3, 7.000000e+00
+  %add11 = fadd fast float %add6, %mul10
+  %add1330 = or i64 %mul, 3                       ; i*4+3
+  %arrayidx14 = getelementptr inbounds float* %A, i64 %add1330
+  %4 = load float* %arrayidx14, align 4
+  %mul15 = fmul float %4, 7.000000e+00
+  %add16 = fadd fast float %add11, %mul15         ; sum of the four products for this iteration
+  %add17 = fadd fast float %sum.032, %add16       ; fold into the running sum carried by the phi
+  %inc = add nsw i64 %i.033, 1
+  %exitcond = icmp eq i64 %inc, %0
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add17 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum *= B[0]*A[i*4  ] +
+;       B[1]*A[i*4+1] +
+;       B[2]*A[i*4+2] +
+;       B[3]*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: mul_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+; NOTE(review): the checks above use the default FileCheck prefix, but both opt invocations in this file pass --check-prefix (NOSTORE / STORE), so they look unmatched - confirm.
+define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp38 = icmp sgt i32 %n, 0
+  br i1 %cmp38, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4                    ; B[0..3] are loaded once, before the loop
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx9, align 4
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx15, align 4
+  %4 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
+  %mul = shl nsw i64 %i.040, 2                    ; i*4
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %5 = load float* %arrayidx2, align 4
+  %mul3 = fmul float %0, %5
+  %add35 = or i64 %mul, 1                         ; i*4+1
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add35
+  %6 = load float* %arrayidx6, align 4
+  %mul7 = fmul float %1, %6
+  %add8 = fadd fast float %mul3, %mul7
+  %add1136 = or i64 %mul, 2                       ; i*4+2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add1136
+  %7 = load float* %arrayidx12, align 4
+  %mul13 = fmul float %2, %7
+  %add14 = fadd fast float %add8, %mul13
+  %add1737 = or i64 %mul, 3                       ; i*4+3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add1737
+  %8 = load float* %arrayidx18, align 4
+  %mul19 = fmul float %3, %8
+  %add20 = fadd fast float %add14, %mul19         ; sum of the four products for this iteration
+  %mul21 = fmul float %sum.039, %add20            ; the accumulator is multiplied, not added
+  %inc = add nsw i64 %i.040, 1
+  %exitcond = icmp eq i64 %inc, %4
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %mul21 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += B[0]*A[i*6  ] +
+;            B[1]*A[i*6+1] +
+;            B[2]*A[i*6+2] +
+;            B[3]*A[i*6+3] +
+;            B[4]*A[i*6+4] +
+;            B[5]*A[i*6+5] +
+;            B[6]*A[i*6+6] +
+;            B[7]*A[i*6+7] +
+;            B[8]*A[i*6+8];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: long_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+; NOTE(review): the checks above use the default FileCheck prefix, but both opt invocations in this file pass --check-prefix (NOSTORE / STORE), so they look unmatched - confirm.
+define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp81 = icmp sgt i32 %n, 0
+  br i1 %cmp81, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4                    ; B[0..8] are loaded once, before the loop
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx9, align 4
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx15, align 4
+  %arrayidx21 = getelementptr inbounds float* %B, i64 4
+  %4 = load float* %arrayidx21, align 4
+  %arrayidx27 = getelementptr inbounds float* %B, i64 5
+  %5 = load float* %arrayidx27, align 4
+  %arrayidx33 = getelementptr inbounds float* %B, i64 6
+  %6 = load float* %arrayidx33, align 4
+  %arrayidx39 = getelementptr inbounds float* %B, i64 7
+  %7 = load float* %arrayidx39, align 4
+  %arrayidx45 = getelementptr inbounds float* %B, i64 8
+  %8 = load float* %arrayidx45, align 4
+  %9 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
+  %mul = mul nsw i64 %i.083, 6                    ; i*6
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %10 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %10
+  %add80 = or i64 %mul, 1                         ; i*6+1, written as an or
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add80
+  %11 = load float* %arrayidx6, align 4
+  %mul7 = fmul fast float %1, %11
+  %add8 = fadd fast float %mul3, %mul7
+  %add11 = add nsw i64 %mul, 2                    ; remaining offsets use add instead of or
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add11
+  %12 = load float* %arrayidx12, align 4
+  %mul13 = fmul fast float %2, %12
+  %add14 = fadd fast float %add8, %mul13
+  %add17 = add nsw i64 %mul, 3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add17
+  %13 = load float* %arrayidx18, align 4
+  %mul19 = fmul fast float %3, %13
+  %add20 = fadd fast float %add14, %mul19
+  %add23 = add nsw i64 %mul, 4
+  %arrayidx24 = getelementptr inbounds float* %A, i64 %add23
+  %14 = load float* %arrayidx24, align 4
+  %mul25 = fmul fast float %4, %14
+  %add26 = fadd fast float %add20, %mul25
+  %add29 = add nsw i64 %mul, 5
+  %arrayidx30 = getelementptr inbounds float* %A, i64 %add29
+  %15 = load float* %arrayidx30, align 4
+  %mul31 = fmul fast float %5, %15
+  %add32 = fadd fast float %add26, %mul31
+  %add35 = add nsw i64 %mul, 6
+  %arrayidx36 = getelementptr inbounds float* %A, i64 %add35
+  %16 = load float* %arrayidx36, align 4
+  %mul37 = fmul fast float %6, %16
+  %add38 = fadd fast float %add32, %mul37
+  %add41 = add nsw i64 %mul, 7
+  %arrayidx42 = getelementptr inbounds float* %A, i64 %add41
+  %17 = load float* %arrayidx42, align 4
+  %mul43 = fmul fast float %7, %17
+  %add44 = fadd fast float %add38, %mul43
+  %add47 = add nsw i64 %mul, 8
+  %arrayidx48 = getelementptr inbounds float* %A, i64 %add47
+  %18 = load float* %arrayidx48, align 4
+  %mul49 = fmul fast float %8, %18
+  %add50 = fadd fast float %add44, %mul49         ; sum of the nine products for this iteration
+  %add51 = fadd fast float %sum.082, %add50       ; fold into the running sum carried by the phi
+  %inc = add nsw i64 %i.083, 1
+  %exitcond = icmp eq i64 %inc, %9
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add51 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += B[0]*A[i*4  ];
+;     sum += B[1]*A[i*4+1];
+;     sum += B[2]*A[i*4+2];
+;     sum += B[3]*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: chain_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+; NOTE(review): the checks above use the default FileCheck prefix, but both opt invocations in this file pass --check-prefix (NOSTORE / STORE), so they look unmatched - confirm.
+define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp41 = icmp sgt i32 %n, 0
+  br i1 %cmp41, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4                    ; B[0..3] are loaded once, before the loop
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx10 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx10, align 4
+  %arrayidx16 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx16, align 4
+  %4 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
+  %mul = shl nsw i64 %i.043, 2                    ; i*4
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %5 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %5
+  %add = fadd fast float %sum.042, %mul3          ; the running sum is the first link of the chain
+  %add638 = or i64 %mul, 1                        ; i*4+1
+  %arrayidx7 = getelementptr inbounds float* %A, i64 %add638
+  %6 = load float* %arrayidx7, align 4
+  %mul8 = fmul fast float %1, %6
+  %add9 = fadd fast float %add, %mul8
+  %add1239 = or i64 %mul, 2                       ; i*4+2
+  %arrayidx13 = getelementptr inbounds float* %A, i64 %add1239
+  %7 = load float* %arrayidx13, align 4
+  %mul14 = fmul fast float %2, %7
+  %add15 = fadd fast float %add9, %mul14
+  %add1840 = or i64 %mul, 3                       ; i*4+3
+  %arrayidx19 = getelementptr inbounds float* %A, i64 %add1840
+  %8 = load float* %arrayidx19, align 4
+  %mul20 = fmul fast float %3, %8
+  %add21 = fadd fast float %add15, %mul20         ; end of the chain; fed back through the phi
+  %inc = add nsw i64 %i.043, 1
+  %exitcond = icmp eq i64 %inc, %4
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add21 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] +
+;          B[1] *A[i*4+1] +
+;          B[2] *A[i*4+2] +
+;          B[3] *A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: store_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+; NOTE(review): the checks above use the default FileCheck prefix, but both opt invocations in this file pass --check-prefix (NOSTORE / STORE), so they look unmatched - confirm.
+define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
+entry:
+  %cmp37 = icmp sgt i32 %n, 0
+  br i1 %cmp37, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %0 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+  %1 = load float* %B, align 4                    ; B[0..3] are reloaded inside the loop here
+  %mul = shl nsw i64 %i.039, 2                    ; i*4
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %2 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %1, %2
+  %3 = load float* %arrayidx4, align 4
+  %add34 = or i64 %mul, 1                         ; i*4+1
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add34
+  %4 = load float* %arrayidx6, align 4
+  %mul7 = fmul fast float %3, %4
+  %add8 = fadd fast float %mul3, %mul7
+  %5 = load float* %arrayidx9, align 4
+  %add1135 = or i64 %mul, 2                       ; i*4+2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add1135
+  %6 = load float* %arrayidx12, align 4
+  %mul13 = fmul fast float %5, %6
+  %add14 = fadd fast float %add8, %mul13
+  %7 = load float* %arrayidx15, align 4
+  %add1736 = or i64 %mul, 3                       ; i*4+3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add1736
+  %8 = load float* %arrayidx18, align 4
+  %mul19 = fmul fast float %7, %8
+  %add20 = fadd fast float %add14, %mul19
+  store float %add20, float* %C.addr.038, align 4 ; the 4-term sum is stored rather than accumulated
+  %incdec.ptr = getelementptr inbounds float* %C.addr.038, i64 1
+  %inc = add nsw i64 %i.039, 1
+  %exitcond = icmp eq i64 %inc, %0
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 0
+}
+
+
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+;          int n) {
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
+;   }
+; }
+
+; STORE-LABEL: store_red_double
+; STORE: fmul <2 x double>
+; STORE: extractelement <2 x double>
+; STORE: extractelement <2 x double>
+; Checked with the STORE prefix, i.e. by the opt invocation that passes -slp-vectorize-hor-store.
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+entry:
+  %cmp17 = icmp sgt i32 %n, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load double* %B, align 8
+  %arrayidx4 = getelementptr inbounds double* %B, i64 1
+  %1 = load double* %arrayidx4, align 8
+  %2 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = shl nsw i64 %i.018, 2                    ; i*4
+  %arrayidx2 = getelementptr inbounds double* %A, i64 %mul
+  %3 = load double* %arrayidx2, align 8
+  %mul3 = fmul fast double %0, %3
+  %add16 = or i64 %mul, 1                         ; i*4+1
+  %arrayidx6 = getelementptr inbounds double* %A, i64 %add16
+  %4 = load double* %arrayidx6, align 8
+  %mul7 = fmul fast double %1, %4
+  %add8 = fadd fast double %mul3, %mul7           ; two-term sum that feeds the store below
+  %arrayidx9 = getelementptr inbounds double* %C, i64 %i.018
+  store double %add8, double* %arrayidx9, align 8
+  %inc = add nsw i64 %i.018, 1
+  %exitcond = icmp eq i64 %inc, %2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/implicitfloat.ll b/test/Transforms/SLPVectorizer/X86/implicitfloat.ll
new file mode 100644
index 0000000000000..f63f2683b10ef
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/implicitfloat.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Don't vectorize when noimplicitfloat is used.
+; CHECK-LABEL: @test1(
+; CHECK-NOT: store <2 x double>
+; CHECK: ret
+define void @test1(double* %a, double* %b, double* %c) noimplicitfloat { ; <------ noimplicitfloat attribute here!
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1                     ; a[0] * b[0]
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4                    ; a[1] * b[1]
+  store double %mul, double* %c, align 8          ; two adjacent scalar stores: c[0] ...
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8 ; ... and c[1]
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/in-tree-user.ll b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
new file mode 100644
index 0000000000000..3115232887bd3
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@.str = private unnamed_addr constant [6 x i8] c"bingo\00", align 1
+
+; We can't vectorize when the roots are used inside the tree.
+;CHECK-LABEL: @in_tree_user(
+;CHECK-NOT: load <2 x double>
+;CHECK: ret
+define void @in_tree_user(double* nocapture %A, i32 %n) {
+entry:
+  %conv = sitofp i32 %n to double
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %0 = shl nsw i64 %indvars.iv, 1                 ; 2*i
+  %arrayidx = getelementptr inbounds double* %A, i64 %0
+  %1 = load double* %arrayidx, align 8
+  %mul1 = fmul double %conv, %1
+  %mul2 = fmul double %mul1, 7.000000e+00
+  %add = fadd double %mul2, 5.000000e+00
+  %BadValue = fadd double %add, %add    ; <------------------ In tree user.
+  %2 = or i64 %0, 1                               ; 2*i+1
+  %arrayidx6 = getelementptr inbounds double* %A, i64 %2
+  %3 = load double* %arrayidx6, align 8
+  %mul8 = fmul double %conv, %3
+  %mul9 = fmul double %mul8, 4.000000e+00
+  %add10 = fadd double %mul9, 9.000000e+00
+  %cmp11 = fcmp ogt double %add, %add10           ; compares the results of the two parallel chains
+  br i1 %cmp11, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0))
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}
+
+declare i32 @printf(i8* nocapture, ...)
+
diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
new file mode 100644
index 0000000000000..43f7aed9f5195
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -0,0 +1,197 @@
+; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
+
+target triple = "x86_64-apple-macosx10.8.0"
+
+define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select(
+; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
+; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+  %c0 = extractelement <4 x i32> %c, i32 0        ; split all three inputs into scalars
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0                      ; per-lane scalar compare ...
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0     ; ... and per-lane scalar select
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 0  ; rebuild a <4 x float> from the four selects
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+; Insert in an order different from the vector indices to make sure it
+; doesn't matter
+define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_insert_out_of_order(
+; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
+; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 2  ; %s0 goes to index 2, not 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 0    ; %s2 goes to index 0, not 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+declare void @v4f32_user(<4 x float>) #0
+declare void @f32_user(float) #0
+
+; Multiple users of the final constructed vector
+define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_users(
+; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
+; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  call void @v4f32_user(<4 x float> %rd) #0       ; first user of %rd
+  ret <4 x float> %rd                             ; second user of %rd
+}
+
+; Unused insertelement: %rc restarts the chain from undef, so %ra/%rb are dead.
+define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_no_users(
+; CHECK-NOT: icmp ne <4 x i32>
+; CHECK-NOT: select <4 x i1>
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> undef, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+; Make sure we don't hit the infinite loop that I ran into when trying
+; to do this backwards.
+define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
+; CHECK-LABEL: @reconstruct(
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %ra = insertelement <4 x i32> undef, i32 %c0, i32 0
+  %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
+  %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
+  %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
+  ret <4 x i32> %rd
+}
+
+define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_v2(
+; CHECK: icmp ne <2 x i32>
+; CHECK: select <2 x i1>
+  %c0 = extractelement <2 x i32> %c, i32 0
+  %c1 = extractelement <2 x i32> %c, i32 1
+  %a0 = extractelement <2 x float> %a, i32 0
+  %a1 = extractelement <2 x float> %a, i32 1
+  %b0 = extractelement <2 x float> %b, i32 0
+  %b1 = extractelement <2 x float> %b, i32 1
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %ra = insertelement <2 x float> undef, float %s0, i32 0
+  %rb = insertelement <2 x float> %ra, float %s1, i32 1
+  ret <2 x float> %rb
+}
+
+; Make sure when we construct partial vectors, we don't keep
+; re-visiting the insertelement chains starting with undef
+; (low cost threshold needed to force this to happen)
+define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %1 = insertelement <2 x i32> undef, i32 %c0, i32 0
+  %2 = insertelement <2 x i32> %1, i32 %c1, i32 1
+  %3 = icmp ne <2 x i32> %2, zeroinitializer
+  %4 = insertelement <2 x float> undef, float %a0, i32 0
+  %5 = insertelement <2 x float> %4, float %a1, i32 1
+  %6 = insertelement <2 x float> undef, float %b0, i32 0
+  %7 = insertelement <2 x float> %6, float %b1, i32 1
+  %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
+  %9 = extractelement <2 x float> %8, i32 0
+  %ra = insertelement <4 x float> undef, float %9, i32 0
+  %10 = extractelement <2 x float> %8, i32 1
+  %rb = insertelement <4 x float> %ra, float %10, i32 1
+  ret <4 x float> %rb
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/lit.local.cfg b/test/Transforms/SLPVectorizer/X86/lit.local.cfg
index a8ad0f1a28b23..ba763cf03ffcc 100644
--- a/test/Transforms/SLPVectorizer/X86/lit.local.cfg
+++ b/test/Transforms/SLPVectorizer/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/SLPVectorizer/X86/long_chains.ll b/test/Transforms/SLPVectorizer/X86/long_chains.ll
new file mode 100644
index 0000000000000..5af3e6d6e9038
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/long_chains.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; At this point we can't vectorize only parts of the tree.
+
+; CHECK-LABEL: @test(
+; CHECK: insertelement <2 x i8>
+; CHECK: insertelement <2 x i8>
+; CHECK: sitofp <2 x i8>
+; CHECK: fmul <2 x double>
+; CHECK: ret
+define i32 @test(double* nocapture %A, i8* nocapture %B) {
+entry:
+  %0 = load i8* %B, align 1
+  %arrayidx1 = getelementptr inbounds i8* %B, i64 1
+  %1 = load i8* %arrayidx1, align 1
+  %add = add i8 %0, 3
+  %add4 = add i8 %1, 3
+  %conv6 = sitofp i8 %add to double
+  %conv7 = sitofp i8 %add4 to double 
+  %mul = fmul double %conv6, %conv6
+  %add8 = fadd double %mul, 1.000000e+00
+  %mul9 = fmul double %conv7, %conv7
+  %add10 = fadd double %mul9, 1.000000e+00
+  %mul11 = fmul double %add8, %add8
+  %add12 = fadd double %mul11, 1.000000e+00
+  %mul13 = fmul double %add10, %add10
+  %add14 = fadd double %mul13, 1.000000e+00
+  %mul15 = fmul double %add12, %add12
+  %add16 = fadd double %mul15, 1.000000e+00
+  %mul17 = fmul double %add14, %add14
+  %add18 = fadd double %mul17, 1.000000e+00
+  %mul19 = fmul double %add16, %add16
+  %add20 = fadd double %mul19, 1.000000e+00
+  %mul21 = fmul double %add18, %add18
+  %add22 = fadd double %mul21, 1.000000e+00
+  %mul23 = fmul double %add20, %add20
+  %add24 = fadd double %mul23, 1.000000e+00
+  %mul25 = fmul double %add22, %add22
+  %add26 = fadd double %mul25, 1.000000e+00
+  store double %add24, double* %A, align 8
+  %arrayidx28 = getelementptr inbounds double* %A, i64 1
+  store double %add26, double* %arrayidx28, align 8
+  ret i32 undef
+}
diff --git a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
index 4a37fce2ff247..aef2479dd524c 100644
--- a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-;CHECK: @foo
+;CHECK-LABEL: @foo(
 ;CHECK: load <4 x i32>
 ;CHECK: add <4 x i32>
 ;CHECK: store <4 x i32>
diff --git a/test/Transforms/SLPVectorizer/X86/multi_block.ll b/test/Transforms/SLPVectorizer/X86/multi_block.ll
new file mode 100644
index 0000000000000..2f1cc74d05ffd
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/multi_block.ll
@@ -0,0 +1,55 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; int bar(double *A, int d) {
+;   double A0 = A[0];
+;   double A1 = A[1];
+;   float F0 = A0;
+;   float F1 = A1;
+;   if (d) foo(); <----- This splits the blocks
+;   F0+=4.0;
+;   F1+=5.0;
+;   A[8] = 9.0 + F0;
+;   A[9] = 5.0 + F1;
+; }
+
+
+;CHECK-LABEL: @bar(
+;CHECK: load <2 x double>
+;CHECK: fptrunc <2 x double>
+;CHECK: call i32
+;CHECK: fadd <2 x float>
+;CHECK: fpext <2 x float>
+;CHECK: store <2 x double>
+;CHECK: ret
+define i32 @bar(double* nocapture %A, i32 %d) {
+  %1 = load double* %A, align 8
+  %2 = getelementptr inbounds double* %A, i64 1
+  %3 = load double* %2, align 8
+  %4 = fptrunc double %1 to float
+  %5 = fptrunc double %3 to float
+  %6 = icmp eq i32 %d, 0
+  br i1 %6, label %9, label %7
+
+; <label>:7                                       ; preds = %0
+  %8 = tail call i32 (...)* @foo()
+  br label %9
+
+; <label>:9                                       ; preds = %0, %7
+  %10 = fadd float %4, 4.000000e+00
+  %11 = fadd float %5, 5.000000e+00
+  %12 = fpext float %10 to double
+  %13 = fadd double %12, 9.000000e+00
+  %14 = getelementptr inbounds double* %A, i64 8
+  store double %13, double* %14, align 8
+  %15 = fpext float %11 to double
+  %16 = fadd double %15, 5.000000e+00
+  %17 = getelementptr inbounds double* %A, i64 9
+  store double %16, double* %17, align 8
+  ret i32 undef
+}
+
+declare i32 @foo(...)
+
diff --git a/test/Transforms/SLPVectorizer/X86/multi_user.ll b/test/Transforms/SLPVectorizer/X86/multi_user.ll
index aaa6063fdeda3..cab99945e29eb 100644
--- a/test/Transforms/SLPVectorizer/X86/multi_user.ll
+++ b/test/Transforms/SLPVectorizer/X86/multi_user.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.7.0"
 ;  A[4] += n * 5 + 11;
 ;}
 
-;CHECK: @foo
+;CHECK-LABEL: @foo(
 ;CHECK: insertelement <4 x i32>
 ;CHECK: load <4 x i32>
 ;CHECK: add <4 x i32>
diff --git a/test/Transforms/SLPVectorizer/X86/odd_store.ll b/test/Transforms/SLPVectorizer/X86/odd_store.ll
new file mode 100644
index 0000000000000..027f6016e2b9d
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/odd_store.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;int foo(char * restrict A, float * restrict B, float T) {
+;  A[0] = (T * B[10] + 4.0);
+;  A[1] = (T * B[11] + 5.0);
+;  A[2] = (T * B[12] + 6.0);
+;}
+
+;CHECK-LABEL: @foo(
+;CHECK-NOT: load <3 x float>
+;CHECK-NOT: fmul <3 x float>
+;CHECK-NOT: fpext <3 x float>
+;CHECK-NOT: fadd <3 x double>
+;CHECK-NOT: fptosi <3 x double>
+;CHECK-NOT: store <3 x i8>
+;CHECK: ret
+define i32 @foo(i8* noalias nocapture %A, float* noalias nocapture %B, float %T) {
+  %1 = getelementptr inbounds float* %B, i64 10
+  %2 = load float* %1, align 4
+  %3 = fmul float %2, %T
+  %4 = fpext float %3 to double
+  %5 = fadd double %4, 4.000000e+00
+  %6 = fptosi double %5 to i8
+  store i8 %6, i8* %A, align 1
+  %7 = getelementptr inbounds float* %B, i64 11
+  %8 = load float* %7, align 4
+  %9 = fmul float %8, %T
+  %10 = fpext float %9 to double
+  %11 = fadd double %10, 5.000000e+00
+  %12 = fptosi double %11 to i8
+  %13 = getelementptr inbounds i8* %A, i64 1
+  store i8 %12, i8* %13, align 1
+  %14 = getelementptr inbounds float* %B, i64 12
+  %15 = load float* %14, align 4
+  %16 = fmul float %15, %T
+  %17 = fpext float %16 to double
+  %18 = fadd double %17, 6.000000e+00
+  %19 = fptosi double %18 to i8
+  %20 = getelementptr inbounds i8* %A, i64 2
+  store i8 %19, i8* %20, align 1
+  ret i32 undef
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/operandorder.ll b/test/Transforms/SLPVectorizer/X86/operandorder.ll
new file mode 100644
index 0000000000000..c5322a839ed1e
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -0,0 +1,234 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -instcombine -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+
+
+
+; Make sure we order the operands of commutative operations so that we get
+; bigger vectorizable trees.
+
+; CHECK-LABEL: shuffle_operands1
+; CHECK:         load <2 x double>
+; CHECK:         fadd <2 x double>
+
+define void @shuffle_operands1(double * noalias %from, double * noalias %to,
+                               double %v1, double %v2) {
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_1, %v1
+  %v1_2 = fadd double %v2, %v0_2
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+  ret void
+}
+
+; CHECK-LABEL: shuffle_preserve_broadcast
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_1, %p
+  %v1_2 = fadd double %v0_1, %v0_2
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; CHECK-LABEL: shuffle_preserve_broadcast2
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast2(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %p, %v0_1
+  %v1_2 = fadd double %v0_2, %v0_1
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; CHECK-LABEL: shuffle_preserve_broadcast3
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast3(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %p, %v0_1
+  %v1_2 = fadd double %v0_1, %v0_2
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+
+; CHECK-LABEL: shuffle_preserve_broadcast4
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast4(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_2, %v0_1
+  %v1_2 = fadd double %p, %v0_1
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; CHECK-LABEL: shuffle_preserve_broadcast5
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast5(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_1, %v0_2
+  %v1_2 = fadd double %p, %v0_1
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+
+; CHECK-LABEL: shuffle_preserve_broadcast6
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast6(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_1, %v0_2
+  %v1_2 = fadd double %v0_1, %p
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; Make sure we don't scramble operands when we reorder them and destroy
+; 'good' source order.
+
+; CHECK-LABEL: good_load_order
+
+; CHECK: %[[V1:[0-9]+]] = load <4 x float>*
+; CHECK: %[[V2:[0-9]+]] = insertelement <4 x float> undef, float %1, i32 0
+; CHECK: %[[V3:[0-9]+]] = shufflevector <4 x float> %[[V2]], <4 x float> %[[V1]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+; CHECK:                = fmul <4 x float> %[[V1]], %[[V3]]
+
+@a = common global [32000 x float] zeroinitializer, align 16
+
+define void @good_load_order() {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %0 = load float* getelementptr inbounds ([32000 x float]* @a, i64 0, i64 0), align 16
+  br label %for.body3
+
+for.body3:
+  %1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  %2 = add nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %2
+  %3 = load float* %arrayidx, align 4
+  %arrayidx5 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %indvars.iv
+  %mul6 = fmul float %3, %1
+  store float %mul6, float* %arrayidx5, align 4
+  %4 = add nsw i64 %indvars.iv, 2
+  %arrayidx11 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %4
+  %5 = load float* %arrayidx11, align 4
+  %mul15 = fmul float %5, %3
+  store float %mul15, float* %arrayidx, align 4
+  %6 = add nsw i64 %indvars.iv, 3
+  %arrayidx21 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %6
+  %7 = load float* %arrayidx21, align 4
+  %mul25 = fmul float %7, %5
+  store float %mul25, float* %arrayidx11, align 4
+  %8 = add nsw i64 %indvars.iv, 4
+  %arrayidx31 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %8
+  %9 = load float* %arrayidx31, align 4
+  %mul35 = fmul float %9, %7
+  store float %mul35, float* %arrayidx21, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+  %arrayidx41 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %indvars.iv.next
+  %10 = load float* %arrayidx41, align 4
+  %mul45 = fmul float %10, %9
+  store float %mul45, float* %arrayidx31, align 4
+  %11 = trunc i64 %indvars.iv.next to i32
+  %cmp2 = icmp slt i32 %11, 31995
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/opt.ll b/test/Transforms/SLPVectorizer/X86/opt.ll
new file mode 100644
index 0000000000000..14137c11ee414
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/opt.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -O3 -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=SLP
+; RUN: opt < %s -O3 -disable-slp-vectorization -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSLP
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure we can disable slp vectorization in opt.
+
+; SLP-LABEL: test1
+; SLP: store <2 x double>
+
+; NOSLP-LABEL: test1
+; NOSLP-NOT: store <2 x double>
+
+
+define void @test1(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/ordering.ll b/test/Transforms/SLPVectorizer/X86/ordering.ll
new file mode 100644
index 0000000000000..d2ecd4546ddb7
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/ordering.ll
@@ -0,0 +1,81 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @updateModelQPFrame(i32 %m_Bits) {
+entry:
+  %0 = load double* undef, align 8
+  %mul = fmul double undef, %0
+  %mul2 = fmul double undef, %mul
+  %mul4 = fmul double %0, %mul2
+  %mul5 = fmul double undef, 4.000000e+00
+  %mul7 = fmul double undef, %mul5
+  %conv = sitofp i32 %m_Bits to double
+  %mul8 = fmul double %conv, %mul7
+  %add = fadd double %mul4, %mul8
+  %cmp11 = fcmp olt double %add, 0.000000e+00
+  ret void
+}
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i32 @personality_v0(...)
+
+define void @invoketest() {
+entry:
+  br i1 undef, label %cond.true, label %cond.false
+
+cond.true:
+  %call49 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %cond.true54 unwind label %lpad
+
+cond.false:
+  %call51 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %cond.false57 unwind label %lpad
+
+cond.true54:
+  %call56 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %cond.end60 unwind label %lpad
+
+cond.false57:
+  %call59 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %cond.end60 unwind label %lpad
+
+; Make sure we don't vectorize these phis - they have invokes as inputs.
+
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+; CHECK-LABEL: invoketest
+
+; CHECK-LABEL: cond.end60
+; CHECK-NOT: phi <2 x double>
+; CHECK: insertelement
+; CHECK-LABEL: if.then63
+
+cond.end60:
+  %cond126 = phi double [ %call49, %cond.true54 ], [ %call51, %cond.false57 ]
+  %cond61 = phi double [ %call56, %cond.true54 ], [ %call59, %cond.false57 ]
+  br i1 undef, label %if.end98, label %if.then63
+
+if.then63:
+  %conv69 = fptrunc double undef to float
+  %conv70 = fpext float %conv69 to double
+  %div71 = fdiv double %cond126, %conv70
+  %conv78 = fptrunc double undef to float
+  %conv79 = fpext float %conv78 to double
+  %div80 = fdiv double %cond61, %conv79
+  br label %if.end98
+
+lpad:
+  %l = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @personality_v0 to i8*)
+          cleanup
+  resume { i8*, i32 } %l
+
+if.end98:
+  %dimensionsResult.sroa.0.0 = phi double [ %div71, %if.then63 ], [ %cond126, %cond.end60 ]
+  %dimensionsResult.sroa.6.0 = phi double [ %div80, %if.then63 ], [ %cond61, %cond.end60 ]
+  br label %if.end99
+
+if.end99:
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/phi.ll b/test/Transforms/SLPVectorizer/X86/phi.ll
new file mode 100644
index 0000000000000..964e0e4efee7e
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -0,0 +1,248 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+;int foo(double *A, int k) {
+;  double A0;
+;  double A1;
+;  if (k) {
+;    A0 = 3;
+;    A1 = 5;
+;  } else {
+;    A0 = A[10];
+;    A1 = A[11];
+;  }
+;  A[0] = A0;
+;  A[1] = A1;
+;}
+
+
+;CHECK-LABEL: @foo(
+;CHECK: load <2 x double>
+;CHECK: phi <2 x double>
+;CHECK: store <2 x double>
+;CHECK: ret i32 undef
+define i32 @foo(double* nocapture %A, i32 %k) {
+entry:
+  %tobool = icmp eq i32 %k, 0
+  br i1 %tobool, label %if.else, label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds double* %A, i64 10
+  %0 = load double* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds double* %A, i64 11
+  %1 = load double* %arrayidx1, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.else
+  %A0.0 = phi double [ %0, %if.else ], [ 3.000000e+00, %entry ]
+  %A1.0 = phi double [ %1, %if.else ], [ 5.000000e+00, %entry ]
+  store double %A0.0, double* %A, align 8
+  %arrayidx3 = getelementptr inbounds double* %A, i64 1
+  store double %A1.0, double* %arrayidx3, align 8
+  ret i32 undef
+}
+
+
+;int foo(double * restrict B,  double * restrict A, int n, int m) {
+;  double R=A[1];
+;  double G=A[0];
+;  for (int i=0; i < 100; i++) {
+;    R += 10;
+;    G += 10;
+;    R *= 4;
+;    G *= 4;
+;    R += 4;
+;    G += 4;
+;  }
+;  B[0] = G;
+;  B[1] = R;
+;  return 0;
+;}
+
+;CHECK-LABEL: @foo2(
+;CHECK: load <2 x double>
+;CHECK: phi <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: store <2 x double>
+;CHECK: ret
+define i32 @foo2(double* noalias nocapture %B, double* noalias nocapture %A, i32 %n, i32 %m) #0 {
+entry:
+  %arrayidx = getelementptr inbounds double* %A, i64 1
+  %0 = load double* %arrayidx, align 8
+  %1 = load double* %A, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.019 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %G.018 = phi double [ %1, %entry ], [ %add5, %for.body ]
+  %R.017 = phi double [ %0, %entry ], [ %add4, %for.body ]
+  %add = fadd double %R.017, 1.000000e+01
+  %add2 = fadd double %G.018, 1.000000e+01
+  %mul = fmul double %add, 4.000000e+00
+  %mul3 = fmul double %add2, 4.000000e+00
+  %add4 = fadd double %mul, 4.000000e+00
+  %add5 = fadd double %mul3, 4.000000e+00
+  %inc = add nsw i32 %i.019, 1
+  %exitcond = icmp eq i32 %inc, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  store double %add5, double* %B, align 8
+  %arrayidx7 = getelementptr inbounds double* %B, i64 1
+  store double %add4, double* %arrayidx7, align 8
+  ret i32 0
+}
+
+; float foo3(float *A) {
+;
+;   float R = A[0];
+;   float G = A[1];
+;   float B = A[2];
+;   float Y = A[3];
+;   float P = A[4];
+;   for (int i=0; i < 121; i+=3) {
+;     R+=A[i+0]*7;
+;     G+=A[i+1]*8;
+;     B+=A[i+2]*9;
+;     Y+=A[i+3]*10;
+;     P+=A[i+4]*11;
+;   }
+;
+;   return R+G+B+Y+P;
+; }
+
+;CHECK-LABEL: @foo3(
+;CHECK: phi <4 x float>
+;CHECK: fmul <4 x float>
+;CHECK: fadd <4 x float>
+;CHECK-NOT: phi <5 x float>
+;CHECK-NOT: fmul <5 x float>
+;CHECK-NOT: fadd <5 x float>
+
+define float @foo3(float* nocapture readonly %A) #0 {
+entry:
+  %0 = load float* %A, align 4
+  %arrayidx1 = getelementptr inbounds float* %A, i64 1
+  %1 = load float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float* %A, i64 2
+  %2 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %A, i64 3
+  %3 = load float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float* %A, i64 4
+  %4 = load float* %arrayidx4, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %P.056 = phi float [ %4, %entry ], [ %add26, %for.body ]
+  %Y.055 = phi float [ %3, %entry ], [ %add21, %for.body ]
+  %B.054 = phi float [ %2, %entry ], [ %add16, %for.body ]
+  %G.053 = phi float [ %1, %entry ], [ %add11, %for.body ]
+  %R.052 = phi float [ %0, %entry ], [ %add6, %for.body ]
+  %5 = phi float [ %1, %entry ], [ %11, %for.body ]
+  %6 = phi float [ %0, %entry ], [ %9, %for.body ]
+  %mul = fmul float %6, 7.000000e+00
+  %add6 = fadd float %R.052, %mul
+  %mul10 = fmul float %5, 8.000000e+00
+  %add11 = fadd float %G.053, %mul10
+  %7 = add nsw i64 %indvars.iv, 2
+  %arrayidx14 = getelementptr inbounds float* %A, i64 %7
+  %8 = load float* %arrayidx14, align 4
+  %mul15 = fmul float %8, 9.000000e+00
+  %add16 = fadd float %B.054, %mul15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx19 = getelementptr inbounds float* %A, i64 %indvars.iv.next
+  %9 = load float* %arrayidx19, align 4
+  %mul20 = fmul float %9, 1.000000e+01
+  %add21 = fadd float %Y.055, %mul20
+  %10 = add nsw i64 %indvars.iv, 4
+  %arrayidx24 = getelementptr inbounds float* %A, i64 %10
+  %11 = load float* %arrayidx24, align 4
+  %mul25 = fmul float %11, 1.100000e+01
+  %add26 = fadd float %P.056, %mul25
+  %12 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %12, 121
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add28 = fadd float %add6, %add11
+  %add29 = fadd float %add28, %add16
+  %add30 = fadd float %add29, %add21
+  %add31 = fadd float %add30, %add26
+  ret float %add31
+}
+
+; Make sure the order of phi nodes of different types does not prevent
+; vectorization of same typed phi nodes.
+; CHECK-LABEL: sort_phi_type
+; CHECK: phi <4 x float>
+; CHECK: fmul <4 x float>
+
+define float @sort_phi_type(float* nocapture readonly %A) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %Y = phi float [ 1.000000e+01, %entry ], [ %mul10, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %B = phi float [ 1.000000e+01, %entry ], [ %mul15, %for.body ]
+  %G = phi float [ 1.000000e+01, %entry ], [ %mul20, %for.body ]
+  %R = phi float [ 1.000000e+01, %entry ], [ %mul25, %for.body ]
+  %mul10 = fmul float %Y, 8.000000e+00
+  %mul15 = fmul float %B, 9.000000e+00
+  %mul20 = fmul float %R, 10.000000e+01
+  %mul25 = fmul float %G, 11.100000e+01
+  %indvars.iv.next = add nsw i64 %indvars.iv, 4
+  %cmp = icmp slt i64 %indvars.iv.next, 128
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add28 = fadd float 1.000000e+01, %mul10
+  %add29 = fadd float %mul10, %mul15
+  %add30 = fadd float %add29, %mul20
+  %add31 = fadd float %add30, %mul25
+  ret float %add31
+}
+
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+; CHECK-LABEL: @test(
+;
+; Test that we correctly recognize the discontiguous memory in arrays where the
+; size is less than the alignment, and through various different GEP formations.
+;
+; We disable the vectorization of x86_fp80 for now.
+
+entry:
+  %i1.0 = load x86_fp80* %i1, align 16
+  %i1.gep1 = getelementptr x86_fp80* %i1, i64 1
+  %i1.1 = load x86_fp80* %i1.gep1, align 16
+; CHECK: load x86_fp80*
+; CHECK: load x86_fp80*
+; CHECK-NOT: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
+  br i1 undef, label %then, label %end
+
+then:
+  %i2.gep0 = getelementptr inbounds x86_fp80* %i2, i64 0
+  %i2.0 = load x86_fp80* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds x86_fp80* %i2, i64 1
+  %i2.1 = load x86_fp80* %i2.gep1, align 16
+; CHECK: load x86_fp80*
+; CHECK: load x86_fp80*
+; CHECK-NOT: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
+  br label %end
+
+end:
+  %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+; CHECK-NOT: phi <2 x x86_fp80>
+; CHECK-NOT: extractelement <2 x x86_fp80>
+; CHECK-NOT: extractelement <2 x x86_fp80>
+  store x86_fp80 %phi0, x86_fp80* %o, align 16
+  %o.gep1 = getelementptr inbounds x86_fp80* %o, i64 1
+  store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/phi3.ll b/test/Transforms/SLPVectorizer/X86/phi3.ll
new file mode 100644
index 0000000000000..fd8d361372014
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/phi3.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.GPar.0.16.26 = type { [0 x double], double }
+
+@d = external global double, align 8
+
+declare %struct.GPar.0.16.26* @Rf_gpptr(...)
+
+define void @Rf_GReset() {
+entry:                                            ; no FileCheck on this file's RUN line: the test only requires that opt completes
+  %sub = fsub double -0.000000e+00, undef
+  %0 = load double* @d, align 8
+  %sub1 = fsub double -0.000000e+00, %0
+  br i1 icmp eq (%struct.GPar.0.16.26* (...)* inttoptr (i64 115 to %struct.GPar.0.16.26* (...)*), %struct.GPar.0.16.26* (...)* @Rf_gpptr), label %if.then, label %if.end7
+
+if.then:                                          ; preds = %entry
+  %sub2 = fsub double %sub, undef                 ; chain 1: %sub  -> fsub -> fdiv
+  %div.i = fdiv double %sub2, undef
+  %sub4 = fsub double %sub1, undef                ; chain 2: %sub1 -> fsub -> fdiv (same shape as chain 1)
+  %div.i16 = fdiv double %sub4, undef
+  %cmp = fcmp ogt double %div.i, %div.i16         ; compares the results of the two chains
+  br i1 %cmp, label %if.then6, label %if.end7
+
+if.then6:                                         ; preds = %if.then
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.then6, %if.then, %entry
+  %g.0 = phi double [ 0.000000e+00, %if.then6 ], [ %sub, %if.then ], [ %sub, %entry ] ; %g.0 has no users in this function
+  ret void
+}
+
+
diff --git a/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll b/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
new file mode 100644
index 0000000000000..6d2d5e3540c7d
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-apple-macosx10.9.0 -disable-output
+
+target datalayout = "f64:64:64-v64:64:64"
+
+define void @test_phi_in_landingpad() {
+entry:                                            ; RUN line passes -disable-output: the test only requires that opt completes
+  invoke void @foo()
+          to label %inner unwind label %lpad
+
+inner:
+  %x0 = fsub double undef, undef                  ; %x0 and %y0 are two identical fsubs
+  %y0 = fsub double undef, undef
+  invoke void @foo()
+          to label %done unwind label %lpad
+
+lpad:                                             ; unwind destination of both invokes; the phis precede the landingpad
+  %x1 = phi double [ undef, %entry ], [ undef, %inner ]
+  %y1 = phi double [ undef, %entry ], [ undef, %inner ]
+  landingpad { i8*, i32 } personality i8*
+          bitcast (i32 (...)* @__gxx_personality_v0 to i8*) catch i8* null
+  br label %done
+
+done:                                             ; two unnamed phis merge (%x0, %x1) and (%y0, %y1); their results are unused
+  phi double [ %x0, %inner ], [ %x1, %lpad ]
+  phi double [ %y0, %inner ], [ %y1, %lpad ]
+  ret void
+}
+
+declare void @foo()
+
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
new file mode 100644
index 0000000000000..520e6729de0c6
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+; We purposely over-align f64 to 128 bits here (see f64:128:128 below).
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:128:128-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+
+define void @test(double* %i1, double* %i2, double* %o) {
+; CHECK-LABEL: @test(
+;
+; Test that we correctly recognize the discontiguous memory in arrays where the
+; size is less than the alignment, and through various different GEP formations.
+
+entry:                                            ; element 0 is loaded straight from %i1, element 1 via a plain (non-inbounds) GEP
+  %i1.0 = load double* %i1, align 16
+  %i1.gep1 = getelementptr double* %i1, i64 1
+  %i1.1 = load double* %i1.gep1, align 16
+; CHECK: load double*
+; CHECK: load double*
+; CHECK: insertelement <2 x double>
+; CHECK: insertelement <2 x double>
+  br i1 undef, label %then, label %end
+
+then:                                             ; same pair of loads, this time via inbounds GEPs with explicit indices 0 and 1
+  %i2.gep0 = getelementptr inbounds double* %i2, i64 0
+  %i2.0 = load double* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds double* %i2, i64 1
+  %i2.1 = load double* %i2.gep1, align 16
+; CHECK: load double*
+; CHECK: load double*
+; CHECK: insertelement <2 x double>
+; CHECK: insertelement <2 x double>
+  br label %end
+
+end:                                              ; the two scalar phis merge the entry/then loads element by element
+  %phi0 = phi double [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi double [ %i1.1, %entry ], [ %i2.1, %then ]
+; CHECK: phi <2 x double>
+; CHECK: extractelement <2 x double>
+; CHECK: extractelement <2 x double>
+  store double %phi0, double* %o, align 16
+  %o.gep1 = getelementptr inbounds double* %o, i64 1
+  store double %phi1, double* %o.gep1, align 16
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/pr16571.ll b/test/Transforms/SLPVectorizer/X86/pr16571.ll
new file mode 100644
index 0000000000000..13d82149c0c54
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr16571.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=i686-pc-win32 -mcpu=corei7-avx
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
+target triple = "i686-pc-win32"
+
+define hidden fastcc void @"System.PrimitiveTypesParser.TryParseIEEE754<char>(char*,uint,double&)"() unnamed_addr {
+"@0":                                             ; entry block; every conditional branch in this function tests undef
+  br i1 undef, label %"@38.lr.ph", label %"@37"
+
+"@37":                                            ; preds = %"@38.lr.ph", %"@44", %"@0"
+  ret void
+
+"@44":                                            ; preds = %"@38.lr.ph"
+  %0 = add i64 undef, undef                       ; back-edge value of the i64 phi in "@38.lr.ph"
+  %1 = add i32 %mainPartDigits.loc.0.ph45, 1      ; back-edge value of the i32 phi (a different width from %0)
+  br i1 undef, label %"@38.lr.ph", label %"@37"
+
+"@38.lr.ph":                                      ; preds = %"@44", %"@0"
+  %mainDoublePart.loc.0.ph46 = phi i64 [ %0, %"@44" ], [ 0, %"@0" ]
+  %mainPartDigits.loc.0.ph45 = phi i32 [ %1, %"@44" ], [ 0, %"@0" ]
+  br i1 undef, label %"@44", label %"@37"
+}
diff --git a/test/Transforms/SLPVectorizer/X86/pr16628.ll b/test/Transforms/SLPVectorizer/X86/pr16628.ll
new file mode 100644
index 0000000000000..3f9d775eeeb6c
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr16628.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@c = common global i32 0, align 4
+@a = common global i16 0, align 2
+@b = common global i16 0, align 2
+
+; Function Attrs: nounwind ssp uwtable (stale: the definition below carries no attributes)
+define void @f() {
+entry:
+  %call = tail call i32 (...)* @g()
+  %0 = load i32* @c, align 4                      ; i32 operand
+  %lnot = icmp eq i32 %0, 0
+  %lnot.ext = zext i1 %lnot to i32
+  %1 = load i16* @a, align 2                      ; i16 operand: same icmp-eq-0 / zext shape on a narrower type
+  %lnot2 = icmp eq i16 %1, 0
+  %lnot.ext3 = zext i1 %lnot2 to i32
+  %or = or i32 %lnot.ext3, %lnot.ext              ; combines the two zero-extended comparison results
+  %cmp = icmp eq i32 %call, %or
+  %conv4 = zext i1 %cmp to i16
+  store i16 %conv4, i16* @b, align 2
+  ret void
+}
+
+declare i32 @g(...)
diff --git a/test/Transforms/SLPVectorizer/X86/pr16899.ll b/test/Transforms/SLPVectorizer/X86/pr16899.ll
new file mode 100644
index 0000000000000..8631bc9125df2
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr16899.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s  -slp-vectorizer -S -mtriple=i386--netbsd -mcpu=i486
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+@a = common global i32* null, align 4
+
+; Function Attrs: noreturn nounwind readonly
+define i32 @fn1() #0 {
+entry:
+  %0 = load i32** @a, align 4, !tbaa !4
+  %1 = load i32* %0, align 4, !tbaa !5            ; element 0 of the array that @a points to
+  %arrayidx1 = getelementptr inbounds i32* %0, i32 1
+  %2 = load i32* %arrayidx1, align 4, !tbaa !5    ; element 1, adjacent to %1
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %c.0 = phi i32 [ %2, %entry ], [ %add2, %do.body ]
+  %b.0 = phi i32 [ %1, %entry ], [ %add, %do.body ]
+  %add = add nsw i32 %b.0, %c.0
+  %add2 = add nsw i32 %add, 1                     ; depends on %add, so the two phi back-edge values are not independent
+  br label %do.body                               ; unconditional self-loop: the function contains no ret
+}
+
+attributes #0 = { noreturn nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"int", metadata !1}
+!4 = metadata !{metadata !0, metadata !0, i64 0}
+!5 = metadata !{metadata !3, metadata !3, i64 0}
diff --git a/test/Transforms/SLPVectorizer/X86/pr18060.ll b/test/Transforms/SLPVectorizer/X86/pr18060.ll
new file mode 100644
index 0000000000000..e6813f3b315d5
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr18060.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=i386-pc-linux
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-pc-linux"
+
+; Function Attrs: nounwind (stale: the definition below carries no attributes)
+define i32 @_Z16adjustFixupValueyj(i64 %Value, i32 %Kind) {
+entry:
+  %extract.t = trunc i64 %Value to i32            ; low 32 bits of %Value
+  %extract = lshr i64 %Value, 12
+  %extract.t6 = trunc i64 %extract to i32         ; %Value >> 12, truncated to i32
+  switch i32 %Kind, label %sw.default [
+    i32 0, label %return
+    i32 1, label %return
+    i32 129, label %sw.bb1
+    i32 130, label %sw.bb2
+  ]
+
+sw.default:                                       ; preds = %entry
+  call void @_Z25llvm_unreachable_internalv()
+  unreachable
+
+sw.bb1:                                           ; preds = %entry
+  %shr = lshr i64 %Value, 16
+  %extract.t5 = trunc i64 %shr to i32             ; %Value >> 16, truncated to i32
+  %extract7 = lshr i64 %Value, 28
+  %extract.t8 = trunc i64 %extract7 to i32        ; %Value >> 28, truncated to i32
+  br label %sw.bb2
+
+sw.bb2:                                           ; preds = %sw.bb1, %entry
+  %Value.addr.0.off0 = phi i32 [ %extract.t, %entry ], [ %extract.t5, %sw.bb1 ]
+  %Value.addr.0.off12 = phi i32 [ %extract.t6, %entry ], [ %extract.t8, %sw.bb1 ]
+  %conv6 = and i32 %Value.addr.0.off0, 4095       ; keep the low 12 bits
+  %conv4 = shl i32 %Value.addr.0.off12, 16
+  %shl = and i32 %conv4, 983040                   ; 983040 == 0xF0000
+  %or = or i32 %shl, %conv6
+  %or11 = or i32 %or, 8388608                     ; 8388608 == 0x800000
+  br label %return
+
+return:                                           ; preds = %sw.bb2, %entry, %entry
+  %retval.0 = phi i32 [ %or11, %sw.bb2 ], [ %extract.t, %entry ], [ %extract.t, %entry ] ; %entry appears twice: switch cases 0 and 1 both branch here
+  ret i32 %retval.0
+}
+
+; Function Attrs: noreturn
+declare void @_Z25llvm_unreachable_internalv()
+
diff --git a/test/Transforms/SLPVectorizer/X86/reduction2.ll b/test/Transforms/SLPVectorizer/X86/reduction2.ll
index 7aa7d7e243d08..f21e86c5646c3 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction2.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction2.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.8.0"
 
-;CHECK: @foo
+;CHECK-LABEL: @foo(
 ;CHECK: load <2 x double>
 ;CHECK: ret
 define double @foo(double* nocapture %D) {
@@ -16,11 +16,13 @@ define double @foo(double* nocapture %D) {
   %3 = getelementptr inbounds double* %D, i32 %2
   %4 = load double* %3, align 4
   %A4 = fmul double %4, %4
+  %A42 = fmul double %A4, %A4
   %5 = or i32 %2, 1
   %6 = getelementptr inbounds double* %D, i32 %5
   %7 = load double* %6, align 4
   %A7 = fmul double %7, %7
-  %8 = fadd double %A4, %A7
+  %A72 = fmul double %A7, %A7
+  %8 = fadd double %A42, %A72
   %9 = fadd double %sum.01, %8
   %10 = add nsw i32 %i.02, 1
   %exitcond = icmp eq i32 %10, 100
diff --git a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
new file mode 100644
index 0000000000000..6aea5d3c6f6bd
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -0,0 +1,76 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+; We disable the vectorization of <3 x float> for now
+
+; float foo(float *A) {
+;
+;   float R = A[0];
+;   float G = A[1];
+;   float B = A[2];
+;   for (int i=0; i < 121; i+=3) {
+;     R+=A[i+0]*7;
+;     G+=A[i+1]*8;
+;     B+=A[i+2]*9;
+;   }
+;
+;   return R+G+B;
+; }
+
+;CHECK-LABEL: @foo(
+;CHECK: br
+;CHECK-NOT: phi <3 x float>
+;CHECK-NOT: fmul <3 x float>
+;CHECK-NOT: fadd <3 x float>
+; At the moment we don't sink extractelements.
+;CHECK: br
+;CHECK-NOT: extractelement
+;CHECK-NOT: extractelement
+;CHECK-NOT: extractelement
+;CHECK: ret
+
+define float @foo(float* nocapture readonly %A) {
+entry:
+  %0 = load float* %A, align 4                    ; R = A[0]
+  %arrayidx1 = getelementptr inbounds float* %A, i64 1
+  %1 = load float* %arrayidx1, align 4            ; G = A[1]
+  %arrayidx2 = getelementptr inbounds float* %A, i64 2
+  %2 = load float* %arrayidx2, align 4            ; B = A[2]
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.for.body_crit_edge, %entry
+  %3 = phi float [ %0, %entry ], [ %.pre, %for.body.for.body_crit_edge ] ; A[i]: A[0] on entry, otherwise preloaded on the back edge
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body.for.body_crit_edge ]
+  %B.032 = phi float [ %2, %entry ], [ %add14, %for.body.for.body_crit_edge ]
+  %G.031 = phi float [ %1, %entry ], [ %add9, %for.body.for.body_crit_edge ]
+  %R.030 = phi float [ %0, %entry ], [ %add4, %for.body.for.body_crit_edge ]
+  %mul = fmul float %3, 7.000000e+00
+  %add4 = fadd float %R.030, %mul                 ; R += A[i+0]*7
+  %4 = add nsw i64 %indvars.iv, 1
+  %arrayidx7 = getelementptr inbounds float* %A, i64 %4
+  %5 = load float* %arrayidx7, align 4
+  %mul8 = fmul float %5, 8.000000e+00
+  %add9 = fadd float %G.031, %mul8                ; G += A[i+1]*8
+  %6 = add nsw i64 %indvars.iv, 2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %6
+  %7 = load float* %arrayidx12, align 4
+  %mul13 = fmul float %7, 9.000000e+00
+  %add14 = fadd float %B.032, %mul13              ; B += A[i+2]*9
+  %indvars.iv.next = add i64 %indvars.iv, 3       ; i += 3
+  %8 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %8, 121
+  br i1 %cmp, label %for.body.for.body_crit_edge, label %for.end
+
+for.body.for.body_crit_edge:                      ; preds = %for.body
+  %arrayidx3.phi.trans.insert = getelementptr inbounds float* %A, i64 %indvars.iv.next
+  %.pre = load float* %arrayidx3.phi.trans.insert, align 4 ; loads A[i] for the next iteration
+  br label %for.body
+
+for.end:                                          ; preds = %for.body
+  %add16 = fadd float %add4, %add9
+  %add17 = fadd float %add16, %add14              ; R + G + B
+  ret float %add17
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll
index b520913a398d1..46263416a90b1 100644
--- a/test/Transforms/SLPVectorizer/X86/saxpy.ll
+++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll
@@ -43,3 +43,19 @@ define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a,
   ret void
 }
 
+; Make sure we don't crash on this one.
+define void @SAXPY_crash(i32* noalias nocapture %x, i32* noalias nocapture %y, i64 %i) {
+  %1 = add i64 %i, 1                              ; index i+1
+  %2 = getelementptr inbounds i32* %x, i64 %1
+  %3 = getelementptr inbounds i32* %y, i64 %1
+  %4 = load i32* %3, align 4
+  %5 = add nsw i32 undef, %4                      ; first operand is undef
+  store i32 %5, i32* %2, align 4                  ; x[i+1] = undef + y[i+1]
+  %6 = add i64 %i, 2                              ; index i+2, adjacent to i+1
+  %7 = getelementptr inbounds i32* %x, i64 %6
+  %8 = getelementptr inbounds i32* %y, i64 %6
+  %9 = load i32* %8, align 4
+  %10 = add nsw i32 undef, %9                     ; first operand is undef
+  store i32 %10, i32* %7, align 4                 ; x[i+2] = undef + y[i+2]
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/simplebb.ll b/test/Transforms/SLPVectorizer/X86/simplebb.ll
index cd0b99e646773..7d682e5e46763 100644
--- a/test/Transforms/SLPVectorizer/X86/simplebb.ll
+++ b/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -23,3 +23,67 @@ entry:
   ret void
 }
 
+; Simple 3-pair chain with loads and stores, obfuscated with bitcasts
+; CHECK-LABEL: @test2(
+; CHECK: store <2 x double>
+; CHECK: ret
+define void @test2(double* %a, double* %b, i8* %e) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1                     ; lane 0: a[0] * b[0]
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4                    ; lane 1: a[1] * b[1]
+  %c = bitcast i8* %e to double*                  ; destination is passed as i8* and cast back to double*
+  store double %mul, double* %c, align 8
+  %carrayidx5 = getelementptr inbounds i8* %e, i64 8 ; byte offset 8 = the double slot right after %c
+  %arrayidx5 = bitcast i8* %carrayidx5 to double*
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+; Don't vectorize volatile loads.
+; CHECK-LABEL: @test_volatile_load(
+; CHECK-NOT: load <2 x double>
+; CHECK: store <2 x double>
+; CHECK: ret
+define void @test_volatile_load(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load volatile double* %a, align 8         ; volatile: must remain a scalar load
+  %i1 = load volatile double* %b, align 8         ; volatile: must remain a scalar load
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8          ; the non-volatile stores are still expected to be vectorized
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+; Don't vectorize volatile stores.
+; CHECK-LABEL: @test_volatile_store(
+; CHECK-NOT: store <2 x double>
+; CHECK: ret
+define void @test_volatile_store(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store volatile double %mul, double* %c, align 8 ; volatile: must remain a scalar store
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store volatile double %mul5, double* %arrayidx5, align 8 ; volatile: must remain a scalar store
+  ret void
+}
+
+
diff --git a/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
new file mode 100644
index 0000000000000..2747a1f489977
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -0,0 +1,140 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+
+; CHECK-LABEL: @tiny_tree_fully_vectorizable(
+; CHECK: load <2 x double>
+; CHECK: store <2 x double>
+; CHECK: ret
+
+define void @tiny_tree_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
+entry:
+  %cmp12 = icmp eq i64 %count, 0
+  br i1 %cmp12, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
+  %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load double* %src.addr.013, align 8        ; src[0]
+  store double %0, double* %dst.addr.014, align 8 ; dst[0] = src[0]
+  %arrayidx2 = getelementptr inbounds double* %src.addr.013, i64 1
+  %1 = load double* %arrayidx2, align 8           ; src[1], adjacent to src[0]
+  %arrayidx3 = getelementptr inbounds double* %dst.addr.014, i64 1
+  store double %1, double* %arrayidx3, align 8    ; dst[1] = src[1]
+  %add.ptr = getelementptr inbounds double* %src.addr.013, i64 %i.015
+  %add.ptr4 = getelementptr inbounds double* %dst.addr.014, i64 %i.015
+  %inc = add i64 %i.015, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; CHECK-LABEL: @tiny_tree_fully_vectorizable2(
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+; CHECK: ret
+
+define void @tiny_tree_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
+entry:
+  %cmp20 = icmp eq i64 %count, 0
+  br i1 %cmp20, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
+  %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load float* %src.addr.021, align 4         ; src[0]
+  store float %0, float* %dst.addr.022, align 4   ; dst[0] = src[0]
+  %arrayidx2 = getelementptr inbounds float* %src.addr.021, i64 1
+  %1 = load float* %arrayidx2, align 4            ; src[1]
+  %arrayidx3 = getelementptr inbounds float* %dst.addr.022, i64 1
+  store float %1, float* %arrayidx3, align 4      ; dst[1] = src[1]
+  %arrayidx4 = getelementptr inbounds float* %src.addr.021, i64 2
+  %2 = load float* %arrayidx4, align 4            ; src[2]
+  %arrayidx5 = getelementptr inbounds float* %dst.addr.022, i64 2
+  store float %2, float* %arrayidx5, align 4      ; dst[2] = src[2]
+  %arrayidx6 = getelementptr inbounds float* %src.addr.021, i64 3
+  %3 = load float* %arrayidx6, align 4            ; src[3]
+  %arrayidx7 = getelementptr inbounds float* %dst.addr.022, i64 3
+  store float %3, float* %arrayidx7, align 4      ; dst[3] = src[3]
+  %add.ptr = getelementptr inbounds float* %src.addr.021, i64 %i.023
+  %add.ptr8 = getelementptr inbounds float* %dst.addr.022, i64 %i.023
+  %inc = add i64 %i.023, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; We do not vectorize the tiny tree which is not fully vectorizable.
+; CHECK-LABEL: @tiny_tree_not_fully_vectorizable(
+; CHECK-NOT: <2 x double>
+; CHECK: ret
+
+define void @tiny_tree_not_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
+entry:
+  %cmp12 = icmp eq i64 %count, 0
+  br i1 %cmp12, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
+  %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load double* %src.addr.013, align 8        ; src[0]
+  store double %0, double* %dst.addr.014, align 8 ; dst[0] = src[0]
+  %arrayidx2 = getelementptr inbounds double* %src.addr.013, i64 2
+  %1 = load double* %arrayidx2, align 8           ; src[2]: not adjacent to src[0]
+  %arrayidx3 = getelementptr inbounds double* %dst.addr.014, i64 1
+  store double %1, double* %arrayidx3, align 8    ; dst[1] = src[2]
+  %add.ptr = getelementptr inbounds double* %src.addr.013, i64 %i.015
+  %add.ptr4 = getelementptr inbounds double* %dst.addr.014, i64 %i.015
+  %inc = add i64 %i.015, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+; CHECK-LABEL: @tiny_tree_not_fully_vectorizable2(
+; CHECK-NOT: <4 x float>
+; CHECK: ret
+
+define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
+entry:
+  %cmp20 = icmp eq i64 %count, 0
+  br i1 %cmp20, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
+  %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load float* %src.addr.021, align 4         ; src[0]
+  store float %0, float* %dst.addr.022, align 4   ; dst[0] = src[0]
+  %arrayidx2 = getelementptr inbounds float* %src.addr.021, i64 4
+  %1 = load float* %arrayidx2, align 4            ; src[4]: breaks the consecutive 0..3 run of loads
+  %arrayidx3 = getelementptr inbounds float* %dst.addr.022, i64 1
+  store float %1, float* %arrayidx3, align 4      ; dst[1] = src[4]
+  %arrayidx4 = getelementptr inbounds float* %src.addr.021, i64 2
+  %2 = load float* %arrayidx4, align 4            ; src[2]
+  %arrayidx5 = getelementptr inbounds float* %dst.addr.022, i64 2
+  store float %2, float* %arrayidx5, align 4      ; dst[2] = src[2]
+  %arrayidx6 = getelementptr inbounds float* %src.addr.021, i64 3
+  %3 = load float* %arrayidx6, align 4            ; src[3]
+  %arrayidx7 = getelementptr inbounds float* %dst.addr.022, i64 3
+  store float %3, float* %arrayidx7, align 4      ; dst[3] = src[3]
+  %add.ptr = getelementptr inbounds float* %src.addr.021, i64 %i.023
+  %add.ptr8 = getelementptr inbounds float* %dst.addr.022, i64 %i.023
+  %inc = add i64 %i.023, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}