Diffstat (limited to 'test/Transforms/LoopVectorize/AArch64')
8 files changed, 373 insertions, 437 deletions
diff --git a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 21b59f87d042..37a6d4e79984 100644
--- a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,40 +1,55 @@
-; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
-; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
-; CHECK-LABEL: predicated_udiv_scalarized_operand
-;
 ; This test checks that we correctly compute the scalarized operands for a
 ; user-specified vectorization factor when interleaving is disabled. We use the
-; "optsize" attribute to disable all interleaving calculations.
+; "optsize" attribute to disable all interleaving calculations. A cost of 4
+; for %tmp4 indicates that we would scalarize its operand (%tmp3), giving
+; %tmp4 a lower scalarization overhead.
 ;
-; CHECK: vector.body:
-; CHECK: %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
-; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
-; CHECK: [[IF0]]:
-; CHECK: %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK: %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK: %[[T02:.+]] = add nsw i64 %[[T01]], %x
-; CHECK: %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
-; CHECK: %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
-; CHECK: br label %[[CONT0]]
-; CHECK: [[CONT0]]:
-; CHECK: %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
-; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
-; CHECK: [[IF1]]:
-; CHECK: %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK: %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK: %[[T08:.+]] = add nsw i64 %[[T07]], %x
-; CHECK: %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
-; CHECK: %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
-; CHECK: br label %[[CONT1]]
-; CHECK: [[CONT1]]:
-; CHECK: phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+; COST-LABEL: predicated_udiv_scalarized_operand
+; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+;
+; CHECK-LABEL: @predicated_udiv_scalarized_operand(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
+; CHECK: [[PRED_UDIV_IF]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
+; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
+; CHECK: [[PRED_UDIV_CONTINUE]]:
+; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
+; CHECK: [[PRED_UDIV_IF1]]:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
+; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
+; CHECK: [[PRED_UDIV_CONTINUE2]]:
+; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
 entry:
   br label %for.body
 
@@ -43,7 +58,8 @@ for.body:
   %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
   %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
   %tmp2 = load i64, i64* %tmp0, align 4
-  br i1 %c, label %if.then, label %for.inc
+  %cond0 = icmp sgt i64 %tmp2, 0
+  br i1 %cond0, label %if.then, label %for.inc
 
 if.then:
   %tmp3 = add nsw i64 %tmp2, %x
@@ -54,8 +70,8 @@ for.inc:
   %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
   %tmp6 = add i64 %r, %tmp5
   %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, 100
-  br i1 %cond, label %for.body, label %for.end
+  %cond1 = icmp slt i64 %i.next, 100
+  br i1 %cond1, label %for.body, label %for.end
 
 for.end:
   %tmp7 = phi i64 [ %tmp6, %for.inc ]
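
For orientation, here is a rough C sketch of the loop this test now vectorizes (hypothetical source, inferred from the IR; the udiv is guarded by the a[i] > 0 predicate, which is why its operands are candidates for scalarization):

    /* Hypothetical C analogue of @predicated_udiv_scalarized_operand;
       not part of the test suite. */
    long predicated_udiv_scalarized_operand(long *a, long x) {
      long r = 0;
      for (long i = 0; i < 100; ++i) {
        long t = a[i];
        if (t > 0)
          t = (long)((unsigned long)t / (unsigned long)(t + x)); /* the udiv */
        r += t;
      }
      return r;
    }
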
diff --git a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
deleted file mode 100644
index fc68adb59df3..000000000000
--- a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ /dev/null
@@ -1,341 +0,0 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-; CHECK-LABEL: @recurrence_1
-;
-; void recurrence_1(int *a, int *b, int n) {
-;   for(int i = 0; i < n; i++)
-;     b[i] = a[i] + a[i - 1]
-; }
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i32>
-; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
-; UNROLL: [[L2]] = load <4 x i32>
-; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
-;
-define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
-entry:
-  br label %for.preheader
-
-for.preheader:
-  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
-  %pre_load = load i32, i32* %arrayidx.phi.trans.insert
-  br label %scalar.body
-
-scalar.body:
-  %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ]
-  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
-  %1 = load i32, i32* %arrayidx32
-  %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %add35 = add i32 %1, %0
-  store i32 %add35, i32* %arrayidx34
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.exit, label %scalar.body
-
-for.exit:
-  ret void
-}
-
-; CHECK-LABEL: @recurrence_2
-;
-; int recurrence_2(int *a, int n) {
-;   int minmax;
-;   for (int i = 0; i < n; ++i)
-;     minmax = min(minmax, max(a[i] - a[i-1], 0));
-;   return minmax;
-; }
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i32>
-; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
-; UNROLL: [[L2]] = load <4 x i32>
-; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
-;
-define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
-entry:
-  %cmp27 = icmp sgt i32 %n, 0
-  br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
-
-for.preheader:
-  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
-  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
-  br label %scalar.body
-
-for.cond.cleanup.loopexit:
-  %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %minmax.0.lcssa
-
-scalar.body:
-  %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
-  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
-  %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx, align 4
-  %sub3 = sub nsw i32 %1, %0
-  %cmp4 = icmp sgt i32 %sub3, 0
-  %cond = select i1 %cmp4, i32 %sub3, i32 0
-  %cmp5 = icmp slt i32 %minmax.028, %cond
-  %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body
-}
-
-; CHECK-LABEL: @recurrence_3
-;
-; void recurrence_3(short *a, double *b, int n, float f, short p) {
-;   b[0] = (double)a[0] - f * (double)p;
-;   for (int i = 1; i < n; i++)
-;     b[i] = (double)a[i] - f * (double)a[i - 1];
-; }
-;
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i16>
-; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16>
-; UNROLL: [[L2]] = load <4 x i16>
-; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3
-;
-define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) {
-entry:
-  %0 = load i16, i16* %a, align 2
-  %conv = sitofp i16 %0 to double
-  %conv1 = fpext float %f to double
-  %conv2 = sitofp i16 %p to double
-  %mul = fmul fast double %conv2, %conv1
-  %sub = fsub fast double %conv, %mul
-  store double %sub, double* %b, align 8
-  %cmp25 = icmp sgt i32 %n, 1
-  br i1 %cmp25, label %for.preheader, label %for.end
-
-for.preheader:
-  br label %scalar.body
-
-scalar.body:
-  %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ]
-  %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ]
-  %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv
-  %2 = load i16, i16* %arrayidx5, align 2
-  %conv6 = sitofp i16 %2 to double
-  %conv11 = sitofp i16 %1 to double
-  %mul12 = fmul fast double %conv11, %conv1
-  %sub13 = fsub fast double %conv6, %mul12
-  %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv
-  store double %sub13, double* %arrayidx15, align 8
-  %advars.iv.next = add nuw nsw i64 %advars.iv, 1
-  %lftr.wideiv = trunc i64 %advars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.end.loopexit, label %scalar.body
-
-for.end.loopexit:
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: @PR26734
-;
-; void PR26734(short *a, int *b, int *c, int d, short *e) {
-;   for (; d != 21; d++) {
-;     *b &= *c;
-;     *e = *a - 6;
-;     *c = *e;
-;   }
-; }
-;
-; CHECK-NOT: vector.ph:
-;
-define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) {
-entry:
-  %cmp4 = icmp eq i32 %d, 21
-  br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph
-
-entry.for.end_crit_edge:
-  %.pre = load i32, i32* %b, align 4
-  br label %for.end
-
-for.body.lr.ph:
-  %0 = load i16, i16* %a, align 2
-  %sub = add i16 %0, -6
-  %conv2 = sext i16 %sub to i32
-  %c.promoted = load i32, i32* %c, align 4
-  %b.promoted = load i32, i32* %b, align 4
-  br label %for.body
-
-for.body:
-  %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ]
-  %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ]
-  %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ]
-  %and = and i32 %and6, %conv25
-  %inc = add nsw i32 %inc7, 1
-  %cmp = icmp eq i32 %inc, 21
-  br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body
-
-for.cond.for.end_crit_edge:
-  %and.lcssa = phi i32 [ %and, %for.body ]
-  store i32 %conv2, i32* %c, align 4
-  store i32 %and.lcssa, i32* %b, align 4
-  store i16 %sub, i16* %e, align 2
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: @PR27246
-;
-; int PR27246() {
-;   unsigned int e, n;
-;   for (int i = 1; i < 49; ++i) {
-;     for (int k = i; k > 1; --k)
-;       e = k;
-;     n = e;
-;   }
-;   return n;
-; }
-;
-; CHECK-NOT: vector.ph:
-;
-define i32 @PR27246() {
-entry:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:
-  %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ]
-  %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ]
-  br label %for.cond1
-
-for.cond.cleanup:
-  %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ]
-  ret i32 %e.1.lcssa.lcssa
-
-for.cond1:
-  %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
-  %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
-  %cmp2 = icmp sgt i32 %k.0, 1
-  %dec = add nsw i32 %k.0, -1
-  br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
-
-for.cond.cleanup3:
-  %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ]
-  %inc = add nuw nsw i32 %i.016, 1
-  %exitcond = icmp eq i32 %inc, 49
-  br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader
-}
-
-; CHECK-LABEL: @PR29559
-;
-; UNROLL-NO-IC: vector.ph:
-; UNROLL-NO-IC: br label %vector.body
-;
-; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; UNROLL-NO-IC: %vector.recur = phi <4 x float*> [ undef, %vector.ph ], [ %[[I4:.+]], %vector.body ]
-; UNROLL-NO-IC: %[[G1:.+]] = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
-; UNROLL-NO-IC: %[[I1:.+]] = insertelement <4 x float*> undef, float* %[[G1]], i32 0
-; UNROLL-NO-IC: %[[I2:.+]] = insertelement <4 x float*> %[[I1]], float* %[[G1]], i32 1
-; UNROLL-NO-IC: %[[I3:.+]] = insertelement <4 x float*> %[[I2]], float* %[[G1]], i32 2
-; UNROLL-NO-IC: %[[I4]] = insertelement <4 x float*> %[[I3]], float* %[[G1]], i32 3
-; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %vector.recur, <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %[[I4]], <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x float*> %[[I4]], i32 3
-;
-; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC: %scalar.recur.init = phi float* [ %vector.recur.extract, %middle.block ], [ undef, %min.iters.checked ], [ undef, %entry ]
-;
-; UNROLL-NO-IC: scalar.body:
-; UNROLL-NO-IC: %scalar.recur = phi float* [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-define void @PR29559() {
-entry:
-  br label %scalar.body
-
-scalar.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
-  %tmp2 = phi float* [ undef, %entry ], [ %tmp3, %scalar.body ]
-  %tmp3 = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, undef
-  br i1 %cond, label %for.end, label %scalar.body
-
-for.end:
-  ret void
-}
diff --git a/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
new file mode 100644
index 000000000000..e8ef42562356
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @non_primary_iv_trunc_free(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @non_primary_iv_trunc_free(i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = trunc i64 %i to i32
+  %i.next = add nuw nsw i64 %i, 5
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
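
As an aside, a hypothetical C analogue of @non_primary_iv_trunc_free added above (the (int) cast models the i64-to-i32 trunc the test expects to be free on AArch64):

    /* Hypothetical sketch; the trunc result is otherwise unused in the IR. */
    void non_primary_iv_trunc_free(long n) {
      long i = 0;
      do {
        int t = (int)i; /* trunc i64 -> i32 */
        (void)t;
        i += 5;
      } while (i < n);
    }
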
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
new file mode 100644
index 000000000000..0ebb7a92edae
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; This test shows an extremely high interleaving cost that should probably be
+; fixed. Because the cost is so high, interleaving is not beneficial, and the
+; cost model chooses to scalarize the load instructions instead.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+  %tmp1 = load i8, i8* %tmp0, align 1
+  %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
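
A C-level sketch of @test above (hypothetical source; the two i8 loads read both fields of a %pair element, forming the factor-2 interleaved group whose high cost drives the vectorizer to scalarize):

    /* Hypothetical sketch; the loads have no users in the IR either. */
    struct pair { char x, y; };

    void test(struct pair *p, unsigned long n) {
      unsigned long i = 0;
      do {
        char a = p[i].x;
        char b = p[i].y;
        (void)a;
        (void)b;
        ++i;
      } while (i != n);
    }
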
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index df1f9c619408..54ee8fc6e73f 100644
--- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -1,81 +1,189 @@
-; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnueabi"
 
-@AB = common global [1024 x i8] zeroinitializer, align 4
-@CD = common global [1024 x i8] zeroinitializer, align 4
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_8-LABEL: Checking a loop in "i8_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+  %tmp2 = load i8, i8* %tmp0, align 1
+  %tmp3 = load i8, i8* %tmp1, align 1
+  store i8 0, i8* %tmp0, align 1
+  store i8 0, i8* %tmp1, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
 entry:
   br label %for.body
+
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+  %tmp2 = load i16, i16* %tmp0, align 2
+  %tmp3 = load i16, i16* %tmp1, align 2
+  store i16 0, i16* %tmp0, align 2
+  store i16 0, i16* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
-define void @test_byte_interleaved_cost(i8 %C, i8 %D) {
+
+for.end:
+  ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved
-; access group is 2.
-
-; CHECK: LV: Checking a loop in "test_byte_interleaved_cost"
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4
-
-for.body: ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv
-  %tmp = load i8, i8* %arrayidx0, align 4
-  %tmp1 = or i64 %indvars.iv, 1
-  %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1
-  %tmp2 = load i8, i8* %arrayidx1, align 4
-  %add = add nsw i8 %tmp, %C
-  %mul = mul nsw i8 %tmp2, %D
-  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv
-  store i8 %add, i8* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1
-  store i8 %mul, i8* %arrayidx3, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
-  %cmp = icmp slt i64 %indvars.iv.next, 1024
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end: ; preds = %for.body
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp0, align 4
+  store i32 0, i32* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
   ret void
 }
 
-%ig.factor.8 = type { double*, double, double, double, double, double, double, double }
-define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) {
+%i64.2 = type {i64, i64}
+define void @i64_factor_2(%i64.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; Check the default cost of a strided load with a factor that is greater than
-; the maximum allowed. In this test, the interleave factor would be 8, which is
-; not supported.
+; VF_2-LABEL: Checking a loop in "i64_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL: Checking a loop in "i64_factor_2"
+; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL: Checking a loop in "i64_factor_2"
+; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
+  %tmp2 = load i64, i64* %tmp0, align 8
+  %tmp3 = load i64, i64* %tmp1, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
 
-; CHECK: LV: Checking a loop in "wide_interleaved_group"
-; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %1 = load double, double* %0, align 8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %5 = load double, double* %4, align 8
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store double %9, double* %10, align 8
+%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64}
+define void @i64_factor_8(%i64.8* %data, i64 %n) {
+entry:
+  br label %for.body
+; The interleave factor in this test is 8, which is greater than the maximum
+; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI
+; implementation for determining the cost of the interleaved load group. The
+; stores do not form a legal interleaved group because the group would contain
+; gaps.
+;
+; VF_2-LABEL: Checking a loop in "i64_factor_8"
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ]
-  %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2
-  %1 = load double, double* %0, align 8
-  %2 = fcmp fast olt double %1, %a
-  %3 = select i1 %2, double 0.000000e+00, double %1
-  %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6
-  %5 = load double, double* %4, align 8
-  %6 = fcmp fast olt double %5, %a
-  %7 = select i1 %6, double 0.000000e+00, double %5
-  %8 = fmul fast double %7, %b
-  %9 = fadd fast double %8, %3
-  %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3
-  store double %9, double* %10, align 8
-  %11 = fmul fast double %9, %9
-  %12 = fadd fast double %11, %r
+  %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
+  %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6
+  %tmp2 = load i64, i64* %tmp0, align 8
+  %tmp3 = load i64, i64* %tmp1, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
   %i.next = add nuw nsw i64 %i, 1
-  %13 = trunc i64 %i.next to i32
-  %cond = icmp eq i32 %13, %n
-  br i1 %cond, label %for.exit, label %for.body
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
 
-for.exit:
-  %r.lcssa = phi double [ %12, %for.body ]
-  ret double %r.lcssa
+for.end:
+  ret void
 }
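
For orientation, a hypothetical C analogue of @i64_factor_8 above: only fields 2 and 6 of the eight-field struct are accessed, so the loads form a factor-8 strided group (beyond AArch64's maximum of 4) and the stores leave gaps in the group:

    /* Hypothetical sketch mirroring the bottom-tested IR loop. */
    struct i64x8 { long f0, f1, f2, f3, f4, f5, f6, f7; };

    void i64_factor_8(struct i64x8 *data, long n) {
      long i = 0;
      do {
        long a = data[i].f2;
        long b = data[i].f6;
        data[i].f2 = 0;
        data[i].f6 = 0;
        (void)a;
        (void)b;
        i += 1;
      } while (i < n);
    }
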
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index c7ced757581a..d06e3fdba39c 100644
--- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -234,12 +234,27 @@ for.body: ; preds = %entry, %for.body
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-; CHECK-LABEL: @add_phifail2(
-; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i32>
-; CHECK: store <16 x i8>
 ; Function Attrs: nounwind
+; When we vectorize this loop, we generate correct code even when %len
+; exactly divides VF: we extract the recurrence from the second-to-last index
+; and pass it to the for.cond.cleanup block, so the vectorized loop returns
+; the correct value, a_phi = p[len - 2].
 define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+; CHECK-LABEL: @add_phifail2(
+; CHECK: vector.body:
+; CHECK: %wide.load = load <16 x i8>, <16 x i8>*
+; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
+; CHECK: add nuw nsw <16 x i32>
+; CHECK: store <16 x i8>
+; CHECK: add i64 %index, 16
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
+; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
+; CHECK: for.cond.cleanup:
+; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
+; CHECK: %ret = trunc i32 %a_phi.lcssa to i8
+; CHECK: ret i8 %ret
 entry:
   br label %for.body
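
A hypothetical C rendering of @add_phifail2 (the function body lies outside this hunk, so this is reconstructed from the surrounding checks): a_phi trails the load by one iteration, which is why the function ends up returning p[len - 2]:

    /* Hypothetical sketch; not the test's actual source. */
    unsigned char add_phifail2(unsigned char *p, unsigned char *q, int len) {
      int a_phi = 0, conv = 0;
      for (int i = 0; i < len; ++i) {
        a_phi = conv;  /* value loaded on the previous iteration */
        conv = p[i];
        q[i] = (unsigned char)(conv + 1);
      }
      return (unsigned char)a_phi; /* p[len - 2] whenever len >= 2 */
    }
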
diff --git a/test/Transforms/LoopVectorize/AArch64/pr31900.ll b/test/Transforms/LoopVectorize/AArch64/pr31900.ll
new file mode 100644
index 000000000000..5ea38a4a246d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/pr31900.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s
+
+; Reproducer for address space fault in the LoopVectorizer (pr31900). Added
+; different sized address space pointers (p:16:16-p4:32:16) to the aarch64
+; datalayout to reproduce the fault.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16"
+
+; Check that all the loads are scalarized
+; CHECK: load i16, i16*
+; CHECK: load i16, i16*
+; CHECK: load i16, i16 addrspace(4)*
+; CHECK: load i16, i16 addrspace(4)*
+
+%rec1445 = type { i16, i16, i16, i16, i16 }
+
+define void @foo() {
+bb1:
+  br label %bb4
+
+bb4:
+  %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ]
+  %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ]
+  %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ]
+  %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1
+  %_tmp987 = load i16, i16* %0, align 1
+  %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1
+  %_tmp993 = load i16, i16 addrspace(4)* %1, align 1
+  %_tmp1013 = add i16 %tmp1, 1
+  %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1
+  %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1
+  %_tmp1019 = icmp ult i16 %_tmp1013, 24
+  br i1 %_tmp1019, label %bb4, label %bb16
+
+bb16:
+  unreachable
+}
diff --git a/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
new file mode 100644
index 000000000000..1ae7dadeffd7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: Checking a loop in "interleaved_access"
+; CHECK: The Smallest and Widest types: 64 / 64 bits
+;
+define void @interleaved_access(i8** %A, i64 %N) {
+for.ph:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ]
+  %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i
+  store i8* null, i8** %tmp0, align 8
+  %i.next.0 = add nuw nsw i64 %i, 1
+  %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0
+  store i8* null, i8** %tmp1, align 8
+  %i.next.1 = add nsw i64 %i, 2
+  %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1
+  store i8* null, i8** %tmp2, align 8
+  %i.next.2 = add nsw i64 %i, 3
+  %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2
+  store i8* null, i8** %tmp3, align 8
+  %i.next.3 = add nsw i64 %i, 4
+  %cond = icmp slt i64 %i.next.3, %N
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
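
Finally, a hypothetical C analogue of @interleaved_access above; every memory access is a 64-bit pointer store, which is why the vectorizer reports 64 bits for both the smallest and widest types:

    /* Hypothetical sketch of the four consecutive pointer stores. */
    void interleaved_access(char **A, long N) {
      long i = 0;
      do {
        A[i] = 0;
        A[i + 1] = 0;
        A[i + 2] = 0;
        A[i + 3] = 0;
        i += 4;
      } while (i < N);
    }
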