Diffstat (limited to 'test/Transforms/LoopVectorize/AArch64')
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll |  82
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll | 341
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/induction-trunc.ll |  30
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll |  38
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll | 226
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll |  23
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/pr31900.ll |  37
-rw-r--r--  test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll |  33
8 files changed, 373 insertions(+), 437 deletions(-)
diff --git a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 21b59f87d042..37a6d4e79984 100644
--- a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,40 +1,55 @@
-; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
-; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
-; CHECK-LABEL: predicated_udiv_scalarized_operand
-;
; This test checks that we correctly compute the scalarized operands for a
; user-specified vectorization factor when interleaving is disabled. We use the
-; "optsize" attribute to disable all interleaving calculations.
+; "optsize" attribute to disable all interleaving calculations. A cost of 4
+; for %tmp4 indicates that we would scalarize its operand (%tmp3), giving
+; %tmp4 a lower scalarization overhead.
;
-; CHECK: vector.body:
-; CHECK: %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
-; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
-; CHECK: [[IF0]]:
-; CHECK: %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK: %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK: %[[T02:.+]] = add nsw i64 %[[T01]], %x
-; CHECK: %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
-; CHECK: %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
-; CHECK: br label %[[CONT0]]
-; CHECK: [[CONT0]]:
-; CHECK: %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
-; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
-; CHECK: [[IF1]]:
-; CHECK: %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK: %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK: %[[T08:.+]] = add nsw i64 %[[T07]], %x
-; CHECK: %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
-; CHECK: %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
-; CHECK: br label %[[CONT1]]
-; CHECK: [[CONT1]]:
-; CHECK: phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+; COST-LABEL: predicated_udiv_scalarized_operand
+; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+;
+; CHECK-LABEL: @predicated_udiv_scalarized_operand(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
+; CHECK: [[PRED_UDIV_IF]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
+; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
+; CHECK: [[PRED_UDIV_CONTINUE]]:
+; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
+; CHECK: [[PRED_UDIV_IF1]]:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
+; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
+; CHECK: [[PRED_UDIV_CONTINUE2]]:
+; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
entry:
br label %for.body
@@ -43,7 +58,8 @@ for.body:
%r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
%tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
%tmp2 = load i64, i64* %tmp0, align 4
- br i1 %c, label %if.then, label %for.inc
+ %cond0 = icmp sgt i64 %tmp2, 0
+ br i1 %cond0, label %if.then, label %for.inc
if.then:
%tmp3 = add nsw i64 %tmp2, %x
@@ -54,8 +70,8 @@ for.inc:
%tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
%tmp6 = add i64 %r, %tmp5
%i.next = add nuw nsw i64 %i, 1
- %cond = icmp slt i64 %i.next, 100
- br i1 %cond, label %for.body, label %for.end
+ %cond1 = icmp slt i64 %i.next, 100
+ br i1 %cond1, label %for.body, label %for.end
for.end:
%tmp7 = phi i64 [ %tmp6, %for.inc ]
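
For reference, here is a rough C equivalent of the loop this test vectorizes (an illustrative sketch inferred from the IR above; it is not part of the test itself):

long predicated_udiv_scalarized_operand(const long *a, long x) {
  long r = 0;
  for (long i = 0; i < 100; ++i) {
    long t = a[i];                                  /* %tmp2             */
    long v = t;
    if (t > 0)                                      /* %cond0 predicate  */
      v = (long)((unsigned long)t /
                 (unsigned long)(t + x));           /* %tmp4 = udiv      */
    r += v;                                         /* reduction %tmp6   */
  }
  return r;
}

Because the udiv executes only under the predicate, its add operand (%tmp3) can be computed per lane inside the predicated block, which is the scalarization that the COST check above is exercising.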
diff --git a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
deleted file mode 100644
index fc68adb59df3..000000000000
--- a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ /dev/null
@@ -1,341 +0,0 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-; CHECK-LABEL: @recurrence_1
-;
-; void recurrence_1(int *a, int *b, int n) {
-; for(int i = 0; i < n; i++)
-; b[i] = a[i] + a[i - 1]
-; }
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i32>
-; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
-; UNROLL: [[L2]] = load <4 x i32>
-; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
-;
-define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
-entry:
- br label %for.preheader
-
-for.preheader:
- %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
- %pre_load = load i32, i32* %arrayidx.phi.trans.insert
- br label %scalar.body
-
-scalar.body:
- %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ]
- %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
- %1 = load i32, i32* %arrayidx32
- %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
- %add35 = add i32 %1, %0
- store i32 %add35, i32* %arrayidx34
- %lftr.wideiv = trunc i64 %indvars.iv.next to i32
- %exitcond = icmp eq i32 %lftr.wideiv, %n
- br i1 %exitcond, label %for.exit, label %scalar.body
-
-for.exit:
- ret void
-}
-
-; CHECK-LABEL: @recurrence_2
-;
-; int recurrence_2(int *a, int n) {
-; int minmax;
-; for (int i = 0; i < n; ++i)
-; minmax = min(minmax, max(a[i] - a[i-1], 0));
-; return minmax;
-; }
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i32>
-; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
-; UNROLL: [[L2]] = load <4 x i32>
-; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
-;
-define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
-entry:
- %cmp27 = icmp sgt i32 %n, 0
- br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
-
-for.preheader:
- %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
- %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
- br label %scalar.body
-
-for.cond.cleanup.loopexit:
- %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup:
- %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
- ret i32 %minmax.0.lcssa
-
-scalar.body:
- %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
- %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
- %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
- %1 = load i32, i32* %arrayidx, align 4
- %sub3 = sub nsw i32 %1, %0
- %cmp4 = icmp sgt i32 %sub3, 0
- %cond = select i1 %cmp4, i32 %sub3, i32 0
- %cmp5 = icmp slt i32 %minmax.028, %cond
- %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %lftr.wideiv = trunc i64 %indvars.iv.next to i32
- %exitcond = icmp eq i32 %lftr.wideiv, %n
- br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body
-}
-
-; CHECK-LABEL: @recurrence_3
-;
-; void recurrence_3(short *a, double *b, int n, float f, short p) {
-; b[0] = (double)a[0] - f * (double)p;
-; for (int i = 1; i < n; i++)
-; b[i] = (double)a[i] - f * (double)a[i - 1];
-; }
-;
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i16>
-; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16>
-; UNROLL: [[L2]] = load <4 x i16>
-; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3
-;
-define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) {
-entry:
- %0 = load i16, i16* %a, align 2
- %conv = sitofp i16 %0 to double
- %conv1 = fpext float %f to double
- %conv2 = sitofp i16 %p to double
- %mul = fmul fast double %conv2, %conv1
- %sub = fsub fast double %conv, %mul
- store double %sub, double* %b, align 8
- %cmp25 = icmp sgt i32 %n, 1
- br i1 %cmp25, label %for.preheader, label %for.end
-
-for.preheader:
- br label %scalar.body
-
-scalar.body:
- %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ]
- %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ]
- %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv
- %2 = load i16, i16* %arrayidx5, align 2
- %conv6 = sitofp i16 %2 to double
- %conv11 = sitofp i16 %1 to double
- %mul12 = fmul fast double %conv11, %conv1
- %sub13 = fsub fast double %conv6, %mul12
- %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv
- store double %sub13, double* %arrayidx15, align 8
- %advars.iv.next = add nuw nsw i64 %advars.iv, 1
- %lftr.wideiv = trunc i64 %advars.iv.next to i32
- %exitcond = icmp eq i32 %lftr.wideiv, %n
- br i1 %exitcond, label %for.end.loopexit, label %scalar.body
-
-for.end.loopexit:
- br label %for.end
-
-for.end:
- ret void
-}
-
-; CHECK-LABEL: @PR26734
-;
-; void PR26734(short *a, int *b, int *c, int d, short *e) {
-; for (; d != 21; d++) {
-; *b &= *c;
-; *e = *a - 6;
-; *c = *e;
-; }
-; }
-;
-; CHECK-NOT: vector.ph:
-;
-define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) {
-entry:
- %cmp4 = icmp eq i32 %d, 21
- br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph
-
-entry.for.end_crit_edge:
- %.pre = load i32, i32* %b, align 4
- br label %for.end
-
-for.body.lr.ph:
- %0 = load i16, i16* %a, align 2
- %sub = add i16 %0, -6
- %conv2 = sext i16 %sub to i32
- %c.promoted = load i32, i32* %c, align 4
- %b.promoted = load i32, i32* %b, align 4
- br label %for.body
-
-for.body:
- %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ]
- %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ]
- %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ]
- %and = and i32 %and6, %conv25
- %inc = add nsw i32 %inc7, 1
- %cmp = icmp eq i32 %inc, 21
- br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body
-
-for.cond.for.end_crit_edge:
- %and.lcssa = phi i32 [ %and, %for.body ]
- store i32 %conv2, i32* %c, align 4
- store i32 %and.lcssa, i32* %b, align 4
- store i16 %sub, i16* %e, align 2
- br label %for.end
-
-for.end:
- ret void
-}
-
-; CHECK-LABEL: @PR27246
-;
-; int PR27246() {
-; unsigned int e, n;
-; for (int i = 1; i < 49; ++i) {
-; for (int k = i; k > 1; --k)
-; e = k;
-; n = e;
-; }
-; return n;
-; }
-;
-; CHECK-NOT: vector.ph:
-;
-define i32 @PR27246() {
-entry:
- br label %for.cond1.preheader
-
-for.cond1.preheader:
- %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ]
- %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ]
- br label %for.cond1
-
-for.cond.cleanup:
- %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ]
- ret i32 %e.1.lcssa.lcssa
-
-for.cond1:
- %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
- %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
- %cmp2 = icmp sgt i32 %k.0, 1
- %dec = add nsw i32 %k.0, -1
- br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
-
-for.cond.cleanup3:
- %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ]
- %inc = add nuw nsw i32 %i.016, 1
- %exitcond = icmp eq i32 %inc, 49
- br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader
-}
-
-; CHECK-LABEL: @PR29559
-;
-; UNROLL-NO-IC: vector.ph:
-; UNROLL-NO-IC: br label %vector.body
-;
-; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; UNROLL-NO-IC: %vector.recur = phi <4 x float*> [ undef, %vector.ph ], [ %[[I4:.+]], %vector.body ]
-; UNROLL-NO-IC: %[[G1:.+]] = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
-; UNROLL-NO-IC: %[[I1:.+]] = insertelement <4 x float*> undef, float* %[[G1]], i32 0
-; UNROLL-NO-IC: %[[I2:.+]] = insertelement <4 x float*> %[[I1]], float* %[[G1]], i32 1
-; UNROLL-NO-IC: %[[I3:.+]] = insertelement <4 x float*> %[[I2]], float* %[[G1]], i32 2
-; UNROLL-NO-IC: %[[I4]] = insertelement <4 x float*> %[[I3]], float* %[[G1]], i32 3
-; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %vector.recur, <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %[[I4]], <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x float*> %[[I4]], i32 3
-;
-; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC: %scalar.recur.init = phi float* [ %vector.recur.extract, %middle.block ], [ undef, %min.iters.checked ], [ undef, %entry ]
-;
-; UNROLL-NO-IC: scalar.body:
-; UNROLL-NO-IC: %scalar.recur = phi float* [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-define void @PR29559() {
-entry:
- br label %scalar.body
-
-scalar.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
- %tmp2 = phi float* [ undef, %entry ], [ %tmp3, %scalar.body ]
- %tmp3 = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
- %i.next = add nuw nsw i64 %i, 1
- %cond = icmp eq i64 %i.next, undef
- br i1 %cond, label %for.end, label %scalar.body
-
-for.end:
- ret void
-}
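
The deleted checks above exercised the splice that first-order-recurrence vectorization performs. Conceptually, recurrence_1 at VF 4 behaves like the following C sketch (illustrative only; the names mirror the CHECK variables rather than anything in the actual output):

void recurrence_1_vf4(const int *a, int *b, int n) {
  int recur3 = a[0];              /* %vector.recur.init, lane 3 = %pre_load */
  int i = 0;
  for (; i + 4 <= n; i += 4) {
    int load[4];
    for (int j = 0; j < 4; ++j)
      load[j] = a[i + j + 1];     /* the wide <4 x i32> load                */
    /* shufflevector <3, 4, 5, 6>: the last lane of the previous vector
       followed by the first three lanes of the current one */
    int spliced[4] = { recur3, load[0], load[1], load[2] };
    for (int j = 0; j < 4; ++j)
      b[i + j] = load[j] + spliced[j];
    recur3 = load[3];             /* %vector.recur for the next iteration   */
  }
  /* middle.block: %vector.recur.extract == recur3, feeding the scalar
     epilogue that finishes the remaining n - i iterations */
}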
diff --git a/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
new file mode 100644
index 000000000000..e8ef42562356
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @non_primary_iv_trunc_free(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @non_primary_iv_trunc_free(i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = trunc i64 %i to i32
+ %i.next = add nuw nsw i64 %i, 5
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
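
In C terms, the loop being tested is roughly the following (a sketch inferred from the IR; the IR loop is bottom-tested, so the shapes differ slightly):

void non_primary_iv_trunc_free(long n) {
  for (long i = 0; i < n; i += 5) {
    int t = (int)i;  /* as the test name suggests, truncating the i64
                        induction is free on AArch64, so the vectorizer
                        truncates the 64-bit induction per unrolled part
                        instead of creating a separate i32 induction */
    (void)t;
  }
}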
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
new file mode 100644
index 000000000000..0ebb7a92edae
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; This test shows an extremely high interleaving cost that should probably be fixed.
+; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
+; the load instructions.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+ %tmp1 = load i8, i8* %tmp0, align 1
+ %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+ %tmp3 = load i8, i8* %tmp2, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
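A rough C model of the loop above (the loaded values are intentionally unused; the point is the stride-2 access pattern):

struct pair { char x, y; };

void test(struct pair *p, long n) {
  for (long i = 0; i < n; ++i) {
    char a = p[i].x;   /* candidate interleave-group leader */
    char b = p[i].y;   /* second member of the group        */
    (void)a;
    (void)b;
  }
}

With the group leader costed at 20 for VF 2, forming the interleaved group is not beneficial, so the two i8 loads stay scalar, which is what the CHECK lines verify.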
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index df1f9c619408..54ee8fc6e73f 100644
--- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -1,81 +1,189 @@
-; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
; REQUIRES: asserts
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnueabi"
-@AB = common global [1024 x i8] zeroinitializer, align 4
-@CD = common global [1024 x i8] zeroinitializer, align 4
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_8-LABEL: Checking a loop in "i8_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+ %tmp2 = load i8, i8* %tmp0, align 1
+ %tmp3 = load i8, i8* %tmp1, align 1
+ store i8 0, i8* %tmp0, align 1
+ store i8 0, i8* %tmp1, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+ %tmp2 = load i16, i16* %tmp0, align 2
+ %tmp3 = load i16, i16* %tmp1, align 2
+ store i16 0, i16* %tmp0, align 2
+ store i16 0, i16* %tmp1, align 2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
-define void @test_byte_interleaved_cost(i8 %C, i8 %D) {
+for.end:
+ ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
entry:
br label %for.body
-; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved
-; access group is 2.
-
-; CHECK: LV: Checking a loop in "test_byte_interleaved_cost"
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv
- %tmp = load i8, i8* %arrayidx0, align 4
- %tmp1 = or i64 %indvars.iv, 1
- %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1
- %tmp2 = load i8, i8* %arrayidx1, align 4
- %add = add nsw i8 %tmp, %C
- %mul = mul nsw i8 %tmp2, %D
- %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv
- store i8 %add, i8* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1
- store i8 %mul, i8* %arrayidx3, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
- %cmp = icmp slt i64 %indvars.iv.next, 1024
- br i1 %cmp, label %for.body, label %for.end
-
-for.end: ; preds = %for.body
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+ %tmp2 = load i32, i32* %tmp0, align 4
+ %tmp3 = load i32, i32* %tmp1, align 4
+ store i32 0, i32* %tmp0, align 4
+ store i32 0, i32* %tmp1, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
ret void
}
-%ig.factor.8 = type { double*, double, double, double, double, double, double, double }
-define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) {
+%i64.2 = type {i64, i64}
+define void @i64_factor_2(%i64.2* %data, i64 %n) {
entry:
br label %for.body
-; Check the default cost of a strided load with a factor that is greater than
-; the maximum allowed. In this test, the interleave factor would be 8, which is
-; not supported.
+; VF_2-LABEL: Checking a loop in "i64_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL: Checking a loop in "i64_factor_2"
+; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL: Checking a loop in "i64_factor_2"
+; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
+ %tmp2 = load i64, i64* %tmp0, align 8
+ %tmp3 = load i64, i64* %tmp1, align 8
+ store i64 0, i64* %tmp0, align 8
+ store i64 0, i64* %tmp1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
-; CHECK: LV: Checking a loop in "wide_interleaved_group"
-; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %1 = load double, double* %0, align 8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %5 = load double, double* %4, align 8
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store double %9, double* %10, align 8
+%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64}
+define void @i64_factor_8(%i64.8* %data, i64 %n) {
+entry:
+ br label %for.body
+; The interleave factor in this test is 8, which is greater than the maximum
+; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI
+; implementation for determining the cost of the interleaved load group. The
+; stores do not form a legal interleaved group because the group would contain
+; gaps.
+;
+; VF_2-LABEL: Checking a loop in "i64_factor_8"
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ]
- %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2
- %1 = load double, double* %0, align 8
- %2 = fcmp fast olt double %1, %a
- %3 = select i1 %2, double 0.000000e+00, double %1
- %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6
- %5 = load double, double* %4, align 8
- %6 = fcmp fast olt double %5, %a
- %7 = select i1 %6, double 0.000000e+00, double %5
- %8 = fmul fast double %7, %b
- %9 = fadd fast double %8, %3
- %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3
- store double %9, double* %10, align 8
- %11 = fmul fast double %9, %9
- %12 = fadd fast double %11, %r
+ %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
+ %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6
+ %tmp2 = load i64, i64* %tmp0, align 8
+ %tmp3 = load i64, i64* %tmp1, align 8
+ store i64 0, i64* %tmp0, align 8
+ store i64 0, i64* %tmp1, align 8
%i.next = add nuw nsw i64 %i, 1
- %13 = trunc i64 %i.next to i32
- %cond = icmp eq i32 %13, %n
- br i1 %cond, label %for.exit, label %for.body
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
-for.exit:
- %r.lcssa = phi double [ %12, %for.body ]
- ret double %r.lcssa
+for.end:
+ ret void
}
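
A pattern worth noting in the factor-2 checks above: the cost charged to a group appears to scale with the number of 128-bit NEON registers the widened group occupies (for example, an i32 group at VF 8 needs two <4 x i32> registers per member, hence cost 4 rather than 2). As a C-level picture of one of these loops (the names mirror the test; the cost comments summarize the checks):

struct i32x2 { int x, y; };

void i32_factor_2(struct i32x2 *data, long n) {
  for (long i = 0; i < n; ++i) {
    int a = data[i].x;   /* load group: cost charged to the first member  */
    int b = data[i].y;   /* remaining load is free (cost 0)               */
    data[i].x = 0;       /* store group: free here...                     */
    data[i].y = 0;       /* ...cost charged to the group's last member    */
    (void)a;
    (void)b;
  }
}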
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index c7ced757581a..d06e3fdba39c 100644
--- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -234,12 +234,27 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
-; CHECK-LABEL: @add_phifail2(
-; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i32>
-; CHECK: store <16 x i8>
; Function Attrs: nounwind
+; When we vectorize this loop, we generate correct code even when %len
+; exactly divides VF (since we extract from the second-to-last index and
+; pass this to the for.cond.cleanup block). The vectorized loop returns
+; the correct value a_phi = p[len - 2].
define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+; CHECK-LABEL: @add_phifail2(
+; CHECK: vector.body:
+; CHECK: %wide.load = load <16 x i8>, <16 x i8>*
+; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
+; CHECK: add nuw nsw <16 x i32>
+; CHECK: store <16 x i8>
+; CHECK: add i64 %index, 16
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
+; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
+; CHECK: for.cond.cleanup:
+; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
+; CHECK: %ret = trunc i32 %a_phi.lcssa to i8
+; CHECK: ret i8 %ret
entry:
br label %for.body
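
For orientation, a scalar C model of add_phifail2 (a sketch; the loop body is truncated in this hunk, so the constant added below is assumed):

char add_phifail2(const unsigned char *p, char *q, int len) {
  int a_phi = 0;                /* first-order recurrence                  */
  int a = 0;
  for (int i = 0; i < len; ++i) {
    a_phi = a;                  /* value loaded on the previous iteration  */
    a = p[i];                   /* the zext'd load                         */
    q[i] = (char)(a + 1);       /* assumed constant; not shown in the hunk */
  }
  return (char)a_phi;           /* == p[len - 2] once len >= 2             */
}

This is why the middle block extracts lane 14 (the second-to-last element) for the value that flows into for.cond.cleanup.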
diff --git a/test/Transforms/LoopVectorize/AArch64/pr31900.ll b/test/Transforms/LoopVectorize/AArch64/pr31900.ll
new file mode 100644
index 000000000000..5ea38a4a246d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/pr31900.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s
+
+; Reproducer for an address-space fault in the LoopVectorizer (pr31900). Added
+; different-sized address space pointers (p:16:16-p4:32:16) to the aarch64
+; datalayout to reproduce the fault.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16"
+
+; Check that all the loads are scalarized
+; CHECK: load i16, i16*
+; CHECK: load i16, i16*
+; CHECK: load i16, i16 addrspace(4)*
+; CHECK: load i16, i16 addrspace(4)*
+
+%rec1445 = type { i16, i16, i16, i16, i16 }
+
+define void @foo() {
+bb1:
+ br label %bb4
+
+bb4:
+ %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ]
+ %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ]
+ %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ]
+ %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1
+ %_tmp987 = load i16, i16* %0, align 1
+ %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1
+ %_tmp993 = load i16, i16 addrspace(4)* %1, align 1
+ %_tmp1013 = add i16 %tmp1, 1
+ %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1
+ %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1
+ %_tmp1019 = icmp ult i16 %_tmp1013, 24
+ br i1 %_tmp1019, label %bb4, label %bb16
+
+bb16:
+ unreachable
+}
diff --git a/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
new file mode 100644
index 000000000000..1ae7dadeffd7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: Checking a loop in "interleaved_access"
+; CHECK: The Smallest and Widest types: 64 / 64 bits
+;
+define void @interleaved_access(i8** %A, i64 %N) {
+for.ph:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ]
+ %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i
+ store i8* null, i8** %tmp0, align 8
+ %i.next.0 = add nuw nsw i64 %i, 1
+ %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0
+ store i8* null, i8** %tmp1, align 8
+ %i.next.1 = add nsw i64 %i, 2
+ %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1
+ store i8* null, i8** %tmp2, align 8
+ %i.next.2 = add nsw i64 %i, 3
+ %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2
+ store i8* null, i8** %tmp3, align 8
+ %i.next.3 = add nsw i64 %i, 4
+ %cond = icmp slt i64 %i.next.3, %N
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
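
The 64 / 64 result follows from the loop touching only i8* values: on this target a pointer store is 64 bits wide, so the smallest and widest value types coincide. An illustrative C sketch of the loop (bottom-tested in the IR, so the shape differs slightly):

void interleaved_access(char **A, long N) {
  for (long i = 0; i + 4 <= N; i += 4) {
    A[i]     = 0;   /* each store writes a 64-bit pointer, */
    A[i + 1] = 0;   /* so both the smallest and the widest */
    A[i + 2] = 0;   /* value type observed in the loop are */
    A[i + 3] = 0;   /* 64 bits                             */
  }
}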