Diffstat (limited to 'test/Transforms/LoopVectorize/AArch64')
8 files changed, 373 insertions, 437 deletions
diff --git a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 21b59f87d042..37a6d4e79984 100644
--- a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,40 +1,55 @@
-; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
-; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
-; CHECK-LABEL: predicated_udiv_scalarized_operand
-;
 ; This test checks that we correctly compute the scalarized operands for a
 ; user-specified vectorization factor when interleaving is disabled. We use the
-; "optsize" attribute to disable all interleaving calculations.
+; "optsize" attribute to disable all interleaving calculations. A cost of 4
+; for %tmp4 indicates that we would scalarize its operand (%tmp3), giving
+; %tmp4 a lower scalarization overhead.
 ;
-; CHECK: vector.body:
-; CHECK: %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
-; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
-; CHECK: [[IF0]]:
-; CHECK: %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK: %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK: %[[T02:.+]] = add nsw i64 %[[T01]], %x
-; CHECK: %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
-; CHECK: %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
-; CHECK: br label %[[CONT0]]
-; CHECK: [[CONT0]]:
-; CHECK: %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
-; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
-; CHECK: [[IF1]]:
-; CHECK: %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK: %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK: %[[T08:.+]] = add nsw i64 %[[T07]], %x
-; CHECK: %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
-; CHECK: %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
-; CHECK: br label %[[CONT1]]
-; CHECK: [[CONT1]]:
-; CHECK: phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+; COST-LABEL: predicated_udiv_scalarized_operand
+; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+;
+; CHECK-LABEL: @predicated_udiv_scalarized_operand(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
+; CHECK: [[PRED_UDIV_IF]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
+; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
+; CHECK: [[PRED_UDIV_CONTINUE]]:
+; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
+; CHECK: [[PRED_UDIV_IF1]]:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
+; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
+; CHECK: [[PRED_UDIV_CONTINUE2]]:
+; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
 entry:
   br label %for.body
 
@@ -43,7 +58,8 @@ for.body:
   %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
   %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
   %tmp2 = load i64, i64* %tmp0, align 4
-  br i1 %c, label %if.then, label %for.inc
+  %cond0 = icmp sgt i64 %tmp2, 0
+  br i1 %cond0, label %if.then, label %for.inc
 
 if.then:
   %tmp3 = add nsw i64 %tmp2, %x
@@ -54,8 +70,8 @@ for.inc:
   %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
   %tmp6 = add i64 %r, %tmp5
   %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, 100
-  br i1 %cond, label %for.body, label %for.end
+  %cond1 = icmp slt i64 %i.next, 100
+  br i1 %cond1, label %for.body, label %for.end
 
 for.end:
   %tmp7 = phi i64 [ %tmp6, %for.inc ]
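
For orientation, here is a rough C sketch of the loop this test now vectorizes (hypothetical source, inferred from the IR; the udiv is guarded by the a[i] > 0 predicate, which is why its operands are candidates for scalarization):

    /* Hypothetical C analogue of @predicated_udiv_scalarized_operand;
       not part of the test suite. */
    long predicated_udiv_scalarized_operand(long *a, long x) {
      long r = 0;
      for (long i = 0; i < 100; ++i) {
        long t = a[i];
        if (t > 0)
          t = (long)((unsigned long)t / (unsigned long)(t + x)); /* the udiv */
        r += t;
      }
      return r;
    }
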
diff --git a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
deleted file mode 100644
index fc68adb59df3..000000000000
--- a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ /dev/null
@@ -1,341 +0,0 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-; CHECK-LABEL: @recurrence_1
-;
-; void recurrence_1(int *a, int *b, int n) {
-;   for(int i = 0; i < n; i++)
-;     b[i] = a[i] + a[i - 1]
-; }
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i32>
-; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
-; UNROLL: [[L2]] = load <4 x i32>
-; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
-;
-define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
-entry:
-  br label %for.preheader
-
-for.preheader:
-  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
-  %pre_load = load i32, i32* %arrayidx.phi.trans.insert
-  br label %scalar.body
-
-scalar.body:
-  %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ]
-  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
-  %1 = load i32, i32* %arrayidx32
-  %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %add35 = add i32 %1, %0
-  store i32 %add35, i32* %arrayidx34
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.exit, label %scalar.body
-
-for.exit:
-  ret void
-}
-
-; CHECK-LABEL: @recurrence_2
-;
-; int recurrence_2(int *a, int n) {
-;   int minmax;
-;   for (int i = 0; i < n; ++i)
-;     minmax = min(minmax, max(a[i] - a[i-1], 0));
-;   return minmax;
-; }
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i32>
-; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
-; UNROLL: [[L2]] = load <4 x i32>
-; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
-;
-define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
-entry:
-  %cmp27 = icmp sgt i32 %n, 0
-  br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
-
-for.preheader:
-  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
-  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
-  br label %scalar.body
-
-for.cond.cleanup.loopexit:
-  %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %minmax.0.lcssa
-
-scalar.body:
-  %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
-  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
-  %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx, align 4
-  %sub3 = sub nsw i32 %1, %0
-  %cmp4 = icmp sgt i32 %sub3, 0
-  %cond = select i1 %cmp4, i32 %sub3, i32 0
-  %cmp5 = icmp slt i32 %minmax.028, %cond
-  %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body
-}
-
-; CHECK-LABEL: @recurrence_3
-;
-; void recurrence_3(short *a, double *b, int n, float f, short p) {
-;   b[0] = (double)a[0] - f * (double)p;
-;   for (int i = 1; i < n; i++)
-;     b[i] = (double)a[i] - f * (double)a[i - 1];
-; }
-;
-;
-; CHECK: vector.ph:
-; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3
-;
-; CHECK: vector.body:
-; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK: [[L1]] = load <4 x i16>
-; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK: middle.block:
-; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
-;
-; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ]
-;
-; CHECK: scalar.body:
-; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16>
-; UNROLL: [[L2]] = load <4 x i16>
-; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3
-;
-define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) {
-entry:
-  %0 = load i16, i16* %a, align 2
-  %conv = sitofp i16 %0 to double
-  %conv1 = fpext float %f to double
-  %conv2 = sitofp i16 %p to double
-  %mul = fmul fast double %conv2, %conv1
-  %sub = fsub fast double %conv, %mul
-  store double %sub, double* %b, align 8
-  %cmp25 = icmp sgt i32 %n, 1
-  br i1 %cmp25, label %for.preheader, label %for.end
-
-for.preheader:
-  br label %scalar.body
-
-scalar.body:
-  %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ]
-  %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ]
-  %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv
-  %2 = load i16, i16* %arrayidx5, align 2
-  %conv6 = sitofp i16 %2 to double
-  %conv11 = sitofp i16 %1 to double
-  %mul12 = fmul fast double %conv11, %conv1
-  %sub13 = fsub fast double %conv6, %mul12
-  %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv
-  store double %sub13, double* %arrayidx15, align 8
-  %advars.iv.next = add nuw nsw i64 %advars.iv, 1
-  %lftr.wideiv = trunc i64 %advars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.end.loopexit, label %scalar.body
-
-for.end.loopexit:
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: @PR26734
-;
-; void PR26734(short *a, int *b, int *c, int d, short *e) {
-;   for (; d != 21; d++) {
-;     *b &= *c;
-;     *e = *a - 6;
-;     *c = *e;
-;   }
-; }
-;
-; CHECK-NOT: vector.ph:
-;
-define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) {
-entry:
-  %cmp4 = icmp eq i32 %d, 21
-  br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph
-
-entry.for.end_crit_edge:
-  %.pre = load i32, i32* %b, align 4
-  br label %for.end
-
-for.body.lr.ph:
-  %0 = load i16, i16* %a, align 2
-  %sub = add i16 %0, -6
-  %conv2 = sext i16 %sub to i32
-  %c.promoted = load i32, i32* %c, align 4
-  %b.promoted = load i32, i32* %b, align 4
-  br label %for.body
-
-for.body:
-  %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ]
-  %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ]
-  %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ]
-  %and = and i32 %and6, %conv25
-  %inc = add nsw i32 %inc7, 1
-  %cmp = icmp eq i32 %inc, 21
-  br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body
-
-for.cond.for.end_crit_edge:
-  %and.lcssa = phi i32 [ %and, %for.body ]
-  store i32 %conv2, i32* %c, align 4
-  store i32 %and.lcssa, i32* %b, align 4
-  store i16 %sub, i16* %e, align 2
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: @PR27246
-;
-; int PR27246() {
-;   unsigned int e, n;
-;   for (int i = 1; i < 49; ++i) {
-;     for (int k = i; k > 1; --k)
-;       e = k;
-;     n = e;
-;   }
-;   return n;
-; }
-;
-; CHECK-NOT: vector.ph:
-;
-define i32 @PR27246() {
-entry:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:
-  %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ]
-  %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ]
-  br label %for.cond1
-
-for.cond.cleanup:
-  %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ]
-  ret i32 %e.1.lcssa.lcssa
-
-for.cond1:
-  %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
-  %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
-  %cmp2 = icmp sgt i32 %k.0, 1
-  %dec = add nsw i32 %k.0, -1
-  br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
-
-for.cond.cleanup3:
-  %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ]
-  %inc = add nuw nsw i32 %i.016, 1
-  %exitcond = icmp eq i32 %inc, 49
-  br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader
-}
-
-; CHECK-LABEL: @PR29559
-;
-; UNROLL-NO-IC: vector.ph:
-; UNROLL-NO-IC: br label %vector.body
-;
-; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; UNROLL-NO-IC: %vector.recur = phi <4 x float*> [ undef, %vector.ph ], [ %[[I4:.+]], %vector.body ]
-; UNROLL-NO-IC: %[[G1:.+]] = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
-; UNROLL-NO-IC: %[[I1:.+]] = insertelement <4 x float*> undef, float* %[[G1]], i32 0
-; UNROLL-NO-IC: %[[I2:.+]] = insertelement <4 x float*> %[[I1]], float* %[[G1]], i32 1
-; UNROLL-NO-IC: %[[I3:.+]] = insertelement <4 x float*> %[[I2]], float* %[[G1]], i32 2
-; UNROLL-NO-IC: %[[I4]] = insertelement <4 x float*> %[[I3]], float* %[[G1]], i32 3
-; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %vector.recur, <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %[[I4]], <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x float*> %[[I4]], i32 3
-;
-; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC: %scalar.recur.init = phi float* [ %vector.recur.extract, %middle.block ], [ undef, %min.iters.checked ], [ undef, %entry ]
-;
-; UNROLL-NO-IC: scalar.body:
-; UNROLL-NO-IC: %scalar.recur = phi float* [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-define void @PR29559() {
-entry:
-  br label %scalar.body
-
-scalar.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
-  %tmp2 = phi float* [ undef, %entry ], [ %tmp3, %scalar.body ]
-  %tmp3 = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, undef
-  br i1 %cond, label %for.end, label %scalar.body
-
-for.end:
-  ret void
-}
diff --git a/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
new file mode 100644
index 000000000000..e8ef42562356
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @non_primary_iv_trunc_free(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @non_primary_iv_trunc_free(i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = trunc i64 %i to i32
+  %i.next = add nuw nsw i64 %i, 5
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
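
As an aside, a hypothetical C analogue of @non_primary_iv_trunc_free added above (the (int) cast models the i64-to-i32 trunc the test expects to be free on AArch64):

    /* Hypothetical sketch; the trunc result is otherwise unused in the IR. */
    void non_primary_iv_trunc_free(long n) {
      long i = 0;
      do {
        int t = (int)i; /* trunc i64 -> i32 */
        (void)t;
        i += 5;
      } while (i < n);
    }
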
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
new file mode 100644
index 000000000000..0ebb7a92edae
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; This test shows an extremely high interleaving cost that should probably be
+; fixed. Because the cost is so high, interleaving is not beneficial, and the
+; cost model chooses to scalarize the load instructions instead.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+  %tmp1 = load i8, i8* %tmp0, align 1
+  %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
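
A C-level sketch of @test above (hypothetical source; the two i8 loads read both fields of a %pair element, forming the factor-2 interleaved group whose high cost drives the vectorizer to scalarize):

    /* Hypothetical sketch; the loads have no users in the IR either. */
    struct pair { char x, y; };

    void test(struct pair *p, unsigned long n) {
      unsigned long i = 0;
      do {
        char a = p[i].x;
        char b = p[i].y;
        (void)a;
        (void)b;
        ++i;
      } while (i != n);
    }
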
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index df1f9c619408..54ee8fc6e73f 100644
--- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -1,81 +1,189 @@
-; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnueabi"
 
-@AB = common global [1024 x i8] zeroinitializer, align 4
-@CD = common global [1024 x i8] zeroinitializer, align 4
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_8-LABEL: Checking a loop in "i8_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+  %tmp2 = load i8, i8* %tmp0, align 1
+  %tmp3 = load i8, i8* %tmp1, align 1
+  store i8 0, i8* %tmp0, align 1
+  store i8 0, i8* %tmp1, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
 entry:
   br label %for.body
+
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+  %tmp2 = load i16, i16* %tmp0, align 2
+  %tmp3 = load i16, i16* %tmp1, align 2
+  store i16 0, i16* %tmp0, align 2
+  store i16 0, i16* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
-define void @test_byte_interleaved_cost(i8 %C, i8 %D) {
+
+for.end:
+  ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved
-; access group is 2.
-
-; CHECK: LV: Checking a loop in "test_byte_interleaved_cost"
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4
-
-for.body: ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv
-  %tmp = load i8, i8* %arrayidx0, align 4
-  %tmp1 = or i64 %indvars.iv, 1
-  %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1
-  %tmp2 = load i8, i8* %arrayidx1, align 4
-  %add = add nsw i8 %tmp, %C
-  %mul = mul nsw i8 %tmp2, %D
-  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv
-  store i8 %add, i8* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1
-  store i8 %mul, i8* %arrayidx3, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
-  %cmp = icmp slt i64 %indvars.iv.next, 1024
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end: ; preds = %for.body
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp0, align 4
+  store i32 0, i32* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
   ret void
 }
 
-%ig.factor.8 = type { double*, double, double, double, double, double, double, double }
-define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) {
+%i64.2 = type {i64, i64}
+define void @i64_factor_2(%i64.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; Check the default cost of a strided load with a factor that is greater than
-; the maximum allowed. In this test, the interleave factor would be 8, which is
-; not supported.
+; VF_2-LABEL: Checking a loop in "i64_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL: Checking a loop in "i64_factor_2"
+; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL: Checking a loop in "i64_factor_2"
+; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
+  %tmp2 = load i64, i64* %tmp0, align 8
+  %tmp3 = load i64, i64* %tmp1, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
 
-; CHECK: LV: Checking a loop in "wide_interleaved_group"
-; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %1 = load double, double* %0, align 8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %5 = load double, double* %4, align 8
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store double %9, double* %10, align 8
+%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64}
+define void @i64_factor_8(%i64.8* %data, i64 %n) {
+entry:
+  br label %for.body
+; The interleave factor in this test is 8, which is greater than the maximum
+; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI
+; implementation for determining the cost of the interleaved load group. The
+; stores do not form a legal interleaved group because the group would contain
+; gaps.
+;
+; VF_2-LABEL: Checking a loop in "i64_factor_8"
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ]
-  %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2
-  %1 = load double, double* %0, align 8
-  %2 = fcmp fast olt double %1, %a
-  %3 = select i1 %2, double 0.000000e+00, double %1
-  %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6
-  %5 = load double, double* %4, align 8
-  %6 = fcmp fast olt double %5, %a
-  %7 = select i1 %6, double 0.000000e+00, double %5
-  %8 = fmul fast double %7, %b
-  %9 = fadd fast double %8, %3
-  %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3
-  store double %9, double* %10, align 8
-  %11 = fmul fast double %9, %9
-  %12 = fadd fast double %11, %r
+  %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
+  %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6
+  %tmp2 = load i64, i64* %tmp0, align 8
+  %tmp3 = load i64, i64* %tmp1, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
   %i.next = add nuw nsw i64 %i, 1
-  %13 = trunc i64 %i.next to i32
-  %cond = icmp eq i32 %13, %n
-  br i1 %cond, label %for.exit, label %for.body
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
 
-for.exit:
-  %r.lcssa = phi double [ %12, %for.body ]
-  ret double %r.lcssa
+for.end:
+  ret void
 }
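
For orientation, a hypothetical C analogue of @i64_factor_8 above: only fields 2 and 6 of the eight-field struct are accessed, so the loads form a factor-8 strided group (beyond AArch64's maximum of 4) and the stores leave gaps in the group:

    /* Hypothetical sketch mirroring the bottom-tested IR loop. */
    struct i64x8 { long f0, f1, f2, f3, f4, f5, f6, f7; };

    void i64_factor_8(struct i64x8 *data, long n) {
      long i = 0;
      do {
        long a = data[i].f2;
        long b = data[i].f6;
        data[i].f2 = 0;
        data[i].f6 = 0;
        (void)a;
        (void)b;
        i += 1;
      } while (i < n);
    }
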
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index c7ced757581a..d06e3fdba39c 100644
--- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -234,12 +234,27 @@ for.body: ; preds = %entry, %for.body
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-; CHECK-LABEL: @add_phifail2(
-; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i32>
-; CHECK: store <16 x i8>
 ; Function Attrs: nounwind
+; When we vectorize this loop, we generate correct code even when %len
+; exactly divides VF: we extract the recurrence from the second-to-last index
+; and pass it to the for.cond.cleanup block, so the vectorized loop returns
+; the correct value, a_phi = p[len - 2].
 define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+; CHECK-LABEL: @add_phifail2(
+; CHECK: vector.body:
+; CHECK: %wide.load = load <16 x i8>, <16 x i8>*
+; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
+; CHECK: add nuw nsw <16 x i32>
+; CHECK: store <16 x i8>
+; CHECK: add i64 %index, 16
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
+; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
+; CHECK: for.cond.cleanup:
+; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
+; CHECK: %ret = trunc i32 %a_phi.lcssa to i8
+; CHECK: ret i8 %ret
 entry:
   br label %for.body
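
A hypothetical C rendering of @add_phifail2 (the function body lies outside this hunk, so this is reconstructed from the surrounding checks): a_phi trails the load by one iteration, which is why the function ends up returning p[len - 2]:

    /* Hypothetical sketch; not the test's actual source. */
    unsigned char add_phifail2(unsigned char *p, unsigned char *q, int len) {
      int a_phi = 0, conv = 0;
      for (int i = 0; i < len; ++i) {
        a_phi = conv;  /* value loaded on the previous iteration */
        conv = p[i];
        q[i] = (unsigned char)(conv + 1);
      }
      return (unsigned char)a_phi; /* p[len - 2] whenever len >= 2 */
    }
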
diff --git a/test/Transforms/LoopVectorize/AArch64/pr31900.ll b/test/Transforms/LoopVectorize/AArch64/pr31900.ll
new file mode 100644
index 000000000000..5ea38a4a246d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/pr31900.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s
+
+; Reproducer for address space fault in the LoopVectorizer (pr31900). Added
+; different sized address space pointers (p:16:16-p4:32:16) to the aarch64
+; datalayout to reproduce the fault.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16"
+
+; Check that all the loads are scalarized
+; CHECK: load i16, i16*
+; CHECK: load i16, i16*
+; CHECK: load i16, i16 addrspace(4)*
+; CHECK: load i16, i16 addrspace(4)*
+
+%rec1445 = type { i16, i16, i16, i16, i16 }
+
+define void @foo() {
+bb1:
+  br label %bb4
+
+bb4:
+  %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ]
+  %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ]
+  %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ]
+  %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1
+  %_tmp987 = load i16, i16* %0, align 1
+  %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1
+  %_tmp993 = load i16, i16 addrspace(4)* %1, align 1
+  %_tmp1013 = add i16 %tmp1, 1
+  %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1
+  %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1
+  %_tmp1019 = icmp ult i16 %_tmp1013, 24
+  br i1 %_tmp1019, label %bb4, label %bb16
+
+bb16:
+  unreachable
+}
diff --git a/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
new file mode 100644
index 000000000000..1ae7dadeffd7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: Checking a loop in "interleaved_access"
+; CHECK: The Smallest and Widest types: 64 / 64 bits
+;
+define void @interleaved_access(i8** %A, i64 %N) {
+for.ph:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ]
+  %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i
+  store i8* null, i8** %tmp0, align 8
+  %i.next.0 = add nuw nsw i64 %i, 1
+  %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0
+  store i8* null, i8** %tmp1, align 8
+  %i.next.1 = add nsw i64 %i, 2
+  %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1
+  store i8* null, i8** %tmp2, align 8
+  %i.next.2 = add nsw i64 %i, 3
+  %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2
+  store i8* null, i8** %tmp3, align 8
+  %i.next.3 = add nsw i64 %i, 4
+  %cond = icmp slt i64 %i.next.3, %N
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
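
Finally, a hypothetical C analogue of @interleaved_access above; every memory access is a 64-bit pointer store, which is why the vectorizer reports 64 bits for both the smallest and widest types:

    /* Hypothetical sketch of the four consecutive pointer stores. */
    void interleaved_access(char **A, long N) {
      long i = 0;
      do {
        A[i] = 0;
        A[i + 1] = 0;
        A[i + 2] = 0;
        A[i + 3] = 0;
        i += 4;
      } while (i < N);
    }
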