author:    Dimitry Andric <dim@FreeBSD.org>  2017-04-16 16:01:22 +0000
committer: Dimitry Andric <dim@FreeBSD.org>  2017-04-16 16:01:22 +0000
commit:    71d5a2540a98c81f5bcaeb48805e0e2881f530ef (patch)
tree:      5343938942df402b49ec7300a1c25a2d4ccd5821 /test/Transforms/LoopVectorize
parent:    31bbf64f3a4974a2d6c8b3b27ad2f519caf74057 (diff)

Diffstat (limited to 'test/Transforms/LoopVectorize'): 43 files changed, 2203 insertions, 679 deletions
diff --git a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 21b59f87d042..37a6d4e79984 100644
--- a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,40 +1,55 @@
-; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
-; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s

 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"

-; CHECK-LABEL: predicated_udiv_scalarized_operand
-;
 ; This test checks that we correctly compute the scalarized operands for a
 ; user-specified vectorization factor when interleaving is disabled. We use the
-; "optsize" attribute to disable all interleaving calculations.
+; "optsize" attribute to disable all interleaving calculations. A cost of 4
+; for %tmp4 indicates that we would scalarize its operand (%tmp3), giving
+; %tmp4 a lower scalarization overhead.
 ;
-; CHECK: vector.body:
-; CHECK: %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
-; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
-; CHECK: [[IF0]]:
-; CHECK: %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK: %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK: %[[T02:.+]] = add nsw i64 %[[T01]], %x
-; CHECK: %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
-; CHECK: %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
-; CHECK: br label %[[CONT0]]
-; CHECK: [[CONT0]]:
-; CHECK: %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
-; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
-; CHECK: [[IF1]]:
-; CHECK: %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK: %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK: %[[T08:.+]] = add nsw i64 %[[T07]], %x
-; CHECK: %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
-; CHECK: %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
-; CHECK: br label %[[CONT1]]
-; CHECK: [[CONT1]]:
-; CHECK: phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+; COST-LABEL: predicated_udiv_scalarized_operand
+; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+;
+; CHECK-LABEL: @predicated_udiv_scalarized_operand(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
+; CHECK: [[PRED_UDIV_IF]]:
+; CHECK-NEXT: [[TMP4:%.*]] =
extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0 +; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]] +; CHECK: [[PRED_UDIV_CONTINUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]] +; CHECK: [[PRED_UDIV_IF1]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x +; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]] +; CHECK: [[PRED_UDIV_CONTINUE2]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize { entry: br label %for.body @@ -43,7 +58,8 @@ for.body: %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ] %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i %tmp2 = load i64, i64* %tmp0, align 4 - br i1 %c, label %if.then, label %for.inc + %cond0 = icmp sgt i64 %tmp2, 0 + br i1 %cond0, label %if.then, label %for.inc if.then: %tmp3 = add nsw i64 %tmp2, %x @@ -54,8 +70,8 @@ for.inc: %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then] %tmp6 = add i64 %r, %tmp5 %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, 100 - br i1 %cond, label %for.body, label %for.end + %cond1 = icmp slt i64 %i.next, 100 + br i1 %cond1, label %for.body, label %for.end for.end: %tmp7 = phi i64 [ %tmp6, %for.inc ] diff --git a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll deleted file mode 100644 index fc68adb59df3..000000000000 --- a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll +++ /dev/null @@ -1,341 +0,0 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC - -target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" - -; CHECK-LABEL: @recurrence_1 -; -; void recurrence_1(int *a, int *b, int n) { -; for(int i = 0; i < n; i++) -; b[i] = a[i] + a[i - 1] -; } -; -; CHECK: vector.ph: -; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3 -; -; CHECK: vector.body: -; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: [[L1]] = load <4 x i32> -; CHECK: {{.*}} = shufflevector <4 x i32> 
%vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; CHECK: middle.block: -; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 -; -; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] -; -; CHECK: scalar.body: -; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] -; -; UNROLL: vector.body: -; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] -; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> -; UNROLL: [[L2]] = load <4 x i32> -; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; UNROLL: middle.block: -; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 -; -define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) { -entry: - br label %for.preheader - -for.preheader: - %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0 - %pre_load = load i32, i32* %arrayidx.phi.trans.insert - br label %scalar.body - -scalar.body: - %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] - %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next - %1 = load i32, i32* %arrayidx32 - %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv - %add35 = add i32 %1, %0 - store i32 %add35, i32* %arrayidx34 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %for.exit, label %scalar.body - -for.exit: - ret void -} - -; CHECK-LABEL: @recurrence_2 -; -; int recurrence_2(int *a, int n) { -; int minmax; -; for (int i = 0; i < n; ++i) -; minmax = min(minmax, max(a[i] - a[i-1], 0)); -; return minmax; -; } -; -; CHECK: vector.ph: -; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 -; -; CHECK: vector.body: -; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: [[L1]] = load <4 x i32> -; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; CHECK: middle.block: -; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 -; -; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] -; -; CHECK: scalar.body: -; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] -; -; UNROLL: vector.body: -; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] -; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> -; UNROLL: [[L2]] = load <4 x i32> -; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; UNROLL: middle.block: -; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 -; -define i32 @recurrence_2(i32* nocapture readonly 
%a, i32 %n) { -entry: - %cmp27 = icmp sgt i32 %n, 0 - br i1 %cmp27, label %for.preheader, label %for.cond.cleanup - -for.preheader: - %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1 - %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 - br label %scalar.body - -for.cond.cleanup.loopexit: - %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ] - br label %for.cond.cleanup - -for.cond.cleanup: - %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %minmax.0.lcssa - -scalar.body: - %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ] - %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] - %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ] - %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - %1 = load i32, i32* %arrayidx, align 4 - %sub3 = sub nsw i32 %1, %0 - %cmp4 = icmp sgt i32 %sub3, 0 - %cond = select i1 %cmp4, i32 %sub3, i32 0 - %cmp5 = icmp slt i32 %minmax.028, %cond - %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body -} - -; CHECK-LABEL: @recurrence_3 -; -; void recurrence_3(short *a, double *b, int n, float f, short p) { -; b[0] = (double)a[0] - f * (double)p; -; for (int i = 1; i < n; i++) -; b[i] = (double)a[i] - f * (double)a[i - 1]; -; } -; -; -; CHECK: vector.ph: -; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3 -; -; CHECK: vector.body: -; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: [[L1]] = load <4 x i16> -; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; CHECK: middle.block: -; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 -; -; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] -; -; CHECK: scalar.body: -; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] -; -; UNROLL: vector.body: -; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] -; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16> -; UNROLL: [[L2]] = load <4 x i16> -; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; UNROLL: middle.block: -; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3 -; -define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) { -entry: - %0 = load i16, i16* %a, align 2 - %conv = sitofp i16 %0 to double - %conv1 = fpext float %f to double - %conv2 = sitofp i16 %p to double - %mul = fmul fast double %conv2, %conv1 - %sub = fsub fast double %conv, %mul - store double %sub, double* %b, align 8 - %cmp25 = icmp sgt i32 %n, 1 - br i1 %cmp25, label %for.preheader, label %for.end - -for.preheader: - br label %scalar.body - -scalar.body: - %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ] - %advars.iv = phi i64 [ %advars.iv.next, 
%scalar.body ], [ 1, %for.preheader ] - %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv - %2 = load i16, i16* %arrayidx5, align 2 - %conv6 = sitofp i16 %2 to double - %conv11 = sitofp i16 %1 to double - %mul12 = fmul fast double %conv11, %conv1 - %sub13 = fsub fast double %conv6, %mul12 - %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv - store double %sub13, double* %arrayidx15, align 8 - %advars.iv.next = add nuw nsw i64 %advars.iv, 1 - %lftr.wideiv = trunc i64 %advars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %for.end.loopexit, label %scalar.body - -for.end.loopexit: - br label %for.end - -for.end: - ret void -} - -; CHECK-LABEL: @PR26734 -; -; void PR26734(short *a, int *b, int *c, int d, short *e) { -; for (; d != 21; d++) { -; *b &= *c; -; *e = *a - 6; -; *c = *e; -; } -; } -; -; CHECK-NOT: vector.ph: -; -define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) { -entry: - %cmp4 = icmp eq i32 %d, 21 - br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph - -entry.for.end_crit_edge: - %.pre = load i32, i32* %b, align 4 - br label %for.end - -for.body.lr.ph: - %0 = load i16, i16* %a, align 2 - %sub = add i16 %0, -6 - %conv2 = sext i16 %sub to i32 - %c.promoted = load i32, i32* %c, align 4 - %b.promoted = load i32, i32* %b, align 4 - br label %for.body - -for.body: - %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ] - %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ] - %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ] - %and = and i32 %and6, %conv25 - %inc = add nsw i32 %inc7, 1 - %cmp = icmp eq i32 %inc, 21 - br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body - -for.cond.for.end_crit_edge: - %and.lcssa = phi i32 [ %and, %for.body ] - store i32 %conv2, i32* %c, align 4 - store i32 %and.lcssa, i32* %b, align 4 - store i16 %sub, i16* %e, align 2 - br label %for.end - -for.end: - ret void -} - -; CHECK-LABEL: @PR27246 -; -; int PR27246() { -; unsigned int e, n; -; for (int i = 1; i < 49; ++i) { -; for (int k = i; k > 1; --k) -; e = k; -; n = e; -; } -; return n; -; } -; -; CHECK-NOT: vector.ph: -; -define i32 @PR27246() { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ] - %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ] - br label %for.cond1 - -for.cond.cleanup: - %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ] - ret i32 %e.1.lcssa.lcssa - -for.cond1: - %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ] - %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ] - %cmp2 = icmp sgt i32 %k.0, 1 - %dec = add nsw i32 %k.0, -1 - br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3 - -for.cond.cleanup3: - %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ] - %inc = add nuw nsw i32 %i.016, 1 - %exitcond = icmp eq i32 %inc, 49 - br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader -} - -; CHECK-LABEL: @PR29559 -; -; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC: br label %vector.body -; -; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; UNROLL-NO-IC: %vector.recur = phi <4 x float*> [ undef, %vector.ph ], [ %[[I4:.+]], %vector.body ] -; UNROLL-NO-IC: %[[G1:.+]] = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0 -; UNROLL-NO-IC: %[[I1:.+]] = insertelement <4 x float*> undef, float* %[[G1]], i32 0 -; 
UNROLL-NO-IC: %[[I2:.+]] = insertelement <4 x float*> %[[I1]], float* %[[G1]], i32 1 -; UNROLL-NO-IC: %[[I3:.+]] = insertelement <4 x float*> %[[I2]], float* %[[G1]], i32 2 -; UNROLL-NO-IC: %[[I4]] = insertelement <4 x float*> %[[I3]], float* %[[G1]], i32 3 -; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %vector.recur, <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; UNROLL-NO-IC: {{.*}} = shufflevector <4 x float*> %[[I4]], <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; -; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x float*> %[[I4]], i32 3 -; -; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC: %scalar.recur.init = phi float* [ %vector.recur.extract, %middle.block ], [ undef, %min.iters.checked ], [ undef, %entry ] -; -; UNROLL-NO-IC: scalar.body: -; UNROLL-NO-IC: %scalar.recur = phi float* [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] -; -define void @PR29559() { -entry: - br label %scalar.body - -scalar.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ] - %tmp2 = phi float* [ undef, %entry ], [ %tmp3, %scalar.body ] - %tmp3 = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0 - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp eq i64 %i.next, undef - br i1 %cond, label %for.end, label %scalar.body - -for.end: - ret void -} diff --git a/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll new file mode 100644 index 000000000000..e8ef42562356 --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @non_primary_iv_trunc_free( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5 +; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @non_primary_iv_trunc_free(i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = trunc i64 %i to i32 + %i.next = add nuw nsw i64 %i, 5 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll new file mode 100644 index 000000000000..0ebb7a92edae --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -0,0 +1,38 @@ +; REQUIRES: asserts +; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s + +; This test shows extremely high interleaving cost that, probably, should be fixed. +; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize +; the load instructions. 
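+;
+; As a rough C sketch of the loop below (hypothetical, for illustration
+; only; the test checks the IR and debug output that follow, and the
+; loaded values are deliberately unused):
+;
+;   struct pair { char x, y; };
+;   void test(struct pair *p, long n) {
+;     for (long i = 0; i < n; i++) {
+;       char a = p[i].x; // lane 0 of the stride-2 load group
+;       char b = p[i].y; // lane 1; its cost folds into the group leader
+;     }
+;   }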
+ +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%pair = type { i8, i8 } + +; CHECK-LABEL: test +; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 +; CHECK: vector.body +; CHECK: load i8 +; CHECK: load i8 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @test(%pair* %p, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0 + %tmp1 = load i8, i8* %tmp0, align 1 + %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1 + %tmp3 = load i8, i8* %tmp2, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index df1f9c619408..54ee8fc6e73f 100644 --- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -1,81 +1,189 @@ -; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" -@AB = common global [1024 x i8] zeroinitializer, align 4 -@CD = common global [1024 x i8] zeroinitializer, align 4 +%i8.2 = type {i8, i8} +define void @i8_factor_2(%i8.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_8-LABEL: Checking a loop in "i8_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_16-LABEL: Checking a loop in "i8_factor_2" +; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 + %tmp2 = load i8, i8* %tmp0, align 1 + %tmp3 = load i8, i8* %tmp1, align 1 + store i8 0, i8* %tmp0, align 1 + store i8 0, i8* %tmp1, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 
%cond, label %for.body, label %for.end + +for.end: + ret void +} + +%i16.2 = type {i16, i16} +define void @i16_factor_2(%i16.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_4-LABEL: Checking a loop in "i16_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1 + %tmp2 = load i16, i16* %tmp0, align 2 + %tmp3 = load i16, i16* %tmp1, align 2 + store i16 0, i16* %tmp0, align 2 + store i16 0, i16* %tmp1, align 2 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end -define void @test_byte_interleaved_cost(i8 %C, i8 %D) { +for.end: + ret void +} + +%i32.2 = type {i32, i32} +define void @i32_factor_2(%i32.2* %data, i64 %n) { entry: br label %for.body -; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved -; access group is 2. 
- -; CHECK: LV: Checking a loop in "test_byte_interleaved_cost" -; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 -; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv - %tmp = load i8, i8* %arrayidx0, align 4 - %tmp1 = or i64 %indvars.iv, 1 - %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1 - %tmp2 = load i8, i8* %arrayidx1, align 4 - %add = add nsw i8 %tmp, %C - %mul = mul nsw i8 %tmp2, %D - %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv - store i8 %add, i8* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1 - store i8 %mul, i8* %arrayidx3, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp slt i64 %indvars.iv.next, 1024 - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1 + %tmp2 = load i32, i32* %tmp0, align 4 + %tmp3 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp0, align 4 + store i32 0, i32* %tmp1, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ret void } -%ig.factor.8 = type 
{ double*, double, double, double, double, double, double, double } -define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) { +%i64.2 = type {i64, i64} +define void @i64_factor_2(%i64.2* %data, i64 %n) { entry: br label %for.body -; Check the default cost of a strided load with a factor that is greater than -; the maximum allowed. In this test, the interleave factor would be 8, which is -; not supported. +; VF_2-LABEL: Checking a loop in "i64_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_4-LABEL: Checking a loop in "i64_factor_2" +; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_8-LABEL: Checking a loop in "i64_factor_2" +; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_16-LABEL: Checking a loop in "i64_factor_2" +; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1 + %tmp2 = load i64, i64* %tmp0, align 8 + %tmp3 = load i64, i64* %tmp1, align 8 + store i64 0, i64* %tmp0, align 8 + store i64 0, i64* %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} -; CHECK: LV: Checking a loop in "wide_interleaved_group" -; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %1 = load double, double* %0, align 8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %5 = load double, double* %4, align 8 -; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store double %9, double* %10, align 8 +%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64} +define void @i64_factor_8(%i64.8* %data, i64 %n) { +entry: + br label %for.body +; The interleave factor in this test is 8, which is greater than the maximum +; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI +; implementation for determining the cost of the interleaved load group. 
The +; stores do not form a legal interleaved group because the group would contain +; gaps. +; +; VF_2-LABEL: Checking a loop in "i64_factor_8" +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ] - %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2 - %1 = load double, double* %0, align 8 - %2 = fcmp fast olt double %1, %a - %3 = select i1 %2, double 0.000000e+00, double %1 - %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6 - %5 = load double, double* %4, align 8 - %6 = fcmp fast olt double %5, %a - %7 = select i1 %6, double 0.000000e+00, double %5 - %8 = fmul fast double %7, %b - %9 = fadd fast double %8, %3 - %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3 - store double %9, double* %10, align 8 - %11 = fmul fast double %9, %9 - %12 = fadd fast double %11, %r + %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2 + %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6 + %tmp2 = load i64, i64* %tmp0, align 8 + %tmp3 = load i64, i64* %tmp1, align 8 + store i64 0, i64* %tmp0, align 8 + store i64 0, i64* %tmp1, align 8 %i.next = add nuw nsw i64 %i, 1 - %13 = trunc i64 %i.next to i32 - %cond = icmp eq i32 %13, %n - br i1 %cond, label %for.exit, label %for.body + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end -for.exit: - %r.lcssa = phi double [ %12, %for.body ] - ret double %r.lcssa +for.end: + ret void } diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index c7ced757581a..d06e3fdba39c 100644 --- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -234,12 +234,27 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; CHECK-LABEL: @add_phifail2( -; CHECK: load <16 x i8>, <16 x i8>* -; CHECK: add nuw nsw <16 x i32> -; CHECK: store <16 x i8> ; Function Attrs: nounwind +; When we vectorize this loop, we generate correct code +; even when %len exactly divides VF (since we extract from the second last index +; and pass this to the for.cond.cleanup block). 
Vectorized loop returns +; the correct value a_phi = p[len -2] define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +; CHECK-LABEL: @add_phifail2( +; CHECK: vector.body: +; CHECK: %wide.load = load <16 x i8>, <16 x i8>* +; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32> +; CHECK: add nuw nsw <16 x i32> +; CHECK: store <16 x i8> +; CHECK: add i64 %index, 16 +; CHECK: icmp eq i64 %index.next, %n.vec +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15 +; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14 +; CHECK: for.cond.cleanup: +; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] +; CHECK: %ret = trunc i32 %a_phi.lcssa to i8 +; CHECK: ret i8 %ret entry: br label %for.body diff --git a/test/Transforms/LoopVectorize/AArch64/pr31900.ll b/test/Transforms/LoopVectorize/AArch64/pr31900.ll new file mode 100644 index 000000000000..5ea38a4a246d --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/pr31900.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s + +; Reproducer for address space fault in the LoopVectorizer (pr31900). Added +; different sized address space pointers (p:16:16-p4:32:16) to the aarch64 +; datalayout to reproduce the fault. + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16" + +; Check that all the loads are scalarized +; CHECK: load i16, i16* +; CHECK: load i16, i16* +; CHECK: load i16, i16 addrspace(4)* +; CHECK: load i16, i16 addrspace(4)* + +%rec1445 = type { i16, i16, i16, i16, i16 } + +define void @foo() { +bb1: + br label %bb4 + +bb4: + %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ] + %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ] + %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ] + %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1 + %_tmp987 = load i16, i16* %0, align 1 + %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1 + %_tmp993 = load i16, i16 addrspace(4)* %1, align 1 + %_tmp1013 = add i16 %tmp1, 1 + %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1 + %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1 + %_tmp1019 = icmp ult i16 %_tmp1013, 24 + br i1 %_tmp1019, label %bb4, label %bb16 + +bb16: + unreachable +} diff --git a/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll new file mode 100644 index 000000000000..1ae7dadeffd7 --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll @@ -0,0 +1,33 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: Checking a loop in "interleaved_access" +; CHECK: The Smallest and Widest types: 64 / 64 bits +; +define void @interleaved_access(i8** %A, i64 %N) { +for.ph: + br label %for.body + +for.body: + %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ] + %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i + store i8* null, i8** %tmp0, align 8 + %i.next.0 = add nuw nsw i64 %i, 1 + %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0 + store i8* null, i8** %tmp1, align 8 + %i.next.1 = add nsw 
i64 %i, 2 + %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1 + store i8* null, i8** %tmp2, align 8 + %i.next.2 = add nsw i64 %i, 3 + %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2 + store i8* null, i8** %tmp3, align 8 + %i.next.3 = add nsw i64 %i, 4 + %cond = icmp slt i64 %i.next.3, %N + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg b/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..2a665f06be72 --- /dev/null +++ b/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll b/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll new file mode 100644 index 000000000000..f303ed5377e2 --- /dev/null +++ b/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll @@ -0,0 +1,28 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -loop-vectorize < %s | FileCheck %s + + +; For AMDGPU, loop unroll in loop vectorizer is disabled when VF==1. +; +; CHECK-LABEL: @small_loop( +; CHECK: store i32 +; CHECK-NOT: store i32 +; CHECK: ret +define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind { +entry: + %0 = icmp sgt i32 %size, 0 + br i1 %0, label %loop, label %exit + +loop: ; preds = %entry, %loop + %iv = phi i32 [ %iv1, %loop ], [ 0, %entry ] + %1 = getelementptr inbounds i32, i32* %inArray, i32 %iv + %2 = load i32, i32* %1, align 4 + %3 = add nsw i32 %2, 6 + store i32 %3, i32* %1, align 4 + %iv1 = add i32 %iv, 1 +; %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %cond = icmp eq i32 %iv1, %size + br i1 %cond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void +} diff --git a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll index de3626b57d83..29adec049f67 100644 --- a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -1,39 +1,147 @@ -; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "armv8--linux-gnueabihf" -@AB = common global [1024 x i8] zeroinitializer, align 4 -@CD = common global [1024 x i8] zeroinitializer, align 4 +%i8.2 = type {i8, i8} +define void @i8_factor_2(%i8.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_8-LABEL: Checking a loop in "i8_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For 
instruction: store i8 0, i8* %tmp1, align 1 +; VF_16-LABEL: Checking a loop in "i8_factor_2" +; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 + %tmp2 = load i8, i8* %tmp0, align 1 + %tmp3 = load i8, i8* %tmp1, align 1 + store i8 0, i8* %tmp0, align 1 + store i8 0, i8* %tmp1, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} -define void @test_byte_interleaved_cost(i8 %C, i8 %D) { +%i16.2 = type {i16, i16} +define void @i16_factor_2(%i16.2* %data, i64 %n) { entry: br label %for.body -; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved -; access group is 2. - -; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 -; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv - %tmp = load i8, i8* %arrayidx0, align 4 - %tmp1 = or i64 %indvars.iv, 1 - %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1 - %tmp2 = load i8, i8* %arrayidx1, align 4 - %add = add nsw i8 %tmp, %C - %mul = mul nsw i8 %tmp2, %D - %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv - store i8 %add, i8* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1 - store i8 %mul, i8* %arrayidx3, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp slt i64 %indvars.iv.next, 1024 - br i1 %cmp, label %for.body, label %for.end - -for.end: ; preds = %for.body +; VF_4-LABEL: Checking a loop in "i16_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, 
align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1 + %tmp2 = load i16, i16* %tmp0, align 2 + %tmp3 = load i16, i16* %tmp1, align 2 + store i16 0, i16* %tmp0, align 2 + store i16 0, i16* %tmp1, align 2 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +%i32.2 = type {i32, i32} +define void @i32_factor_2(%i32.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1 + %tmp2 = load i32, i32* %tmp0, align 4 + %tmp3 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp0, align 4 + store i32 0, i32* %tmp1, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +%half.2 = type {half, half} +define void @half_factor_2(%half.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_4-LABEL: Checking a loop in "half_factor_2" +; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 
2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "half_factor_2" +; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1 + %tmp2 = load half, half* %tmp0, align 2 + %tmp3 = load half, half* %tmp1, align 2 + store half 0., half* %tmp0, align 2 + store half 0., half* %tmp1, align 2 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ret void } diff --git a/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll b/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll new file mode 100644 index 000000000000..d2e594520332 --- /dev/null +++ b/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll @@ -0,0 +1,38 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: -force-vector-width=2 -debug-only=loop-vectorize \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s + +; Check costs for branches inside a vectorized loop around predicated +; blocks. Each such branch will be guarded with an extractelement from the +; vector compare plus a test under mask instruction. This cost is modelled on +; the extractelement of i1. 
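+;
+; A rough C equivalent of the loop below (a hypothetical sketch, not part
+; of the test itself):
+;
+;   void fun(int *arr, long trip_count) {
+;     for (long i = 0; i < trip_count; i++)
+;       if (arr[i] > 0)      // this branch is costed as an extractelement
+;         arr[i] = -arr[i];  // of i1 plus a test-under-mask instruction
+;   }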
+
+define void @fun(i32* %arr, i64 %trip.count) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
+  %l = load i32, i32* %arrayidx, align 4
+  %cmp55 = icmp sgt i32 %l, 0
+  br i1 %cmp55, label %if.then, label %for.inc
+
+if.then:
+  %sub = sub nsw i32 0, %l
+  store i32 %sub, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %trip.count
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 5 for VF 2 For instruction: br i1 %cmp55, label %if.then, label %for.inc
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br label %for.inc
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond, label %for.end.loopexit, label %for.body
+}
diff --git a/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg b/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg
new file mode 100644
index 000000000000..2f3cf7d3f043
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'SystemZ' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
new file mode 100644
index 000000000000..e7096c29b994
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
+; RUN:   FileCheck %s
+;
+; Check that a scalarized load/store does not get a cost for inserts/
+; extracts, since z13 supports element load/store.
+
+define void @fun(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = add i32 %tmp1, 1
+  store i32 %tmp2, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+
+; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4
+}
+
diff --git a/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
new file mode 100644
index 000000000000..5c15ee4f2d9f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
@@ -0,0 +1,70 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that the loop vectorizer performs memory interleaving with accurate
+; cost estimations.
+
+
+; Simple case where just the load is interleaved, because the store group
+; would have gaps.
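+;
+; Roughly, in C (hypothetical sketch): only every other i32 is touched, so
+; the loads can be combined into an interleave group, while a store group
+; would leave gaps at the odd indices:
+;
+;   void fun0(int *data, long n) {
+;     for (long i = 0; i < n; i += 2)
+;       data[i] = data[i] + 1;
+;   }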
+define void @fun0(i32* %data, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + %tmp2 = add i32 %tmp1, 1 + store i32 %tmp2, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void + +; CHECK: LV: Creating an interleave group with: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; (vl; vl; vperm) +} + +; Interleaving of both load and stores. +define void @fun1(i32* %data, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + %i_1 = add i64 %i, 1 + %tmp2 = getelementptr inbounds i32, i32* %data, i64 %i_1 + %tmp3 = load i32, i32* %tmp2, align 4 + store i32 %tmp1, i32* %tmp2, align 4 + store i32 %tmp3, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void + +; CHECK: LV: Creating an interleave group with: store i32 %tmp3, i32* %tmp0, align 4 +; CHECK: LV: Inserted: store i32 %tmp1, i32* %tmp2, align 4 +; CHECK: into the interleave group with store i32 %tmp3, i32* %tmp0, align 4 +; CHECK: LV: Creating an interleave group with: %tmp3 = load i32, i32* %tmp2, align 4 +; CHECK: LV: Inserted: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: into the interleave group with %tmp3 = load i32, i32* %tmp2, align 4 + +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp2, align 4 +; (vl; vl; vperm, vpkg) + +; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp1, i32* %tmp2, align 4 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp3, i32* %tmp0, align 4 +; (vmrlf; vmrhf; vst; vst) +} + diff --git a/test/Transforms/LoopVectorize/X86/avx512.ll b/test/Transforms/LoopVectorize/X86/avx512.ll index fb01454c253b..1eb1cd3f5d7a 100644 --- a/test/Transforms/LoopVectorize/X86/avx512.ll +++ b/test/Transforms/LoopVectorize/X86/avx512.ll @@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.9.0" ; loop. ; CHECK-LABEL: f: -; CHECK: vmovups %zmm{{.}}, +; CHECK: vmovdqu32 %zmm{{.}}, ; CHECK-NOT: %ymm define void @f(i32* %a, i32 %n) { diff --git a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index 32bfcd2275ac..82f2e064a581 100644 --- a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -13,22 +13,33 @@ target triple = "x86_64-unknown-linux-gnu" ; scatter operation. %tmp3 (and the induction variable) should not be marked ; uniform-after-vectorization. 
; -; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i -; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i -; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] -; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 -; CHECK: vector.body: -; CHECK: %vec.ind = phi <16 x i64> -; CHECK: %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0 -; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]] -; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>* -; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4 -; CHECK: %[[T3:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %[[T0]] -; CHECK: %[[T4:.+]] = bitcast float* %[[T3]] to <80 x float>* -; CHECK: load <80 x float>, <80 x float>* %[[T4]], align 4 -; CHECK: %VectorGep = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> %vec.ind -; CHECK: call void @llvm.masked.scatter.v16f32({{.*}}, <16 x float*> %VectorGep, {{.*}}) -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i +; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i +; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>* +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]] +; CHECK-NEXT: [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75> +; CHECK-NEXT: [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], 
[[TMP2]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 %data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
diff --git a/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
new file mode 100644
index 000000000000..76b6cae5c3b4
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the "gather" operation is chosen, since its cost is
+; better than that of the interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i<N; i+=8) {
+;    B[i] = A[i] + 5;
+;  }
+;}
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+
+; CHECK-LABEL: strided_load_i64
+; CHECK: masked.gather
+define void @strided_load_i64() {
+  br label %1
+
+; <label>:1: ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+; <label>:7: ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
index fbea275cb40f..4d7c0b6f64b8 100644
--- a/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
+++ b/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
@@ -71,6 +71,6 @@ declare i32 @printf(i8*, ...)
#1 ; Function Attrs: nounwind declare i32 @puts(i8* nocapture readonly) #2 -attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pcommit,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pcommit,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/test/Transforms/LoopVectorize/X86/interleaving.ll b/test/Transforms/LoopVectorize/X86/interleaving.ll index de5db5324381..9294c92b5759 100644 --- a/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=NORMAL ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM ; NORMAL-LABEL: foo diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll index 74c0c16086fe..e1793bcc3218 100644 --- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ 
b/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -1,13 +1,14 @@ ; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1 ; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2 -; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3 +; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-threshold=150 -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3 +; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DEFAULT ; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os ; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz ; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC ; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC ; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2 ; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2 -; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS +; RUN: opt < %s -mcpu=corei7 -O3 -unroll-threshold=150 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS ; This file tests the llvm.loop.vectorize.enable metadata forcing ; vectorization even when optimization levels are too low, or when @@ -25,6 +26,9 @@ target triple = "x86_64-unknown-linux-gnu" ; O3-LABEL: @enabled( ; O3: store <4 x i32> ; O3: ret i32 +; O3DEFAULT-LABEL: @enabled( +; O3DEFAULT: store <4 x i32> +; O3DEFAULT: ret i32 ; Pragma always wins! 
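; The pragma reaches the vectorizer as loop metadata attached to the latch
; branch; a minimal sketch of that encoding (metadata numbering illustrative):
;
;   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
;   !0 = distinct !{!0, !1}
;   !1 = !{!"llvm.loop.vectorize.enable", i1 true}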
; O3DIS-LABEL: @enabled( ; O3DIS: store <4 x i32> @@ -77,6 +81,9 @@ for.end: ; preds = %for.body ; O3-LABEL: @nopragma( ; O3: store <4 x i32> ; O3: ret i32 +; O3DEFAULT-LABEL: @nopragma( +; O3DEFAULT: store <4 x i32> +; O3DEFAULT: ret i32 ; O3DIS-LABEL: @nopragma( ; O3DIS-NOT: store <4 x i32> ; O3DIS: ret i32 @@ -128,6 +135,9 @@ for.end: ; preds = %for.body ; O3-LABEL: @disabled( ; O3-NOT: store <4 x i32> ; O3: ret i32 +; O3DEFAULT-LABEL: @disabled( +; O3DEFAULT: store <4 x i32> +; O3DEFAULT: ret i32 ; O3DIS-LABEL: @disabled( ; O3DIS-NOT: store <4 x i32> ; O3DIS: ret i32 diff --git a/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/test/Transforms/LoopVectorize/X86/scatter_crash.ll index ec67e632efbd..bda4b2454ee2 100755 --- a/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -16,97 +16,23 @@ target triple = "x86_64-apple-macosx10.11.0" define void @_Z3fn1v() #0 { ; CHECK-LABEL: @_Z3fn1v( ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX:%.*]].next, %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ -; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ -; CHECK-NEXT: [[SHL:%.*]] = shl i64 %index, 1 -; CHECK-NEXT: %offset.idx = add i64 [[SHL]], 8 -; CHECK-NEXT: [[IND00:%.*]] = add i64 %offset.idx, 0 -; CHECK-NEXT: [[IND02:%.*]] = add i64 %offset.idx, 2 -; CHECK-NEXT: [[IND04:%.*]] = add i64 %offset.idx, 4 -; CHECK-NEXT: [[IND06:%.*]] = add i64 %offset.idx, 6 -; CHECK-NEXT: [[IND08:%.*]] = add i64 %offset.idx, 8 -; CHECK-NEXT: [[IND10:%.*]] = add i64 %offset.idx, 10 -; CHECK-NEXT: [[IND12:%.*]] = add i64 %offset.idx, 12 -; CHECK-NEXT: [[IND14:%.*]] = add i64 %offset.idx, 14 -; CHECK-NEXT: [[IND16:%.*]] = add i64 %offset.idx, 16 -; CHECK-NEXT: [[IND18:%.*]] = add i64 %offset.idx, 18 -; CHECK-NEXT: [[IND20:%.*]] = add i64 %offset.idx, 20 -; CHECK-NEXT: [[IND22:%.*]] = add i64 %offset.idx, 22 -; CHECK-NEXT: [[IND24:%.*]] = add i64 %offset.idx, 24 -; CHECK-NEXT: [[IND26:%.*]] = add i64 %offset.idx, 26 -; CHECK-NEXT: [[IND28:%.*]] = add i64 %offset.idx, 28 -; CHECK-NEXT: [[IND30:%.*]] = add i64 %offset.idx, 30 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 
x i32]]* @d, i64 0, i64 [[IND10]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]] -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND14]] -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]] -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]] -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]] -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]] -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]] -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0 -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i64> [[TMP59]], i32 1 -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP15]], i64 [[TMP65]], i64 0 -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x i64> [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP18]], i64 [[TMP69]], i64 0 -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <16 x i64> 
[[TMP59]], i32 3 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP21]], i64 [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i64> [[TMP59]], i32 4 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP24]], i64 [[TMP77]], i64 0 -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <16 x i64> [[TMP59]], i32 5 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP27]], i64 [[TMP81]], i64 0 -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <16 x i64> [[TMP59]], i32 6 -; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP30]], i64 [[TMP85]], i64 0 -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <16 x i64> [[TMP59]], i32 7 -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP33]], i64 [[TMP89]], i64 0 -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <16 x i64> [[TMP59]], i32 8 -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP36]], i64 [[TMP93]], i64 0 -; CHECK-NEXT: [[TMP97:%.*]] = extractelement <16 x i64> [[TMP59]], i32 9 -; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP39]], i64 [[TMP97]], i64 0 -; CHECK-NEXT: [[TMP101:%.*]] = extractelement <16 x i64> [[TMP59]], i32 10 -; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP42]], i64 [[TMP101]], i64 0 -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <16 x i64> [[TMP59]], i32 11 -; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP45]], i64 [[TMP105]], i64 0 -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <16 x i64> [[TMP59]], i32 12 -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP48]], i64 [[TMP109]], i64 0 -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i64> [[TMP59]], i32 13 -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP51]], i64 [[TMP113]], i64 0 -; CHECK-NEXT: [[TMP117:%.*]] = extractelement <16 x i64> [[TMP59]], i32 14 -; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0 -; CHECK-NEXT: [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15 -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0 -; CHECK-NEXT: [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) -; CHECK: [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> -; CHECK: [[STEP_ADD4:%.*]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0 
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; entry: %0 = load i32, i32* @c, align 4 %cmp34 = icmp sgt i32 %0, 8 diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll index f28e6be23529..b2933c4b56f2 100644 --- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll @@ -1,4 +1,6 @@ ; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -o /dev/null -pass-remarks-output=%t.yaml +; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s ; C/C++ code for tests ; void test(int *A, int Length) { @@ -42,6 +44,61 @@ ; CHECK-NOT: x i32> ; CHECK: ret +; YAML: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: CantComputeNumberOfIterations +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 4, Column: 5 } +; YAML-NEXT: Function: _Z4testPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: could not determine number of loop iterations +; YAML-NEXT: ... +; YAML-NEXT: --- !Missed +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: MissedDetails +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 4, Column: 5 } +; YAML-NEXT: Function: _Z4testPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: loop not vectorized +; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: AllDisabled +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 13, Column: 5 } +; YAML-NEXT: Function: _Z13test_disabledPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1' +; YAML-NEXT: ... 
+; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: '' +; YAML-NEXT: Name: CantIdentifyArrayBounds +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: cannot identify array bounds +; YAML-NEXT: ... +; YAML-NEXT: --- !Missed +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: MissedDetails +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: loop not vectorized +; YAML-NEXT: - String: ' (Force=' +; YAML-NEXT: - Force: 'true' +; YAML-NEXT: - String: ')' +; YAML-NEXT: ... +; YAML-NEXT: --- !Failure +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: FailedRequestedVectorization +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: failed explicitly specified loop vectorization +; YAML-NEXT: ... + target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind optsize ssp uwtable diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll index fc9f97328fb7..91466e65078f 100644 --- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -pass-remarks-analysis='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -pass-remarks-missed='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s ; Verify analysis remarks are generated when interleaving is not beneficial. ; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that vectorization is not beneficial diff --git a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll index 88b2aa36b08c..125829090c3f 100644 --- a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -438,3 +438,53 @@ for.end: %tmp5 = phi i32 [ %tmp2, %for.body ] ret i32 %tmp5 } + +; INTER-LABEL: bitcast_pointer_operand +; +; Check that a pointer operand having a user other than a memory access is +; recognized as uniform after vectorization. In this test case, %tmp1 is a +; bitcast that is used by a load and a getelementptr instruction (%tmp2). Once +; %tmp2 is marked uniform, %tmp1 should be marked uniform as well. 
+; +; INTER: LV: Found uniform instruction: %cond = icmp slt i64 %i.next, %n +; INTER-NEXT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3 +; INTER-NEXT: LV: Found uniform instruction: %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i +; INTER-NEXT: LV: Found uniform instruction: %tmp1 = bitcast i64* %tmp0 to i8* +; INTER-NEXT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i +; INTER-NEXT: LV: Found uniform instruction: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] +; INTER-NEXT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 1 +; INTER: vector.body: +; INTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; INTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* %A, i64 [[INDEX]] +; INTER-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <32 x i8>* +; INTER-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, <32 x i8>* [[TMP5]], align 1 +; INTER-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24> +; INTER-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27> +; INTER-NEXT: [[TMP6:%.*]] = xor <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]] +; INTER-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* %B, i64 [[INDEX]] +; INTER-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>* +; INTER-NEXT: store <4 x i8> [[TMP6]], <4 x i8>* [[TMP8]], align 1 +; INTER-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; INTER: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @bitcast_pointer_operand(i64* %A, i8* %B, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i + %tmp1 = bitcast i64* %tmp0 to i8* + %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3 + %tmp3 = load i8, i8* %tmp2, align 1 + %tmp4 = load i8, i8* %tmp1, align 1 + %tmp5 = xor i8 %tmp3, %tmp4 + %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i + store i8 %tmp5, i8* %tmp6 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/discriminator.ll b/test/Transforms/LoopVectorize/discriminator.ll new file mode 100644 index 000000000000..b7d34582dbd8 --- /dev/null +++ b/test/Transforms/LoopVectorize/discriminator.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck --check-prefix=LOOPVEC_4_1 %s +; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=3 < %s | FileCheck --check-prefix=LOOPVEC_2_3 %s +; RUN: opt -S -loop-unroll -unroll-count=5 < %s | FileCheck --check-prefix=LOOPUNROLL_5 %s +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -loop-unroll -unroll-count=2 < %s | FileCheck --check-prefix=LOOPVEC_UNROLL %s + +; Test if vectorization/unroll factor is recorded in discriminator. 
+; +; Original source code: +; 1 int *a; +; 2 int *b; +; 3 +; 4 void foo() { +; 5 for (int i = 0; i < 4096; i++) +; 6 a[i] += b[i]; +; 7 } + +@a = local_unnamed_addr global i32* null, align 8 +@b = local_unnamed_addr global i32* null, align 8 + +define void @_Z3foov() local_unnamed_addr #0 !dbg !6 { + %1 = load i32*, i32** @b, align 8, !dbg !8, !tbaa !9 + %2 = load i32*, i32** @a, align 8, !dbg !13, !tbaa !9 + br label %3, !dbg !14 + +; <label>:3: ; preds = %3, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %3 ] + %4 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv, !dbg !8 + %5 = load i32, i32* %4, align 4, !dbg !8, !tbaa !15 + %6 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv, !dbg !13 + %7 = load i32, i32* %6, align 4, !dbg !17, !tbaa !15 + %8 = add nsw i32 %7, %5, !dbg !17 + store i32 %8, i32* %6, align 4, !dbg !17, !tbaa !15 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !18 + %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !19 + br i1 %exitcond, label %9, label %3, !dbg !14, !llvm.loop !20 + +; <label>:9: ; preds = %3 + ret void, !dbg !21 +} + +;LOOPVEC_4_1: discriminator: 17 +;LOOPVEC_2_3: discriminator: 25 +;LOOPUNROLL_5: discriminator: 21 +; When unrolling after loop vectorize, both vec_body and remainder loop +; are unrolled. +;LOOPVEC_UNROLL: discriminator: 385 +;LOOPVEC_UNROLL: discriminator: 9 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, debugInfoForProfiling: true) +!1 = !DIFile(filename: "a.cc", directory: "/") +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, unit: !0) +!8 = !DILocation(line: 6, column: 13, scope: !6) +!9 = !{!10, !10, i64 0} +!10 = !{!"any pointer", !11, i64 0} +!11 = !{!"omnipotent char", !12, i64 0} +!12 = !{!"Simple C++ TBAA"} +!13 = !DILocation(line: 6, column: 5, scope: !6) +!14 = !DILocation(line: 5, column: 3, scope: !6) +!15 = !{!16, !16, i64 0} +!16 = !{!"int", !11, i64 0} +!17 = !DILocation(line: 6, column: 10, scope: !6) +!18 = !DILocation(line: 5, column: 30, scope: !6) +!19 = !DILocation(line: 5, column: 21, scope: !6) +!20 = distinct !{!20, !14} +!21 = !DILocation(line: 7, column: 1, scope: !6) diff --git a/test/Transforms/LoopVectorize/first-order-recurrence.ll b/test/Transforms/LoopVectorize/first-order-recurrence.ll new file mode 100644 index 000000000000..3d1c78038e32 --- /dev/null +++ b/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -0,0 +1,398 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-VF + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; void recurrence_1(int *a, int *b, int n) { +; for(int i = 0; i < n; i++) +; b[i] = a[i] + a[i - 1] +; } +; +; CHECK-LABEL: @recurrence_1( +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3 +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: 
[[L1]] = load <4 x i32> +; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL-LABEL: @recurrence_1( +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> +; UNROLL: [[L2]] = load <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: middle.block: +; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 +; +define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) { +entry: + br label %for.preheader + +for.preheader: + %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0 + %pre_load = load i32, i32* %arrayidx.phi.trans.insert + br label %scalar.body + +scalar.body: + %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx32 + %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %add35 = add i32 %1, %0 + store i32 %add35, i32* %arrayidx34 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.exit, label %scalar.body + +for.exit: + ret void +} + +; int recurrence_2(int *a, int n) { +; int minmax; +; for (int i = 0; i < n; ++i) +; minmax = min(minmax, max(a[i] - a[i-1], 0)); +; return minmax; +; } +; +; CHECK-LABEL: @recurrence_2( +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: [[L1]] = load <4 x i32> +; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL-LABEL: @recurrence_2( +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32> +; UNROLL: [[L2]] = load <4 x i32> +; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: middle.block: +; UNROLL: 
%vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3 +; +define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %for.preheader, label %for.cond.cleanup + +for.preheader: + %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1 + %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 + br label %scalar.body + +for.cond.cleanup.loopexit: + %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ] + br label %for.cond.cleanup + +for.cond.cleanup: + %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %minmax.0.lcssa + +scalar.body: + %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %sub3 = sub nsw i32 %1, %0 + %cmp4 = icmp sgt i32 %sub3, 0 + %cond = select i1 %cmp4, i32 %sub3, i32 0 + %cmp5 = icmp slt i32 %minmax.028, %cond + %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body +} + +; void recurrence_3(short *a, double *b, int n, float f, short p) { +; b[0] = (double)a[0] - f * (double)p; +; for (int i = 1; i < n; i++) +; b[i] = (double)a[i] - f * (double)a[i - 1]; +; } +; +; CHECK-LABEL: @recurrence_3( +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3 +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: [[L1]] = load <4 x i16> +; CHECK: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] +; +; UNROLL-LABEL: @recurrence_3( +; UNROLL: vector.body: +; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ] +; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16> +; UNROLL: [[L2]] = load <4 x i16> +; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; UNROLL: middle.block: +; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3 +; +define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) { +entry: + %0 = load i16, i16* %a, align 2 + %conv = sitofp i16 %0 to double + %conv1 = fpext float %f to double + %conv2 = sitofp i16 %p to double + %mul = fmul fast double %conv2, %conv1 + %sub = fsub fast double %conv, %mul + store double %sub, double* %b, align 8 + %cmp25 = icmp sgt i32 %n, 1 + br i1 %cmp25, label %for.preheader, label %for.end + +for.preheader: + br label 
%scalar.body + +scalar.body: + %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ] + %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ] + %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv + %2 = load i16, i16* %arrayidx5, align 2 + %conv6 = sitofp i16 %2 to double + %conv11 = sitofp i16 %1 to double + %mul12 = fmul fast double %conv11, %conv1 + %sub13 = fsub fast double %conv6, %mul12 + %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv + store double %sub13, double* %arrayidx15, align 8 + %advars.iv.next = add nuw nsw i64 %advars.iv, 1 + %lftr.wideiv = trunc i64 %advars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %scalar.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; void PR26734(short *a, int *b, int *c, int d, short *e) { +; for (; d != 21; d++) { +; *b &= *c; +; *e = *a - 6; +; *c = *e; +; } +; } +; +; CHECK-LABEL: @PR26734( +; CHECK-NOT: vector.ph: +; CHECK: } +; +define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) { +entry: + %cmp4 = icmp eq i32 %d, 21 + br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph + +entry.for.end_crit_edge: + %.pre = load i32, i32* %b, align 4 + br label %for.end + +for.body.lr.ph: + %0 = load i16, i16* %a, align 2 + %sub = add i16 %0, -6 + %conv2 = sext i16 %sub to i32 + %c.promoted = load i32, i32* %c, align 4 + %b.promoted = load i32, i32* %b, align 4 + br label %for.body + +for.body: + %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ] + %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ] + %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ] + %and = and i32 %and6, %conv25 + %inc = add nsw i32 %inc7, 1 + %cmp = icmp eq i32 %inc, 21 + br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + %and.lcssa = phi i32 [ %and, %for.body ] + store i32 %conv2, i32* %c, align 4 + store i32 %and.lcssa, i32* %b, align 4 + store i16 %sub, i16* %e, align 2 + br label %for.end + +for.end: + ret void +} + +; int PR27246() { +; unsigned int e, n; +; for (int i = 1; i < 49; ++i) { +; for (int k = i; k > 1; --k) +; e = k; +; n = e; +; } +; return n; +; } +; +; CHECK-LABEL: @PR27246( +; CHECK-NOT: vector.ph: +; CHECK: } +; +define i32 @PR27246() { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ] + %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ] + br label %for.cond1 + +for.cond.cleanup: + %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ] + ret i32 %e.1.lcssa.lcssa + +for.cond1: + %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ] + %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ] + %cmp2 = icmp sgt i32 %k.0, 1 + %dec = add nsw i32 %k.0, -1 + br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3 + +for.cond.cleanup3: + %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ] + %inc = add nuw nsw i32 %i.016, 1 + %exitcond = icmp eq i32 %inc, 49 + br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader +} + +; UNROLL-NO-IC-LABEL: @PR30183( +; UNROLL-NO-IC: vector.ph: +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 +; UNROLL-NO-IC-NEXT: br label %vector.body +; UNROLL-NO-IC: vector.body: +; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; 
UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], %vector.ph ], [ [[TMP42:%.*]], %vector.body ]
+; UNROLL-NO-IC: [[TMP27:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP27]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP28]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP29]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP30]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP32]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP33]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP42]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP34]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP38]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; UNROLL-NO-IC: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @PR30183(i32 %pre_load, i32* %a, i32* %b, i64 %n) {
+entry:
+  br label %scalar.body
+
+scalar.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+  %tmp0 = phi i32 [ %pre_load, %entry ], [ %tmp2, %scalar.body ]
+  %i.next = add nuw nsw i64 %i, 2
+  %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i.next
+  %tmp2 = load i32, i32* %tmp1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+  ret void
+}
+
+; UNROLL-NO-IC-LABEL: @constant_folded_previous_value(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 undef, i64 undef, i64 undef, i64 0>, %vector.ph ], [ <i64 1, i64 1, i64 1, i64 1>, %vector.body ]
+; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @constant_folded_previous_value() {
+entry:
+  br label %scalar.body
+
+scalar.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+  %tmp2 = phi i64 [ 0, %entry ], [ %tmp3, %scalar.body ]
+  %tmp3 = add i64 0, 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, undef
+  br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+  ret void
+}
+
+; We vectorize this first-order recurrence by generating two extracts for
+; the phi `val.phi`: one at the last index and another at the second-to-last
+; index. We need these two extracts because the first-order recurrence phi
+; is used outside the loop, so we require the phi itself and not its update
+; (addx). A condensed sketch of the two extracts follows.
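+;
+; Sketch (the extract names are taken from the checks below; the register
+; %last.part is illustrative): after the vector loop, the last unrolled part
+; holds the final four recurrence values, and the middle block extracts both
+; the value that resumes the scalar loop and the value for the outside-loop
+; phi:
+;
+;   %vector.recur.extract         = extractelement <4 x i32> %last.part, i32 3
+;   %vector.recur.extract.for.phi = extractelement <4 x i32> %last.part, i32 2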
+; UNROLL-NO-IC-LABEL: extract_second_last_iteration +; UNROLL-NO-IC: vector.body +; UNROLL-NO-IC: %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> +; UNROLL-NO-IC: %[[L1:.+]] = add <4 x i32> %vec.ind, %broadcast.splat +; UNROLL-NO-IC: %[[L2:.+]] = add <4 x i32> %step.add, %broadcast.splat +; UNROLL-NO-IC: %index.next = add i32 %index, 8 +; UNROLL-NO-IC: icmp eq i32 %index.next, 96 +; UNROLL-NO-IC: middle.block +; UNROLL-NO-IC: icmp eq i32 96, 96 +; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x i32> %[[L2]], i32 3 +; UNROLL-NO-IC: %vector.recur.extract.for.phi = extractelement <4 x i32> %[[L2]], i32 2 +; UNROLL-NO-IC: for.end +; UNROLL-NO-IC: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ] +; Check the case when unrolled but not vectorized. +; UNROLL-NO-VF-LABEL: extract_second_last_iteration +; UNROLL-NO-VF: vector.body: +; UNROLL-NO-VF: %induction = add i32 %index, 0 +; UNROLL-NO-VF: %induction1 = add i32 %index, 1 +; UNROLL-NO-VF: %[[L1:.+]] = add i32 %induction, %x +; UNROLL-NO-VF: %[[L2:.+]] = add i32 %induction1, %x +; UNROLL-NO-VF: %index.next = add i32 %index, 2 +; UNROLL-NO-VF: icmp eq i32 %index.next, 96 +; UNROLL-NO-VF: for.end: +; UNROLL-NO-VF: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %[[L1]], %middle.block ] +define i32 @extract_second_last_iteration(i32* %cval, i32 %x) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ] + %inc = add i32 %inc.phi, 1 + %bc = zext i32 %inc.phi to i64 + %addx = add i32 %inc.phi, %x + %cmp = icmp eq i32 %inc.phi, 95 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %val.phi +} diff --git a/test/Transforms/LoopVectorize/float-induction.ll b/test/Transforms/LoopVectorize/float-induction.ll index 79bddf471c26..8eec6e262c1a 100644 --- a/test/Transforms/LoopVectorize/float-induction.ll +++ b/test/Transforms/LoopVectorize/float-induction.ll @@ -1,43 +1,7 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s - -; VEC4_INTERL1-LABEL: @fp_iv_loop1( -; VEC4_INTERL1: %[[FP_INC:.*]] = load float, float* @fp_inc -; VEC4_INTERL1: vector.body: -; VEC4_INTERL1: %[[FP_INDEX:.*]] = sitofp i64 {{.*}} to float -; VEC4_INTERL1: %[[VEC_INCR:.*]] = fmul fast float {{.*}}, %[[FP_INDEX]] -; VEC4_INTERL1: %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[VEC_INCR]] -; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement <4 x float> undef, float %[[FP_OFFSET_IDX]], i32 0 -; VEC4_INTERL1-NEXT: %[[BRCT_SPLAT:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1: %[[BRCT_INSERT:.*]] = insertelement {{.*}} %[[FP_INC]] -; VEC4_INTERL1-NEXT: %[[FP_INC_BCST:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], {{.*}} zeroinitializer -; VEC4_INTERL1: %[[VSTEP:.*]] = fmul fast <4 x float> %[[FP_INC_BCST]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> -; VEC4_INTERL1-NEXT: %[[VEC_INDUCTION:.*]] = fsub fast <4 x float> %[[BRCT_SPLAT]], %[[VSTEP]] -; VEC4_INTERL1: store <4 x float> %[[VEC_INDUCTION]] - 
-; VEC4_INTERL2-LABEL: @fp_iv_loop1( -; VEC4_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc -; VEC4_INTERL2: vector.body: -; VEC4_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float -; VEC4_INTERL2: %[[VEC_INCR:.*]] = fmul fast float %{{.*}}, %[[INDEX]] -; VEC4_INTERL2: fsub fast float %init, %[[VEC_INCR]] -; VEC4_INTERL2: %[[VSTEP1:.*]] = fmul fast <4 x float> %{{.*}}, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> -; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION1:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP1]] -; VEC4_INTERL2: %[[VSTEP2:.*]] = fmul fast <4 x float> %{{.*}}, <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00> -; VEC4_INTERL2-NEXT: %[[VEC_INDUCTION2:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP2]] -; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION1]] -; VEC4_INTERL2: store <4 x float> %[[VEC_INDUCTION2]] - -; VEC1_INTERL2-LABEL: @fp_iv_loop1( -; VEC1_INTERL2: %[[FP_INC:.*]] = load float, float* @fp_inc -; VEC1_INTERL2: vector.body: -; VEC1_INTERL2: %[[INDEX:.*]] = sitofp i64 {{.*}} to float -; VEC1_INTERL2: %[[STEP:.*]] = fmul fast float %{{.*}}, %[[INDEX]] -; VEC1_INTERL2: %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[STEP]] -; VEC1_INTERL2: %[[SCALAR_INDUCTION2:.*]] = fsub fast float %[[FP_OFFSET_IDX]], %[[FP_INC]] -; VEC1_INTERL2: store float %[[FP_OFFSET_IDX]] -; VEC1_INTERL2: store float %[[SCALAR_INDUCTION2]] +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -dce -simplifycfg -instcombine -S | FileCheck --check-prefix VEC2_INTERL1_PRED_STORE %s @fp_inc = common global float 0.000000e+00, align 4 @@ -49,6 +13,71 @@ ; } ;} +; VEC4_INTERL1-LABEL: @fp_iv_loop1( +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> +; VEC4_INTERL1-NEXT: [[INDUCTION4:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]] +; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION4]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP9]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT6]] +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body + +; VEC4_INTERL2-LABEL: 
@fp_iv_loop1( +; VEC4_INTERL2: vector.ph: +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT4]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> +; VEC4_INTERL2-NEXT: [[INDUCTION5:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]] +; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: br label %vector.body +; VEC4_INTERL2: vector.body: +; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4 +; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>* +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4 +; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT7]] +; VEC4_INTERL2: br i1 {{.*}}, label %middle.block, label %vector.body + +; VEC1_INTERL2-LABEL: @fp_iv_loop1( +; VEC1_INTERL2: vector.ph: +; VEC1_INTERL2-NEXT: br label %vector.body +; VEC1_INTERL2: vector.body: +; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fmul fast float %fpinc, [[TMP6]] +; VEC1_INTERL2-NEXT: [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP7]] +; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = fsub fast float [[FP_OFFSET_IDX]], %fpinc +; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDUCTION2]] +; VEC1_INTERL2-NEXT: store float [[FP_OFFSET_IDX]], float* [[TMP9]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP8]], float* [[TMP10]], align 4 +; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VEC1_INTERL2: br i1 {{.*}}, label %middle.block, label %vector.body + define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #1 { entry: %cmp4 = icmp sgt i32 %N, 0 @@ -85,15 +114,20 @@ for.end: ; preds = %for.end.loopexit, % ;} ; VEC4_INTERL1-LABEL: @fp_iv_loop2( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ] -; 
VEC4_INTERL1: sitofp i64 %[[index]] to float -; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, 5.000000e-01 -; VEC4_INTERL1: %[[VAR2:.*]] = fadd fast float %[[VAR1]] -; VEC4_INTERL1: insertelement <4 x float> undef, float %[[VAR2]], i32 0 -; VEC4_INTERL1: shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer -; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00> -; VEC4_INTERL1: store <4 x float> +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00> +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION2]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 { entry: @@ -133,14 +167,43 @@ for.end: ; preds = %for.end.loopexit, % ; C[i] = y; ; } ;} + ; VEC4_INTERL1-LABEL: @fp_iv_loop3( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1: %[[index:.*]] = phi i64 [ 0, %vector.ph ] -; VEC4_INTERL1: sitofp i64 %[[index]] to float -; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, -5.000000e-01 -; VEC4_INTERL1: fadd fast float %[[VAR1]] -; VEC4_INTERL1: fadd fast <4 x float> {{.*}}, <float -5.000000e-01, float -1.000000e+00, float -1.500000e+00, float -2.000000e+00> -; VEC4_INTERL1: store <4 x float> +; VEC4_INTERL1: for.body.lr.ph: +; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4 +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> +; VEC4_INTERL1-NEXT: [[INDUCTION7:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP7]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i32 0 +; VEC4_INTERL1-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> undef, <4 x i32> zeroinitializer +; 
VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 +; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT12]], <4 x float> undef, <4 x i32> zeroinitializer +; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ <float 0x3FB99999A0000000, float 0xBFD99999A0000000, float 0xBFECCCCCC0000000, float 0xBFF6666660000000>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND10]], <4 x float>* [[TMP13]], align 4 +; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT13]] +; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01> +; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]] +; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* %B, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[TMP16]], <4 x float>* [[TMP18]], align 4 +; VEC4_INTERL1-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* %C, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00> +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT11]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 { entry: @@ -186,10 +249,17 @@ for.end: ;} ; VEC4_INTERL1-LABEL: @fp_iv_loop4( -; VEC4_INTERL1: vector.body -; VEC4_INTERL1-NOT: fmul fast <4 x float> -; VEC4_INTERL1: %[[induction:.*]] = fadd fast <4 x float> %{{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00> -; VEC4_INTERL1: store <4 x float> %[[induction]] +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], 
<float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> +; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) { entry: @@ -216,3 +286,55 @@ for.end.loopexit: ; preds = %for.body for.end: ; preds = %for.end.loopexit, %entry ret void } + +; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar( +; VEC2_INTERL1_PRED_STORE: vector.body: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* +; VEC2_INTERL1_PRED_STORE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fcmp fast oeq <2 x float> [[WIDE_LOAD]], zeroinitializer +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF]]: +; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP1]], float* [[TMP2]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF6]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 1 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP10]] +; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP9]], float* [[TMP11]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE7]]: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VEC2_INTERL1_PRED_STORE: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @non_primary_iv_float_scalar(float* %A, i64 %N) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.inc ], [ 0, %entry ] + %j = phi float [ %j.next, %for.inc ], [ 0.0, %entry ] + %tmp0 = getelementptr inbounds float, float* %A, i64 %i + %tmp1 = load float, float* %tmp0, align 4 + %tmp2 = fcmp fast oeq float %tmp1, 0.0 + br i1 %tmp2, label %if.pred, label %for.inc + +if.pred: + store float %j, float* %tmp0, align 4 + br label %for.inc + +for.inc: + %i.next = add nuw nsw i64 %i, 1 + %j.next = fadd fast float %j, 1.0 + %cond = icmp slt i64 %i.next, %N + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll index acf7b12540d3..d3a16e2075d1 100644 --- a/test/Transforms/LoopVectorize/if-conversion.ll +++ b/test/Transforms/LoopVectorize/if-conversion.ll @@ -18,9 +18,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @function0( ;CHECK: load <4 x i32> +;CHECK: icmp sle <4 x i32> ;CHECK: mul <4 x i32> ;CHECK: add <4 x i32> -;CHECK: icmp sle <4 x i32> ;CHECK: select <4 
x i1> ;CHECK: ret i32 define i32 @function0(i32* nocapture %a, i32* nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp { @@ -71,8 +71,8 @@ for.end: ;CHECK-LABEL: @reduction_func( ;CHECK: load <4 x i32> -;CHECK: add <4 x i32> ;CHECK: icmp slt <4 x i32> +;CHECK: add <4 x i32> ;CHECK: select <4 x i1> ;CHECK: ret i32 define i32 @reduction_func(i32* nocapture %A, i32 %n) nounwind uwtable readonly ssp { diff --git a/test/Transforms/LoopVectorize/if-pred-stores.ll b/test/Transforms/LoopVectorize/if-pred-stores.ll index c4368148caf9..a1837b352eef 100644 --- a/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -11,6 +11,7 @@ entry: ; VEC-LABEL: test ; VEC: %[[v0:.+]] = add i64 %index, 0 +; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] ; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100> ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true> ; VEC: %[[o1:.+]] = or <2 x i1> zeroinitializer, %[[v10]] @@ -21,7 +22,6 @@ entry: ; VEC: [[cond]]: ; VEC: %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0 ; VEC: %[[v9a:.+]] = add nsw i32 %[[v13]], 20 -; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] ; VEC: store i32 %[[v9a]], i32* %[[v2]], align 4 ; VEC: br label %[[else:.+]] ; diff --git a/test/Transforms/LoopVectorize/induction-step.ll b/test/Transforms/LoopVectorize/induction-step.ll index f56456e82dfa..33e8ed067160 100644 --- a/test/Transforms/LoopVectorize/induction-step.ll +++ b/test/Transforms/LoopVectorize/induction-step.ll @@ -12,11 +12,30 @@ ;} ; CHECK-LABEL: @induction_with_global( -; CHECK: %[[INT_INC:.*]] = load i32, i32* @int_inc, align 4 -; CHECK: vector.body: -; CHECK: %[[VAR1:.*]] = insertelement <8 x i32> undef, i32 %[[INT_INC]], i32 0 -; CHECK: %[[VAR2:.*]] = shufflevector <8 x i32> %[[VAR1]], <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VAR2]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @int_inc, align 4 +; CHECK: vector.ph: +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]] +; CHECK-NEXT: [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP0]], 8 +; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK: [[TMP8:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> %vec.ind, <8 x i32>* [[TMP11]], 
align 4 +; CHECK: %index.next = add i64 %index, 8 +; CHECK-NEXT: %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]] +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -66,13 +85,28 @@ for.end: ; preds = %for.end.loopexit, % ;} ; CHECK-LABEL: @induction_with_loop_inv( -; CHECK: for.cond1.preheader: -; CHECK: %[[INDVAR0:.*]] = phi i32 [ 0, -; CHECK: %[[INDVAR1:.*]] = phi i32 [ 0, -; CHECK: vector.body: -; CHECK: %[[VAR1:.*]] = insertelement <8 x i32> undef, i32 %[[INDVAR1]], i32 0 -; CHECK: %[[VAR2:.*]] = shufflevector <8 x i32> %[[VAR1]], <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VAR2]] +; CHECK: vector.ph: +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]] +; CHECK-NEXT: [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 %j.012, 8 +; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK: [[TMP6:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> %vec.ind, <8 x i32>* [[TMP9]], align 4 +; CHECK: %index.next = add i64 %index, 8 +; CHECK-NEXT: %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]] +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body define i32 @induction_with_loop_inv(i32 %init, i32* noalias nocapture %A, i32 %N, i32 %M) { entry: @@ -122,3 +156,46 @@ for.end6: ; preds = %for.end6.loopexit, %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ] ret i32 %x.0.lcssa } + + +; CHECK-LABEL: @non_primary_iv_loop_inv_trunc( +; CHECK: vector.ph: +; CHECK: [[TMP3:%.*]] = trunc i64 %step to i32 +; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]] +; CHECK-NEXT: [[INDUCTION7:%.*]] = add <8 x i32> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP3]], 8 +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: %index = phi 
i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ] +; CHECK: [[TMP6:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[VEC_IND10]], <8 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: %index.next = add i64 %index, 8 +; CHECK: [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]] +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body + +define void @non_primary_iv_loop_inv_trunc(i32* %a, i64 %n, i64 %step) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = trunc i64 %j to i32 + store i32 %tmp1, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 1 + %j.next = add nuw nsw i64 %j, %step + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 6213b4a7c2e9..0d7d9fe0c1b8 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -7,11 +7,19 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Make sure that we can handle multiple integer induction variables. +; ; CHECK-LABEL: @multi_int_induction( -; CHECK: vector.body: -; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %[[VAR:.*]] = trunc i64 %index to i32 -; CHECK: %offset.idx = add i32 190, %[[VAR]] +; CHECK: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vec.ind = phi <2 x i32> [ <i32 190, i32 191>, %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK: [[TMP3:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> %vec.ind, <2 x i32>* [[TMP6]], align 4 +; CHECK: %index.next = add i64 %index, 2 +; CHECK-NEXT: %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body define void @multi_int_induction(i32* %A, i32 %N) { for.body.lr.ph: br label %for.body @@ -765,3 +773,79 @@ for.body: exit: ret void } + +; CHECK-LABEL: @non_primary_iv_trunc( +; CHECK: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK: [[TMP3:%.*]] = add i64 %index, 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: %index.next = add i64 %index, 2 +; CHECK: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4> +; CHECK: br i1 {{.*}}, label %middle.block, 
label %vector.body +define void @non_primary_iv_trunc(i32* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i + %tmp1 = trunc i64 %j to i32 + store i32 %tmp1, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 1 + %j.next = add nuw nsw i64 %j, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR32419. Ensure we transform truncated non-primary induction variables. In +; the test case below we replace %tmp1 with a new induction variable. Because +; the truncated value is non-primary, we must compute an offset from the +; primary induction variable. +; +; CHECK-LABEL: @PR32419( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE4:.*]] ] +; CHECK: [[OFFSET_IDX:%.*]] = add i32 -20, [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[OFFSET_IDX]] to i16 +; CHECK: [[TMP8:%.*]] = add i16 [[TMP1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = urem i16 %b, [[TMP8]] +; CHECK: [[TMP15:%.*]] = add i16 [[TMP1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = urem i16 %b, [[TMP15]] +; CHECK: [[PRED_UREM_CONTINUE4]]: +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define i32 @PR32419(i32 %a, i16 %b) { +entry: + br label %for.body + +for.body: + %i = phi i32 [ -20, %entry ], [ %i.next, %for.inc ] + %tmp0 = phi i32 [ %a, %entry ], [ %tmp6, %for.inc ] + %tmp1 = trunc i32 %i to i16 + %tmp2 = icmp eq i16 %tmp1, 0 + br i1 %tmp2, label %for.inc, label %for.cond + +for.cond: + %tmp3 = urem i16 %b, %tmp1 + br label %for.inc + +for.inc: + %tmp4 = phi i16 [ %tmp3, %for.cond ], [ 0, %for.body ] + %tmp5 = sext i16 %tmp4 to i32 + %tmp6 = or i32 %tmp0, %tmp5 + %i.next = add nsw i32 %i, 1 + %cond = icmp eq i32 %i.next, 0 + br i1 %cond, label %for.end, label %for.body + +for.end: + %tmp7 = phi i32 [ %tmp6, %for.inc ] + ret i32 %tmp7 +} diff --git a/test/Transforms/LoopVectorize/lcssa-crash.ll b/test/Transforms/LoopVectorize/lcssa-crash.ll index e6bd6ed61e22..3d3ef9e05935 100644 --- a/test/Transforms/LoopVectorize/lcssa-crash.ll +++ b/test/Transforms/LoopVectorize/lcssa-crash.ll @@ -37,3 +37,26 @@ L0: L1: ret void } + +; This loop has different uniform instructions before and after LCSSA. 
+define void @test3() { +entry: + %add41 = add i32 undef, undef + %idxprom4736 = zext i32 %add41 to i64 + br label %while.body + +while.body: + %idxprom4738 = phi i64 [ %idxprom47, %while.body ], [ %idxprom4736, %entry ] + %pos.337 = phi i32 [ %inc46, %while.body ], [ %add41, %entry ] + %inc46 = add i32 %pos.337, 1 + %arrayidx48 = getelementptr inbounds [1024 x i8], [1024 x i8]* undef, i64 0, i64 %idxprom4738 + store i8 0, i8* %arrayidx48, align 1 + %and43 = and i32 %inc46, 3 + %cmp44 = icmp eq i32 %and43, 0 + %idxprom47 = zext i32 %inc46 to i64 + br i1 %cmp44, label %while.end, label %while.body + +while.end: + %add58 = add i32 %inc46, 4 + ret void +} diff --git a/test/Transforms/LoopVectorize/lifetime.ll b/test/Transforms/LoopVectorize/lifetime.ll index 6e525ca1d822..860fe2d983cd 100644 --- a/test/Transforms/LoopVectorize/lifetime.ll +++ b/test/Transforms/LoopVectorize/lifetime.ll @@ -13,23 +13,23 @@ define void @test(i32 *%d) { entry: %arr = alloca [1024 x i32], align 16 %0 = bitcast [1024 x i32]* %arr to i8* - call void @llvm.lifetime.start(i64 4096, i8* %0) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1 br label %for.body for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - call void @llvm.lifetime.end(i64 4096, i8* %0) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1 %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv %1 = load i32, i32* %arrayidx, align 8 store i32 100, i32* %arrayidx, align 8 - call void @llvm.lifetime.start(i64 4096, i8* %0) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp ne i32 %lftr.wideiv, 128 br i1 %exitcond, label %for.body, label %for.end for.end: - call void @llvm.lifetime.end(i64 4096, i8* %0) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1 ret void } @@ -42,24 +42,24 @@ define void @testbitcast(i32 *%d) { entry: %arr = alloca [1024 x i32], align 16 %0 = bitcast [1024 x i32]* %arr to i8* - call void @llvm.lifetime.start(i64 4096, i8* %0) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1 br label %for.body for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %1 = bitcast [1024 x i32]* %arr to i8* - call void @llvm.lifetime.end(i64 4096, i8* %1) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1 %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv %2 = load i32, i32* %arrayidx, align 8 store i32 100, i32* %arrayidx, align 8 - call void @llvm.lifetime.start(i64 4096, i8* %1) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp ne i32 %lftr.wideiv, 128 br i1 %exitcond, label %for.body, label %for.end for.end: - call void @llvm.lifetime.end(i64 4096, i8* %0) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1 ret void } @@ -77,11 +77,11 @@ for.body: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv %1 = bitcast [1024 x i32]* %arr to i8* - call void @llvm.lifetime.end(i64 4096, i8* %1) #1 + call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1 %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv %2 = load i32, i32* %arrayidx, align 8 store i32 100, i32* %arrayidx, align 8 - call void @llvm.lifetime.start(i64 4096, i8* %1) #1 + call void @llvm.lifetime.start.p0i8(i64 4096, i8* 
%1) #1 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp ne i32 %lftr.wideiv, 128 @@ -91,6 +91,6 @@ for.end: ret void } -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 diff --git a/test/Transforms/LoopVectorize/loop-scalars.ll b/test/Transforms/LoopVectorize/loop-scalars.ll new file mode 100644 index 000000000000..4dcd5993c128 --- /dev/null +++ b/test/Transforms/LoopVectorize/loop-scalars.ll @@ -0,0 +1,143 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: vector_gep +; CHECK-NOT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <2 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>* +; CHECK-NEXT: store <2 x i32*> [[TMP1]], <2 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @vector_gep(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: scalar_store +; CHECK: LV: Found scalar instruction: %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i +; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i +; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]] +; CHECK-NEXT: store i32* [[TMP5]], i32** [[TMP7]], align 8 +; CHECK-NEXT: store i32* [[TMP6]], i32** [[TMP8]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @scalar_store(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 
%i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: expansion +; CHECK: LV: Found scalar instruction: %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i +; CHECK-NEXT: LV: Found scalar instruction: %tmp1 = bitcast i64* %tmp0 to i32* +; CHECK-NEXT: LV: Found scalar instruction: %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0 +; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i +; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32** [[TMP7]] to i64** +; CHECK-NEXT: store i64* [[TMP5]], i64** [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32** [[TMP8]] to i64** +; CHECK-NEXT: store i64* [[TMP6]], i64** [[TMP10]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @expansion(i32** %a, i64 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i + %tmp1 = bitcast i64* %tmp0 to i32* + %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0 + %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i + store i32* %tmp1, i32** %tmp3, align 8 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: no_gep_or_bitcast +; CHECK-NOT: LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8 +; CHECK: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1 +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: store i32 0, i32* [[TMP4]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = 
getelementptr inbounds i32*, i32** %a, i64 %i + %tmp1 = load i32*, i32** %tmp0, align 8 + store i32 0, i32* %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll index a310b10a5c81..5c87dc435c7c 100644 --- a/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ b/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -13,9 +13,9 @@ ; int v3[Z][Z]; ; } s; ; -; void slow_function (s* const obj) { +; void slow_function (s* const obj, int z) { ; for (int j=0; j<Z; j++) { -; for (int k=0; k<Z; k++) { +; for (int k=0; k<z; k++) { ; int x = obj->v1[k] + obj->v2[j]; ; obj->v3[j][k] += x; ; } @@ -31,7 +31,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] } -define void @Test(%struct.s* nocapture %obj) #0 { +define void @Test(%struct.s* nocapture %obj, i64 %z) #0 { br label %.outer.preheader @@ -59,6 +59,6 @@ define void @Test(%struct.s* nocapture %obj) #0 { %8 = add nsw i32 %5, %7 store i32 %8, i32* %6 %j.next = add nuw nsw i64 %j, 1 - %exitcond.inner = icmp eq i64 %j.next, 32 + %exitcond.inner = icmp eq i64 %j.next, %z br i1 %exitcond.inner, label %.outer, label %.inner } diff --git a/test/Transforms/LoopVectorize/partial-lcssa.ll b/test/Transforms/LoopVectorize/partial-lcssa.ll new file mode 100644 index 000000000000..1306ed971c47 --- /dev/null +++ b/test/Transforms/LoopVectorize/partial-lcssa.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s +; We vectorize the inner loop, so we have to put it in LCSSA form. +; However, there's no reason to touch the outer loop. 
+
+; CHECK-LABEL: @foo
+; CHECK-LABEL: for.end.inner.loopexit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i64 [ %indvars.iv, %for.body.inner ], [ %{{.*}}, %middle.block ]
+; CHECK: store i64 %[[LCSSAPHI]], i64* %O1, align 4
+; CHECK-LABEL: for.end.outer.loopexit
+; CHECK: store i64 %indvars.outer, i64* %O2, align 4
+
+
+define i64 @foo(i32* nocapture %A, i32* nocapture %B, i64 %n, i64 %m, i64* %O1, i64* %O2) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
+
+for.body.outer.preheader:                        ; preds = %entry
+  br label %for.body.outer
+
+for.body.outer:                                  ; preds = %for.body.outer.preheader, %for.end.inner
+  %indvars.outer = phi i64 [ %indvars.outer.next, %for.end.inner ], [ 0, %for.body.outer.preheader ]
+  %cmp2 = icmp sgt i64 %m, 0
+  br i1 %cmp2, label %for.body.inner.preheader, label %for.end.inner
+
+for.body.inner.preheader:                        ; preds = %for.body.outer
+  br label %for.body.inner
+
+for.body.inner:                                  ; preds = %for.body.inner.preheader, %for.body.inner
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.inner ], [ 0, %for.body.inner.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %v = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, %n
+  br i1 %exitcond, label %for.end.inner.loopexit, label %for.body.inner
+
+for.end.inner.loopexit:                          ; preds = %for.body.inner
+  store i64 %indvars.iv, i64 *%O1, align 4
+  br label %for.end.inner
+
+for.end.inner:                                   ; preds = %for.end.inner.loopexit, %for.body.outer
+  %indvars.outer.next = add i64 %indvars.outer, 1
+  %exitcond.outer = icmp eq i64 %indvars.outer, %m
+  br i1 %exitcond.outer, label %for.end.outer.loopexit, label %for.body.outer
+
+for.end.outer.loopexit:                          ; preds = %for.end.inner
+  store i64 %indvars.outer, i64 *%O2, align 4
+  br label %for.end.outer
+
+for.end.outer:                                   ; preds = %for.end.outer.loopexit, %entry
+  ret i64 undef
+}
diff --git a/test/Transforms/LoopVectorize/pr31098.ll b/test/Transforms/LoopVectorize/pr31098.ll
new file mode 100644
index 000000000000..368a948557c3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr31098.ll
@@ -0,0 +1,100 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -debug-only=loop-accesses < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the compile-time-unknown dependence-distance is resolved
+; statically. Due to the non-unit stride of the accesses in this testcase
+; we are currently not able to create runtime dependence checks, and therefore
+; if we don't resolve the dependence statically we cannot vectorize the loop.
+;
+; Specifically in this example, during dependence analysis we get 6 unknown
+; dependence distances between the 8 real/imaginary accesses below:
+; dist = 8*D, 4+8*D, -4+8*D, -8*D, 4-8*D, -4-8*D.
+; At compile time we can prove for all of the above that |dist|>loopBound*step
+; (where the step is 8 bytes, and the loopBound is D-1), and thereby conclude
+; that there are no dependencies (without runtime tests):
+; |8*D|>8*D-8, |4+8*D|>8*D-8, |-4+8*D|>8*D-8, etc. 
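Those inequalities can be spot-checked with a small standalone program (an illustrative sketch only, not part of the patch; it simply re-evaluates the six distances from the comment against loopBound*step = 8*(D-1)):

#include <assert.h>
#include <stdlib.h>

/* The six dependence distances from the comment, in bytes, checked
   against the accessed range loopBound * step = (D - 1) * 8. */
static void check_distances(long D) {
  long dist[6] = { 8*D, 4 + 8*D, -4 + 8*D, -8*D, 4 - 8*D, -4 - 8*D };
  long range = 8 * (D - 1);
  for (int i = 0; i < 6; ++i)
    assert(labs(dist[i]) > range); /* |dist| > loopBound * step */
}

int main(void) {
  for (long D = 1; D <= 1024; ++D)
    check_distances(D);
  return 0;
}

The tightest case is |±(8*D-4)| = 8*D-4, which exceeds 8*D-8 for every D >= 1, so all six dependences can be ruled out without runtime checks, as the comment concludes.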
+ +; #include <stdlib.h> +; class Complex { +; private: +; float real_; +; float imaginary_; +; +; public: +; Complex() : real_(0), imaginary_(0) { } +; Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { } +; Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { } +; +; inline float real() const { return real_; } +; inline float imaginary() const { return imaginary_; } +; +; Complex operator+(const Complex& rhs) const +; { +; return Complex(real_ + rhs.real_, imaginary_ + rhs.imaginary_); +; } +; +; Complex operator-(const Complex& rhs) const +; { +; return Complex(real_ - rhs.real_, imaginary_ - rhs.imaginary_); +; } +; }; +; +; void Test(Complex *out, size_t size) +; { +; size_t D = size / 2; +; for (size_t offset = 0; offset < D; ++offset) +; { +; Complex t0 = out[offset]; +; Complex t1 = out[offset + D]; +; out[offset] = t1 + t0; +; out[offset + D] = t0 - t1; +; } +; } + +; CHECK-LABEL: Test +; CHECK: LAA: No unsafe dependent memory operations in loop. We don't need runtime memory checks. +; CHECK: vector.body: +; CHECK: <4 x i32> + +%class.Complex = type { float, float } + +define void @Test(%class.Complex* nocapture %out, i64 %size) local_unnamed_addr { +entry: + %div = lshr i64 %size, 1 + %cmp47 = icmp eq i64 %div, 0 + br i1 %cmp47, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %offset.048 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %0 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 0 + %1 = load float, float* %0, align 4 + %imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 1 + %2 = load float, float* %imaginary_.i.i, align 4 + %add = add nuw i64 %offset.048, %div + %3 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 0 + %4 = load float, float* %3, align 4 + %imaginary_.i.i28 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 1 + %5 = load float, float* %imaginary_.i.i28, align 4 + %add.i = fadd fast float %4, %1 + %add4.i = fadd fast float %5, %2 + store float %add.i, float* %0, align 4 + store float %add4.i, float* %imaginary_.i.i, align 4 + %sub.i = fsub fast float %1, %4 + %sub4.i = fsub fast float %2, %5 + store float %sub.i, float* %3, align 4 + store float %sub4.i, float* %imaginary_.i.i28, align 4 + %inc = add nuw nsw i64 %offset.048, 1 + %exitcond = icmp eq i64 %inc, %div + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/test/Transforms/LoopVectorize/pr31190.ll b/test/Transforms/LoopVectorize/pr31190.ll index afb1754983cd..1ff8b2ba7ce4 100644 --- a/test/Transforms/LoopVectorize/pr31190.ll +++ b/test/Transforms/LoopVectorize/pr31190.ll @@ -9,13 +9,6 @@ ; Since %inc54 is the IV of the outer loop, and %0 equivalent to it, ; we get the situation described above. -; This test uses the new PM, because with the old PM, running loop-vectorize -; would explicitly run loop-simplify. Even though this loop is already in -; simplified form, loop-simplify would still clean up the phi. -; The reason this matters is that in a real optimizer pipeline, LICM can create -; such PHIs, and since it preserves loop simplified form, the cleanup has -; no chance to run. 
- ; Code that leads to this situation can look something like: ; ; int a, b[1], c; @@ -28,11 +21,14 @@ ; ; The PHI is an artifact of the register promotion of c. +; Note that we can no longer get the vectorizer to actually see such PHIs, +; because LV now simplifies the loop internally, but the test is still +; useful as a regression test, and in case loop-simplify behavior changes. + @c = external global i32, align 4 @a = external global i32, align 4 @b = external global [1 x i32], align 4 -; CHECK: LV: PHI is a recurrence with respect to an outer loop. ; CHECK: LV: Not vectorizing: Cannot prove legality. ; CHECK-LABEL: @test define void @test() { diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll index 4b300e04ea26..f521b623fad2 100644 --- a/test/Transforms/LoopVectorize/reduction.ll +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -493,3 +493,49 @@ exit: %inc.2 = add nsw i32 %inc511.1.inc4.1, 2 ret i32 %inc.2 } + +;CHECK-LABEL: @reduction_sum_multiuse( +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: %sum.lcssa = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ] +;CHECK: %sum.copy = phi i32 [ %[[SCALAR]], %.lr.ph ], [ %[[VECTOR]], %middle.block ] +;CHECK: ret i32 +define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph.preheader, label %end +.lr.ph.preheader: ; preds = %0 + br label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.lcssa = phi i32 [ %9, %.lr.ph ] + %sum.copy = phi i32 [ %9, %.lr.ph ] + br label %end + +end: + %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ] + %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ] + %final = add i32 %f1, %f2 + ret i32 %final +} diff --git a/test/Transforms/LoopVectorize/reverse_iter.ll b/test/Transforms/LoopVectorize/reverse_iter.ll index a6e2abda36d9..bd057698280b 100644 --- a/test/Transforms/LoopVectorize/reverse_iter.ll +++ b/test/Transforms/LoopVectorize/reverse_iter.ll @@ -2,7 +2,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; Make sure that the reverse iterators are calculated using 64bit arithmetic, not 32. +; PR15882: This test ensures that we do not produce wrapping arithmetic when +; creating constant reverse step vectors. 
; ; int foo(int n, int *A) { ; int sum; @@ -13,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; ;CHECK-LABEL: @foo( -;CHECK: <i64 0, i64 -1, i64 -2, i64 -3> +;CHECK: <i32 0, i32 -1, i32 -2, i32 -3> ;CHECK: ret define i32 @foo(i32 %n, i32* nocapture %A) { %1 = icmp sgt i32 %n, 0 diff --git a/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll b/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll new file mode 100644 index 000000000000..d3112b82d1d5 --- /dev/null +++ b/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S | FileCheck --enable-var-scope %s + +; Make sure we attach memcheck metadata to scalarized memory operations even if +; we're only unrolling. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: vector.memcheck: +; CHECK-LABEL: vector.body: +; CHECK: load i32, {{.*}} !alias.scope ![[$MD1:[0-9]+]] +; CHECK-LABEL: middle.block: +; CHECK-DAG: ![[$MD1]] = !{![[MD2:[0-9]+]]} +; CHECK-DAG: ![[MD2]] = distinct !{![[MD2]], ![[MD3:[0-9]+]]} +; CHECK-DAG: ![[MD3]] = distinct !{![[MD3]], !"LVerDomain"} + +; Function Attrs: norecurse nounwind uwtable +define void @test(i32* nocapture readonly %a, i32* nocapture %b) local_unnamed_addr #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, 77 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +attributes #0 = { norecurse nounwind uwtable } diff --git a/test/Transforms/LoopVectorize/vector-geps.ll b/test/Transforms/LoopVectorize/vector-geps.ll new file mode 100644 index 000000000000..bd79499d5d34 --- /dev/null +++ b/test/Transforms/LoopVectorize/vector-geps.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @vector_gep_stored( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>* +; CHECK-NEXT: store <4 x i32*> [[TMP1]], <4 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add 
nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @uniform_vector_gep_stored(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, i64 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32*> [[DOTSPLATINSERT]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
+; CHECK-NEXT: store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 1
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
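For context, the two loops exercised by vector-geps.ll correspond to C source roughly like the following (a reconstruction from the IR above, assuming a 64-bit long; the names simply mirror the IR arguments):

/* vector_gep_stored: the stored address &b[i] varies per iteration, so
   the vectorizer must widen the GEP into a vector of pointers
   (<4 x i32*>) rather than scalarize it. */
void vector_gep_stored(int **a, int *b, long n) {
  for (long i = 0; i < n; ++i)
    a[i] = &b[i];
}

/* uniform_vector_gep_stored: &b[1] is loop invariant, so a single GEP
   is computed once and splatted across the vector lanes. */
void uniform_vector_gep_stored(int **a, int *b, long n) {
  for (long i = 0; i < n; ++i)
    a[i] = &b[1];
}

The invariant address in the second loop is why its CHECK lines expect a broadcast (insertelement/shufflevector) of one pointer instead of a widened GEP.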