diff options
Diffstat (limited to 'test/Transforms/LoopVectorize')
39 files changed, 1529 insertions, 145 deletions
diff --git a/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll index f16ee4171da9..58315a73ec13 100644 --- a/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ b/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses=true | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -enable-interleaved-mem-accesses=true | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" diff --git a/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll new file mode 100644 index 000000000000..65f5c4e6266b --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -0,0 +1,54 @@ +; RUN: opt -S < %s -loop-vectorize -instcombine 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +;; See https://llvm.org/bugs/show_bug.cgi?id=25490 +;; Due to the data structures used, the LLVM IR was not deterministic. +;; This test comes from the PR. 
+ +;; CHECK-LABEL: @test( +; CHECK: load <16 x i8> +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load <16 x i8> +; CHECK-NEXT: zext <16 x i8> +; CHECK-NEXT: zext <16 x i8> +define void @test(i32 %n, i8* nocapture %a, i8* nocapture %b, i8* nocapture readonly %c) { +entry: + %cmp.28 = icmp eq i32 %n, 0 + br i1 %cmp.28, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %c, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %mul = mul nuw nsw i32 %conv3, %conv + %shr.26 = lshr i32 %mul, 8 + %conv4 = trunc i32 %shr.26 to i8 + store i8 %conv4, i8* %arrayidx2, align 1 + %arrayidx8 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv + %2 = load i8, i8* %arrayidx8, align 1 + %conv9 = zext i8 %2 to i32 + %mul10 = mul nuw nsw i32 %conv9, %conv + %shr11.27 = lshr i32 %mul10, 8 + %conv12 = trunc i32 %shr11.27 to i8 + store i8 %conv12, i8* %arrayidx8, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll new file mode 100644 index 000000000000..a0e741a3cdbe --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -debug-only=loop-vectorize 
-loop-vectorize -instcombine < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnueabi" + +@AB = common global [1024 x i8] zeroinitializer, align 4 +@CD = common global [1024 x i8] zeroinitializer, align 4 + +define void @test_byte_interleaved_cost(i8 %C, i8 %D) { +entry: + br label %for.body + +; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved +; access group is 2. + +; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 +; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv + %tmp = load i8, i8* %arrayidx0, align 4 + %tmp1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1 + %tmp2 = load i8, i8* %arrayidx1, align 4 + %add = add nsw i8 %tmp, %C + %mul = mul nsw i8 %tmp2, %D + %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv + store i8 %add, i8* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1 + store i8 %mul, i8* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll new file mode 100644 index 000000000000..eee310491805 --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -0,0 +1,243 @@ +; RUN: opt -S < %s -basicaa -loop-vectorize 
-force-vector-interleave=1 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +; CHECK-LABEL: @add_a( +; CHECK: load <16 x i8>, <16 x i8>* +; CHECK: add nuw nsw <16 x i8> +; CHECK: store <16 x i8> +; Function Attrs: nounwind +define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp8 = icmp sgt i32 %len, 0 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i32 + %add = add nuw nsw i32 %conv, 2 + %conv1 = trunc i32 %add to i8 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %conv1, i8* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_b( +; CHECK: load <8 x i16>, <8 x i16>* +; CHECK: add nuw nsw <8 x i16> +; CHECK: store <8 x i16> +; Function Attrs: nounwind +define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp9 = icmp sgt i32 %len, 0 + br i1 %cmp9, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv + %0 = load i16, i16* %arrayidx + %conv8 = zext i16 %0 to i32 + %add = add nuw nsw i32 %conv8, 2 + %conv1 = trunc i32 %add to i16 + %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv + store i16 %conv1, i16* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 
%indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_c( +; CHECK: load <8 x i8>, <8 x i8>* +; CHECK: add nuw nsw <8 x i16> +; CHECK: store <8 x i16> +; Function Attrs: nounwind +define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp8 = icmp sgt i32 %len, 0 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i32 + %add = add nuw nsw i32 %conv, 2 + %conv1 = trunc i32 %add to i16 + %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv + store i16 %conv1, i16* %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_d( +; CHECK: load <4 x i16> +; CHECK: add nsw <4 x i32> +; CHECK: store <4 x i32> +define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 { +entry: + %cmp7 = icmp sgt i32 %len, 0 + br i1 %cmp7, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv + %0 = load i16, i16* %arrayidx + %conv = sext i16 %0 to i32 + %add = add nsw i32 %conv, 2 + %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv + store i32 %add, i32* %arrayidx2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 
%indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_e( +; CHECK: load <16 x i8> +; CHECK: shl <16 x i8> +; CHECK: add nuw nsw <16 x i8> +; CHECK: or <16 x i8> +; CHECK: mul nuw nsw <16 x i8> +; CHECK: and <16 x i8> +; CHECK: xor <16 x i8> +; CHECK: mul nuw nsw <16 x i8> +; CHECK: store <16 x i8> +define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 { +entry: + %cmp.32 = icmp sgt i32 %len, 0 + br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv11 = zext i8 %arg2 to i32 + %conv13 = zext i8 %arg1 to i32 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i32 + %add = shl i32 %conv, 4 + %conv2 = add nuw nsw i32 %add, 32 + %or = or i32 %conv, 51 + %mul = mul nuw nsw i32 %or, 60 + %and = and i32 %conv2, %conv13 + %mul.masked = and i32 %mul, 252 + %conv17 = xor i32 %mul.masked, %conv11 + %mul18 = mul nuw nsw i32 %conv17, %and + %conv19 = trunc i32 %mul18 to i8 + %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %conv19, i8* %arrayidx21 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_f +; CHECK: load <8 x i16> +; CHECK: trunc <8 x i16> +; CHECK: shl <8 x i8> +; CHECK: add nsw <8 x i8> +; CHECK: or <8 x i8> +; CHECK: mul nuw nsw <8 x i8> +; CHECK: and <8 x i8> +; CHECK: xor <8 x i8> +; CHECK: mul nuw nsw <8 x i8> +; CHECK: store <8 x i8> +define void @add_f(i16* noalias nocapture 
readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 { +entry: + %cmp.32 = icmp sgt i32 %len, 0 + br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv11 = zext i8 %arg2 to i32 + %conv13 = zext i8 %arg1 to i32 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv + %0 = load i16, i16* %arrayidx + %conv = sext i16 %0 to i32 + %add = shl i32 %conv, 4 + %conv2 = add nsw i32 %add, 32 + %or = and i32 %conv, 204 + %conv8 = or i32 %or, 51 + %mul = mul nuw nsw i32 %conv8, 60 + %and = and i32 %conv2, %conv13 + %mul.masked = and i32 %mul, 252 + %conv17 = xor i32 %mul.masked, %conv11 + %mul18 = mul nuw nsw i32 %conv17, %and + %conv19 = trunc i32 %mul18 to i8 + %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv + store i8 %conv19, i8* %arrayidx21 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: @add_g +; CHECK: load <16 x i8> +; CHECK: xor <16 x i8> +; CHECK: icmp ult <16 x i8> +; CHECK: select <16 x i1> {{.*}}, <16 x i8> +; CHECK: store <16 x i8> +define void @add_g(i8* noalias nocapture readonly %p, i8* noalias nocapture readonly %q, i8* noalias nocapture %r, i8 %arg1, i32 %len) #0 { + %1 = icmp sgt i32 %len, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %2 = sext i8 %arg1 to i64 + br label %3 + +._crit_edge: ; preds = %3, %0 + ret void + +; <label>:3 ; preds = %3, %.lr.ph + %indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %3 ] + %x4 = getelementptr inbounds i8, i8* %p, i64 %indvars.iv + %x5 = load i8, i8* %x4 + %x7 = getelementptr inbounds i8, i8* %q, i64 
%indvars.iv + %x8 = load i8, i8* %x7 + %x9 = zext i8 %x5 to i32 + %x10 = xor i32 %x9, 255 + %x11 = icmp ult i32 %x10, 24 + %x12 = select i1 %x11, i32 %x10, i32 24 + %x13 = trunc i32 %x12 to i8 + store i8 %x13, i8* %x4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %len + br i1 %exitcond, label %._crit_edge, label %3 +} + +attributes #0 = { nounwind } diff --git a/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll new file mode 100644 index 000000000000..be08a63b212c --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll @@ -0,0 +1,191 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @reduction_i8 +; +; char reduction_i8(char *a, char *b, int n) { +; char sum = 0; +; for (int i = 0; i < n; ++i) +; sum += (a[i] + b[i]); +; return sum; +; } +; +; CHECK: vector.body: +; CHECK: phi <16 x i8> +; CHECK: load <16 x i8> +; CHECK: load <16 x i8> +; CHECK: add <16 x i8> +; CHECK: add <16 x i8> +; +; CHECK: middle.block: +; CHECK: shufflevector <16 x i8> +; CHECK: add <16 x i8> +; CHECK: shufflevector <16 x i8> +; CHECK: add <16 x i8> +; CHECK: shufflevector <16 x i8> +; CHECK: add <16 x i8> +; CHECK: shufflevector <16 x i8> +; CHECK: add <16 x i8> +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <16 x i8> +; CHECK: zext i8 [[Rdx]] to i32 +; +define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { +entry: + %cmp.12 = icmp sgt i32 %n, 0 + br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: + %add5.lcssa = phi i32 [ %add5, %for.body ] + %conv6 = trunc i32 %add5.lcssa to i8 + br label 
%for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i8 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i8 %sum.0.lcssa + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %sum.013 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %conv4 = and i32 %sum.013, 255 + %add = add nuw nsw i32 %conv, %conv4 + %add5 = add nuw nsw i32 %add, %conv3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body +} + +; CHECK-LABEL: @reduction_i16_1 +; +; short reduction_i16_1(short *a, short *b, int n) { +; short sum = 0; +; for (int i = 0; i < n; ++i) +; sum += (a[i] + b[i]); +; return sum; +; } +; +; CHECK: vector.body: +; CHECK: phi <8 x i16> +; CHECK: load <8 x i16> +; CHECK: load <8 x i16> +; CHECK: add <8 x i16> +; CHECK: add <8 x i16> +; +; CHECK: middle.block: +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16> +; CHECK: zext i16 [[Rdx]] to i32 +; +define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) { +entry: + %cmp.16 = icmp sgt i32 %n, 0 + br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: + %add5.lcssa = phi i32 [ %add5, %for.body ] + %conv6 = trunc i32 %add5.lcssa to i16 + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i16 
[ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i16 %sum.0.lcssa + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %sum.017 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %a, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 2 + %conv.14 = zext i16 %0 to i32 + %arrayidx2 = getelementptr inbounds i16, i16* %b, i64 %indvars.iv + %1 = load i16, i16* %arrayidx2, align 2 + %conv3.15 = zext i16 %1 to i32 + %conv4.13 = and i32 %sum.017, 65535 + %add = add nuw nsw i32 %conv.14, %conv4.13 + %add5 = add nuw nsw i32 %add, %conv3.15 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body +} + +; CHECK-LABEL: @reduction_i16_2 +; +; short reduction_i16_2(char *a, char *b, int n) { +; short sum = 0; +; for (int i = 0; i < n; ++i) +; sum += (a[i] + b[i]); +; return sum; +; } +; +; CHECK: vector.body: +; CHECK: phi <8 x i16> +; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8> +; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16> +; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8> +; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16> +; CHECK: add <8 x i16> +; CHECK: add <8 x i16> +; +; CHECK: middle.block: +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16> +; CHECK: zext i16 [[Rdx]] to i32 +; +define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { +entry: + %cmp.14 = icmp sgt i32 %n, 0 + br i1 %cmp.14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: + %add5.lcssa = phi i32 [ %add5, %for.body ] + %conv6 = 
trunc i32 %add5.lcssa to i16 + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i16 %sum.0.lcssa + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %sum.015 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %conv4.13 = and i32 %sum.015, 65535 + %add = add nuw nsw i32 %conv, %conv4.13 + %add5 = add nuw nsw i32 %add, %conv3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body +} diff --git a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll new file mode 100644 index 000000000000..de3626b57d83 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "armv8--linux-gnueabihf" + +@AB = common global [1024 x i8] zeroinitializer, align 4 +@CD = common global [1024 x i8] zeroinitializer, align 4 + +define void @test_byte_interleaved_cost(i8 %C, i8 %D) { +entry: + br label %for.body + +; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved +; access group is 2. 
+ +; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 +; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv + %tmp = load i8, i8* %arrayidx0, align 4 + %tmp1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1 + %tmp2 = load i8, i8* %arrayidx1, align 4 + %add = add nsw i8 %tmp, %C + %mul = mul nsw i8 %tmp2, %D + %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv + store i8 %add, i8* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1 + store i8 %mul, i8* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Transforms/LoopVectorize/ARM/vector_cast.ll b/test/Transforms/LoopVectorize/ARM/vector_cast.ll new file mode 100644 index 000000000000..78af9960e064 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/vector_cast.ll @@ -0,0 +1,37 @@ +; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7--linux-gnueabi" + +; This requires the loop vectorizer to create an interleaved access group +; for the stores to the struct. Here we need to perform a bitcast from a vector +; of pointers to a vector of i32s. 
+ +%class.A = type { i8*, i32 } + +; CHECK-LABEL: test0 +define void @test0(%class.A* %StartPtr, %class.A* %APtr) { +entry: + br label %for.body.i + +for.body.i: + %addr = phi %class.A* [ %StartPtr, %entry ], [ %incdec.ptr.i, %for.body.i ] + %Data.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 0 + store i8* null, i8** %Data.i.i, align 4, !tbaa !8 + %Length.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 1 + store i32 0, i32* %Length.i.i, align 4, !tbaa !11 + %incdec.ptr.i = getelementptr inbounds %class.A, %class.A* %addr, i32 1 + %cmp.i = icmp eq %class.A* %incdec.ptr.i, %APtr + br i1 %cmp.i, label %exit, label %for.body.i + +exit: + ret void +} + +!5 = !{!"any pointer", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9, !5, i64 0} +!9 = !{!5, i64 0, !10, i64 4} +!10 = !{!"int", !6, i64 0} +!11 = !{!9, !10, i64 4} diff --git a/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll b/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll new file mode 100644 index 000000000000..3491e08bbaa2 --- /dev/null +++ b/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %c) #0 { +entry: + br label %for.body + +; CHECK-LABEL: @foo +; CHECK: fmul <4 x double> %{{[^,]+}}, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00> +; CHECK-NEXT: fmul <4 x double> %{{[^,]+}}, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00> + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr 
inbounds double, double* %b, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 8 + %mul = fmul double %0, 2.000000e+00 + %mul3 = fmul double %0, %mul + %arrayidx5 = getelementptr inbounds double, double* %c, i64 %indvars.iv + %1 = load double, double* %arrayidx5, align 8 + %mul6 = fmul double %1, 3.000000e+00 + %mul9 = fmul double %1, %mul6 + %add = fadd double %mul3, %mul9 + %mul12 = fmul double %0, 4.000000e+00 + %mul15 = fmul double %mul12, %1 + %add16 = fadd double %mul15, %add + %add17 = fadd double %add16, 1.000000e+00 + %arrayidx19 = getelementptr inbounds double, double* %a, i64 %indvars.iv + store double %add17, double* %arrayidx19, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1600 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind "target-cpu"="a2q" } + diff --git a/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll b/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll new file mode 100644 index 000000000000..0cb845520246 --- /dev/null +++ b/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll @@ -0,0 +1,30 @@ +; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Function Attrs: nounwind +define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 { +entry: + br label %for.body + +; CHECK-LABEL: @foo +; CHECK: <2 x double> + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds double, double* %b, i64 %0 + %1 = load double, double* %arrayidx, align 8 + %add = fadd double %1, 1.000000e+00 + %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv + store double %add, double* %arrayidx2, align 8 + 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1600 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind "target-cpu"="pwr8" } + diff --git a/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 8c375ccfd315..abe7d6de3f35 100644 --- a/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -499,4 +499,146 @@ for.end: ; preds = %for.cond ret void } +; void foo7 (double * __restrict__ out, double ** __restrict__ in, +; bool * __restrict__ trigger, unsigned size) { +; +; for (unsigned i=0; i<size; i++) +; if (trigger[i] && (in[i] != 0)) +; out[i] = (double) 0.5; +; } + +;AVX512-LABEL: @foo7 +;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64(<8 x double*>* +;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: ret void + +define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 { +entry: + %out.addr = alloca double*, align 8 + %in.addr = alloca double**, align 8 + %trigger.addr = alloca i8*, align 8 + %size.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store double* %out, double** %out.addr, align 8 + store double** %in, double*** %in.addr, align 8 + store i8* %trigger, i8** %trigger.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %size.addr, align 4 + %cmp = icmp ult i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = zext i32 %2 to i64 + %3 = load i8*, i8** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i8, i8* %3, i64 %idxprom + %4 = load i8, i8* %arrayidx, align 1 + %tobool = trunc i8 %4 to i1 + br i1 %tobool, label %land.lhs.true, label %if.end + 
+land.lhs.true: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %idxprom1 = zext i32 %5 to i64 + %6 = load double**, double*** %in.addr, align 8 + %arrayidx2 = getelementptr inbounds double*, double** %6, i64 %idxprom1 + %7 = load double*, double** %arrayidx2, align 8 + %cmp3 = icmp ne double* %7, null + br i1 %cmp3, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true + %8 = load i32, i32* %i, align 4 + %idxprom4 = zext i32 %8 to i64 + %9 = load double*, double** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds double, double* %9, i64 %idxprom4 + store double 5.000000e-01, double* %arrayidx5, align 8 + br label %if.end + +if.end: ; preds = %if.then, %land.lhs.true, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %10 = load i32, i32* %i, align 4 + %inc = add i32 %10, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +;typedef int (*fp)(); +;void foo8 (double* __restrict__ out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) { +; +; for (unsigned i=0; i<size; i++) +; if (trigger[i] && (in[i] != 0)) +; out[i] = (double) 0.5; +;} + +;AVX512-LABEL: @foo8 +;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f(<8 x i32 ()*>* % +;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: ret void + +define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 { +entry: + %out.addr = alloca double*, align 8 + %in.addr = alloca i32 ()**, align 8 + %trigger.addr = alloca i8*, align 8 + %size.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store double* %out, double** %out.addr, align 8 + store i32 ()** %in, i32 ()*** %in.addr, align 8 + store i8* %trigger, i8** %trigger.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %size.addr, align 4 + %cmp = icmp ult i32 %0, 
%1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = zext i32 %2 to i64 + %3 = load i8*, i8** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i8, i8* %3, i64 %idxprom + %4 = load i8, i8* %arrayidx, align 1 + %tobool = trunc i8 %4 to i1 + br i1 %tobool, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %idxprom1 = zext i32 %5 to i64 + %6 = load i32 ()**, i32 ()*** %in.addr, align 8 + %arrayidx2 = getelementptr inbounds i32 ()*, i32 ()** %6, i64 %idxprom1 + %7 = load i32 ()*, i32 ()** %arrayidx2, align 8 + %cmp3 = icmp ne i32 ()* %7, null + br i1 %cmp3, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true + %8 = load i32, i32* %i, align 4 + %idxprom4 = zext i32 %8 to i64 + %9 = load double*, double** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds double, double* %9, i64 %idxprom4 + store double 5.000000e-01, double* %arrayidx5, align 8 + br label %if.end + +if.end: ; preds = %if.then, %land.lhs.true, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %10 = load i32, i32* %i, align 4 + %inc = add i32 %10, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll index ba8e11e58749..74c0c16086fe 100644 --- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -60,7 +60,7 @@ for.body: ; preds = %for.body, %entry %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv store i32 %add, i32* %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 32 + %exitcond = icmp eq i64 %indvars.iv.next, 64 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 for.end: ; preds = %for.body @@ -111,7 
+111,7 @@ for.body: ; preds = %for.body, %entry %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv store i32 %add, i32* %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 32 + %exitcond = icmp eq i64 %indvars.iv.next, 64 br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body @@ -162,7 +162,7 @@ for.body: ; preds = %for.body, %entry %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv store i32 %add, i32* %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 32 + %exitcond = icmp eq i64 %indvars.iv.next, 64 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 for.end: ; preds = %for.body diff --git a/test/Transforms/LoopVectorize/X86/no_fpmath.ll b/test/Transforms/LoopVectorize/X86/no_fpmath.ll new file mode 100644 index 000000000000..0bb78ce177fe --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/no_fpmath.ll @@ -0,0 +1,104 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s + +; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations +; CHECK: remark: no_fpmath.c:6:14: loop not vectorized: +; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; Function Attrs: nounwind readonly ssp uwtable +define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 { +entry: + %cmp.7 = icmp sgt i32 %n, 0, !dbg !3 + br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !9 + +for.cond.cleanup.loopexit: ; preds = %for.body + %add.lcssa = phi 
double [ %add, %for.body ] + br label %for.cond.cleanup, !dbg !10 + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ] + ret double %a.0.lcssa, !dbg !10 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !9 + %0 = load i32, i32* %arrayidx, align 4, !dbg !9, !tbaa !11 + %cmp1 = icmp eq i32 %0, 0, !dbg !15 + %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !9 + %add = fadd double %a.08, %cond, !dbg !16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !8 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !8 + %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !8 + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !8, !llvm.loop !17 +} + +; Function Attrs: nounwind readonly ssp uwtable +define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 { +entry: + %cmp.7 = icmp sgt i32 %n, 0, !dbg !19 + br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !22 + +for.cond.cleanup.loopexit: ; preds = %for.body + %add.lcssa = phi double [ %add, %for.body ] + br label %for.cond.cleanup, !dbg !23 + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ] + ret double %a.0.lcssa, !dbg !23 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg 
!22 + %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !11 + %cmp1 = icmp eq i32 %0, 0, !dbg !24 + %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !22 + %add = fadd double %a.08, %cond, !dbg !25 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21 + %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !21 + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !21, !llvm.loop !26 +} + +attributes #0 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 1, !"PIC Level", i32 2} +!2 = !{!"clang version 3.7.0"} +!3 = !DILocation(line: 5, column: 20, scope: !4) +!4 = distinct !DISubprogram(name: "cond_sum", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, variables: !7) +!5 = !DIFile(filename: "no_fpmath.c", directory: "") +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 5, column: 3, scope: !4) +!9 = !DILocation(line: 6, column: 14, scope: !4) +!10 = !DILocation(line: 9, column: 3, scope: !4) +!11 = !{!12, !12, i64 0} +!12 = !{!"int", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !DILocation(line: 6, column: 19, scope: !4) +!16 = !DILocation(line: 6, column: 11, scope: !4) +!17 = distinct !{!17, !18} +!18 = !{!"llvm.loop.unroll.disable"} +!19 = !DILocation(line: 16, column: 20, scope: !20) +!20 = distinct !DISubprogram(name: "cond_sum_loop_hint", scope: !5, file: !5, line: 12, type: !6, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, variables: !7) +!21 = !DILocation(line: 16, column: 3, scope: !20) +!22 = !DILocation(line: 17, column: 14, scope: !20) +!23 = !DILocation(line: 20, column: 3, scope: !20) +!24 = !DILocation(line: 17, column: 19, scope: !20) +!25 = !DILocation(line: 17, column: 
11, scope: !20) +!26 = distinct !{!26, !27, !18} +!27 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/test/Transforms/LoopVectorize/X86/powof2div.ll b/test/Transforms/LoopVectorize/X86/powof2div.ll index 6bc738a7d143..3e4bef6d4d07 100644 --- a/test/Transforms/LoopVectorize/X86/powof2div.ll +++ b/test/Transforms/LoopVectorize/X86/powof2div.ll @@ -6,10 +6,10 @@ target triple = "x86_64-unknown-linux-gnu" @Foo = common global %struct.anon zeroinitializer, align 4 -;CHECK-LABEL: @foo( -;CHECK: load <4 x i32>, <4 x i32>* -;CHECK: sdiv <4 x i32> -;CHECK: store <4 x i32> +; CHECK-LABEL: @foo( +; CHECK: load <4 x i32>, <4 x i32>* +; CHECK: sdiv <4 x i32> +; CHECK: store <4 x i32> define void @foo(){ entry: diff --git a/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/test/Transforms/LoopVectorize/X86/reduction-crash.ll index 3741b95d9859..6393002d5071 100644 --- a/test/Transforms/LoopVectorize/X86/reduction-crash.ll +++ b/test/Transforms/LoopVectorize/X86/reduction-crash.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -loop-vectorize -mcpu=prescott < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -mcpu=prescott -disable-basicaa < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" target triple = "i386-apple-darwin" diff --git a/test/Transforms/LoopVectorize/X86/reg-usage.ll b/test/Transforms/LoopVectorize/X86/reg-usage.ll new file mode 100644 index 000000000000..47a6e1029eda --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/reg-usage.ll @@ -0,0 +1,71 @@ +; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global [1024 x i8] zeroinitializer, 
align 16 +@b = global [1024 x i8] zeroinitializer, align 16 + +define i32 @foo() { +; This function has a loop of SAD pattern. Here we check when VF = 16 the +; register usage doesn't exceed 16. +; +; CHECK-LABEL: foo +; CHECK: LV(REG): VF = 4 +; CHECK-NEXT: LV(REG): Found max usage: 4 +; CHECK: LV(REG): VF = 8 +; CHECK-NEXT: LV(REG): Found max usage: 7 +; CHECK: LV(REG): VF = 16 +; CHECK-NEXT: LV(REG): Found max usage: 13 + +entry: + br label %for.body + +for.cond.cleanup: + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %sub = sub nsw i32 %conv, %conv3 + %ispos = icmp sgt i32 %sub, -1 + %neg = sub nsw i32 0, %sub + %2 = select i1 %ispos, i32 %sub, i32 %neg + %add = add nsw i32 %2, %s.015 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define i64 @bar(i64* nocapture %a) { +; CHECK-LABEL: bar +; CHECK: LV(REG): VF = 2 +; CHECK: LV(REG): Found max usage: 4 +; +entry: + br label %for.body + +for.cond.cleanup: + %add2.lcssa = phi i64 [ %add2, %for.body ] + ret i64 %add2.lcssa + +for.body: + %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ] + %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012 + %0 = load i64, i64* %arrayidx, align 8 + %add = add nsw i64 %0, %i.012 + store i64 %add, i64* %arrayidx, align 8 + %add2 = add nsw i64 %add, %s.011 + %inc = add nuw nsw i64 %i.012, 1 + %exitcond = icmp eq i64 %inc, 1024 + br i1 
%exitcond, label %for.cond.cleanup, label %for.body +} diff --git a/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll b/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll new file mode 100644 index 000000000000..fe9d59efc8b3 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll @@ -0,0 +1,46 @@ +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global [1000 x i8] zeroinitializer, align 16 +@b = global [1000 x i8] zeroinitializer, align 16 +@c = global [1000 x i8] zeroinitializer, align 16 +@u = global [1000 x i32] zeroinitializer, align 16 +@v = global [1000 x i32] zeroinitializer, align 16 +@w = global [1000 x i32] zeroinitializer, align 16 + +; Tests that the vectorization factor is determined by the smallest instead of +; widest type in the loop for maximum bandwidth when +; -vectorizer-maximize-bandwidth is indicated. +; +; CHECK-LABEL: foo +; CHECK: LV: Selecting VF: 32. 
+define void @foo() { +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv + store i8 %add, i8* %arrayidx6, align 1 + %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx8, align 4 + %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv + store i32 %add11, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} diff --git a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll index 6cd3c9c3bc01..cca829b9457e 100644 --- a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll +++ b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -17,7 +17,7 @@ target triple = "x86_64-apple-macosx10.8.0" ; widest vector count. ; ; CHECK: test_consecutive_store -; CHECK: The Widest type: 64 bits +; CHECK: The Smallest and Widest types: 64 / 64 bits. 
define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 { %4 = load %0*, %0** %2, align 8 %5 = icmp eq %0** %0, %1 @@ -51,7 +51,7 @@ define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwt ; p[i][y] = (int*) (1 + q[i]); ; } ; CHECK: test_nonconsecutive_store -; CHECK: The Widest type: 16 bits +; CHECK: The Smallest and Widest types: 16 / 16 bits. define void @test_nonconsecutive_store() nounwind ssp uwtable { br label %1 @@ -93,7 +93,7 @@ define void @test_nonconsecutive_store() nounwind ssp uwtable { ;; Now we check the same rules for loads. We should take consecutive loads of ;; pointer types into account. ; CHECK: test_consecutive_ptr_load -; CHECK: The Widest type: 64 bits +; CHECK: The Smallest and Widest types: 8 / 64 bits. define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { br label %1 @@ -117,7 +117,7 @@ define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { ;; However, we should not take unconsecutive loads of pointers into account. ; CHECK: test_nonconsecutive_ptr_load -; CHECK: The Widest type: 16 bits +; CHECK: LV: The Smallest and Widest types: 16 / 16 bits. 
define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { br label %1 diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll index 65cabb05f2fb..02fab4447341 100644 --- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll @@ -25,7 +25,7 @@ ; File, line, and column should match those specified in the metadata ; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations ; CHECK: remark: source.cpp:4:5: loop not vectorized: use -Rpass-analysis=loop-vectorize for more info -; CHECK: remark: source.cpp:13:5: loop not vectorized: vector width and interleave count are explicitly set to 1 +; CHECK: remark: source.cpp:13:5: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1 ; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds ; CHECK: remark: source.cpp:19:5: loop not vectorized: use -Rpass-analysis=loop-vectorize for more info ; CHECK: warning: source.cpp:19:5: loop not vectorized: failed explicitly specified loop vectorization @@ -45,7 +45,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind optsize ssp uwtable -define void @_Z4testPii(i32* nocapture %A, i32 %Length) #0 { +define void @_Z4testPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 { entry: %cmp10 = icmp sgt i32 %Length, 0, !dbg !12 br i1 %cmp10, label %for.body, label %for.end, !dbg !12, !llvm.loop !14 @@ -67,7 +67,7 @@ for.end: ; preds = %for.body, %entry } ; Function Attrs: nounwind optsize ssp uwtable -define void @_Z13test_disabledPii(i32* nocapture %A, i32 %Length) #0 { +define void @_Z13test_disabledPii(i32* nocapture %A, i32 %Length) #0 !dbg !7 { entry: %cmp4 = icmp sgt i32 %Length, 0, !dbg !25 br i1 %cmp4, label %for.body, label 
%for.end, !dbg !25, !llvm.loop !27 @@ -87,7 +87,7 @@ for.end: ; preds = %for.body, %entry } ; Function Attrs: nounwind optsize ssp uwtable -define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 { +define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 !dbg !8 { entry: %cmp9 = icmp sgt i32 %Length, 0, !dbg !32 br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32, !llvm.loop !34 @@ -122,15 +122,15 @@ attributes #0 = { nounwind } !llvm.module.flags = !{!9, !10} !llvm.ident = !{!11} -!0 = !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) !1 = !DIFile(filename: "source.cpp", directory: ".") !2 = !{} !3 = !{!4, !7, !8} -!4 = !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !5, type: !6, function: void (i32*, i32)* @_Z4testPii, variables: !2) +!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2) !5 = !DIFile(filename: "source.cpp", directory: ".") !6 = !DISubroutineType(types: !2) -!7 = !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 10, file: !1, scope: !5, type: !6, function: void (i32*, i32)* @_Z13test_disabledPii, variables: !2) -!8 = !DISubprogram(name: "test_array_bounds", line: 16, isLocal: 
false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 16, file: !1, scope: !5, type: !6, function: void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, variables: !2) +!7 = distinct !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 10, file: !1, scope: !5, type: !6, variables: !2) +!8 = distinct !DISubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 16, file: !1, scope: !5, type: !6, variables: !2) !9 = !{i32 2, !"Dwarf Version", i32 2} !10 = !{i32 2, !"Debug Info Version", i32 3} !11 = !{!"clang version 3.5.0"} diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll new file mode 100644 index 000000000000..df8c668f1262 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll @@ -0,0 +1,113 @@ +; RUN: opt < %s -loop-vectorize -pass-remarks-analysis='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s + +; Verify analysis remarks are generated when interleaving is not beneficial. +; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that vectorization is not beneficial +; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that interleaving is not beneficial and is explicitly disabled or interleave count is set to 1 +; CHECK: remark: vectorization-remarks-profitable.c:12:17: the cost-model indicates that vectorization is not beneficial +; CHECK: remark: vectorization-remarks-profitable.c:12:17: the cost-model indicates that interleaving is not beneficial + +; First loop. +; #pragma clang loop interleave(disable) unroll(disable) +; for(int i = 0; i < n; i++) { +; out[i] = *in[i]; +; } + +; Second loop. 
+; #pragma clang loop unroll(disable) +; for(int i = 0; i < n; i++) { +; out[i] = *in[i]; +; } + +; ModuleID = 'vectorization-remarks-profitable.ll' +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; Function Attrs: nounwind uwtable +define void @do_not_interleave(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !4 { +entry: + %cmp.4 = icmp eq i32 %size, 0, !dbg !10 + br i1 %cmp.4, label %for.end, label %for.body.preheader, !dbg !11 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !12 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float*, float** %in, i64 %indvars.iv, !dbg !12 + %0 = bitcast float** %arrayidx to i32**, !dbg !12 + %1 = load i32*, i32** %0, align 8, !dbg !12 + %2 = load i32, i32* %1, align 4, !dbg !13 + %arrayidx2 = getelementptr inbounds float, float* %out, i64 %indvars.iv, !dbg !14 + %3 = bitcast float* %arrayidx2 to i32*, !dbg !15 + store i32 %2, i32* %3, align 4, !dbg !15 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !11 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !11 + %exitcond = icmp eq i32 %lftr.wideiv, %size, !dbg !11 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !11, !llvm.loop !16 + +for.end.loopexit: ; preds = %for.body + br label %for.end, !dbg !19 + +for.end: ; preds = %for.end.loopexit, %entry + ret void, !dbg !19 +} + +; Function Attrs: nounwind uwtable +define void @interleave_not_profitable(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !6 { +entry: + %cmp.4 = icmp eq i32 %size, 0, !dbg !20 + br i1 %cmp.4, label %for.end, label %for.body, !dbg !21 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float*, 
float** %in, i64 %indvars.iv, !dbg !22 + %0 = bitcast float** %arrayidx to i32**, !dbg !22 + %1 = load i32*, i32** %0, align 8, !dbg !22 + %2 = load i32, i32* %1, align 4, !dbg !23 + %arrayidx2 = getelementptr inbounds float, float* %out, i64 %indvars.iv, !dbg !24 + %3 = bitcast float* %arrayidx2 to i32*, !dbg !25 + store i32 %2, i32* %3, align 4, !dbg !25 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21 + %exitcond = icmp eq i32 %lftr.wideiv, %size, !dbg !21 + br i1 %exitcond, label %for.end, label %for.body, !dbg !21, !llvm.loop !26 + +for.end: ; preds = %for.body, %entry + ret void, !dbg !27 +} + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250016)", isOptimized: false, runtimeVersion: 0, emissionKind: 2, enums: !2, subprograms: !3) +!1 = !DIFile(filename: "vectorization-remarks-profitable.c", directory: "") +!2 = !{} +!3 = !{!4, !6} +!4 = distinct !DISubprogram(name: "do_not_interleave", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, variables: !2) +!5 = !DISubroutineType(types: !2) +!6 = distinct !DISubprogram(name: "interleave_not_profitable", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, variables: !2) +!7 = !{i32 2, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{!"clang version 3.8.0 (trunk 250016)"} 
+!10 = !DILocation(line: 4, column: 23, scope: !4) +!11 = !DILocation(line: 4, column: 3, scope: !4) +!12 = !DILocation(line: 5, column: 17, scope: !4) +!13 = !DILocation(line: 5, column: 16, scope: !4) +!14 = !DILocation(line: 5, column: 7, scope: !4) +!15 = !DILocation(line: 5, column: 14, scope: !4) +!16 = distinct !{!16, !17, !18} +!17 = !{!"llvm.loop.interleave.count", i32 1} +!18 = !{!"llvm.loop.unroll.disable"} +!19 = !DILocation(line: 6, column: 1, scope: !4) +!20 = !DILocation(line: 11, column: 23, scope: !6) +!21 = !DILocation(line: 11, column: 3, scope: !6) +!22 = !DILocation(line: 12, column: 17, scope: !6) +!23 = !DILocation(line: 12, column: 16, scope: !6) +!24 = !DILocation(line: 12, column: 7, scope: !6) +!25 = !DILocation(line: 12, column: 14, scope: !6) +!26 = distinct !{!26, !18} +!27 = !DILocation(line: 13, column: 1, scope: !6) + diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll index 8640950be32e..77a405ebb434 100644 --- a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll @@ -9,13 +9,13 @@ ; DEBUG-OUTPUT-NOT: .loc ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info -; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vector width and interleave count are explicitly set to 1 ; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1) -; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved by 4 (vectorization not beneficial) +; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4) +; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define i32 @foo(i32 %n) #0 { +define i32 @foo(i32 %n) #0 !dbg !4 { entry: 
%diff = alloca i32, align 4 %cb = alloca [16 x i8], align 16 @@ -52,7 +52,7 @@ declare void @ibar(i32*) #1 !1 = !DIFile(filename: "vectorization-remarks.c", directory: ".") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !5, type: !6, function: i32 (i32)* @foo, variables: !2) +!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !5, type: !6, variables: !2) !5 = !DIFile(filename: "vectorization-remarks.c", directory: ".") !6 = !DISubroutineType(types: !2) !7 = !{i32 2, !"Dwarf Version", i32 4} diff --git a/test/Transforms/LoopVectorize/conditional-assignment.ll b/test/Transforms/LoopVectorize/conditional-assignment.ll index f41f08df07a6..8d820e277b26 100644 --- a/test/Transforms/LoopVectorize/conditional-assignment.ll +++ b/test/Transforms/LoopVectorize/conditional-assignment.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" ; Function Attrs: nounwind ssp uwtable -define void @conditional_store(i32* noalias nocapture %indices) #0 { +define void @conditional_store(i32* noalias nocapture %indices) #0 !dbg !4 { entry: br label %for.body, !dbg !10 @@ -36,11 +36,11 @@ attributes #0 = { nounwind } !llvm.module.flags = !{!7, !8} !llvm.ident = !{!9} -!0 = !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0", isOptimized: true, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0", isOptimized: true, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) !1 = !DIFile(filename: "source.c", directory: ".") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: 
"conditional_store", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !5, type: !6, function: void (i32*)* @conditional_store, variables: !2) +!4 = distinct !DISubprogram(name: "conditional_store", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2) !5 = !DIFile(filename: "source.c", directory: ".") !6 = !DISubroutineType(types: !2) !7 = !{i32 2, !"Dwarf Version", i32 2} diff --git a/test/Transforms/LoopVectorize/control-flow.ll b/test/Transforms/LoopVectorize/control-flow.ll index c56f9122e462..a2fc69a6e907 100644 --- a/test/Transforms/LoopVectorize/control-flow.ll +++ b/test/Transforms/LoopVectorize/control-flow.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' 2>&1 | FileCheck %s ; C/C++ code for control flow test ; int test(int *A, int Length) { @@ -20,7 +20,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind optsize ssp uwtable -define i32 @_Z4testPii(i32* nocapture %A, i32 %Length) #0 { +define i32 @_Z4testPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 { entry: %cmp8 = icmp sgt i32 %Length, 0, !dbg !10 br i1 %cmp8, label %for.body.preheader, label %end, !dbg !10 @@ -55,11 +55,11 @@ attributes #0 = { nounwind } !llvm.module.flags = !{!7, !8} !llvm.ident = !{!9} -!0 = !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 
6, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) !1 = !DIFile(filename: "source.cpp", directory: ".") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !5, type: !6, function: i32 (i32*, i32)* @_Z4testPii, variables: !2) +!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2) !5 = !DIFile(filename: "source.cpp", directory: ".") !6 = !DISubroutineType(types: !2) !7 = !{i32 2, !"Dwarf Version", i32 2} diff --git a/test/Transforms/LoopVectorize/dbg.value.ll b/test/Transforms/LoopVectorize/dbg.value.ll index c7440f84b2c9..f68b6865b072 100644 --- a/test/Transforms/LoopVectorize/dbg.value.ll +++ b/test/Transforms/LoopVectorize/dbg.value.ll @@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0" @C = global [1024 x i32] zeroinitializer, align 16 ; CHECK-LABEL: @test( -define i32 @test() #0 { +define i32 @test() #0 !dbg !3 { entry: tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !9, metadata !DIExpression()), !dbg !18 br label %for.body, !dbg !18 @@ -44,16 +44,16 @@ attributes #1 = { nounwind readnone } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!26} -!0 = !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang", isOptimized: true, emissionKind: 0, file: !25, enums: !1, retainedTypes: !1, subprograms: !2, globals: !11) +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang", isOptimized: true, emissionKind: 0, file: !25, enums: !1, retainedTypes: !1, subprograms: !2, globals: !11) !1 = !{} !2 = !{!3} -!3 = !DISubprogram(name: "test", linkageName: "test", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !25, 
scope: !4, type: !5, function: i32 ()* @test, variables: !8) +!3 = distinct !DISubprogram(name: "test", linkageName: "test", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !25, scope: !4, type: !5, variables: !8) !4 = !DIFile(filename: "test", directory: "/path/to/somewhere") !5 = !DISubroutineType(types: !6) !6 = !{!7} !7 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !8 = !{!9} -!9 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "i", line: 6, scope: !10, file: !4, type: !7) +!9 = !DILocalVariable(name: "i", line: 6, scope: !10, file: !4, type: !7) !10 = distinct !DILexicalBlock(line: 6, column: 0, file: !25, scope: !3) !11 = !{!12, !16, !17} !12 = !DIGlobalVariable(name: "A", line: 1, isLocal: false, isDefinition: true, scope: null, file: !4, type: !13, variable: [1024 x i32]* @A) diff --git a/test/Transforms/LoopVectorize/debugloc.ll b/test/Transforms/LoopVectorize/debugloc.ll index e691afdd6933..0214f1c4847c 100644 --- a/test/Transforms/LoopVectorize/debugloc.ll +++ b/test/Transforms/LoopVectorize/debugloc.ll @@ -12,12 +12,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK: load <2 x i32>, <2 x i32>* {{.*}}, !dbg ![[LOC2]] ; CHECK: add <2 x i32> {{.*}}, !dbg ![[LOC2]] ; CHECK: add i64 %index, 2, !dbg ![[LOC]] -; CHECK: icmp eq i64 %index.next, %end.idx.rnd.down, !dbg ![[LOC]] +; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]] ; CHECK: middle.block -; CHECK: add <2 x i32> %rdx.vec.exit.phi, %rdx.shuf, !dbg ![[LOC2]] +; CHECK: add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[LOC2]] ; CHECK: extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[LOC2]] -define i32 @f(i32* nocapture %a, i32 %size) #0 { +define i32 @f(i32* nocapture %a, i32 %size) #0 !dbg !4 { entry: tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !13, metadata !DIExpression()), !dbg !19 tail call void 
@llvm.dbg.value(metadata i32 %size, i64 0, metadata !14, metadata !DIExpression()), !dbg !19 @@ -63,11 +63,11 @@ attributes #1 = { nounwind readnone } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!18, !27} -!0 = !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 185038) (llvm/trunk 185097)", isOptimized: true, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 185038) (llvm/trunk 185097)", isOptimized: true, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) !1 = !DIFile(filename: "-", directory: "/Volumes/Data/backedup/dev/os/llvm/debug") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "f", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3, file: !5, scope: !6, type: !7, function: i32 (i32*, i32)* @f, variables: !12) +!4 = distinct !DISubprogram(name: "f", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3, file: !5, scope: !6, type: !7, variables: !12) !5 = !DIFile(filename: "<stdin>", directory: "/Volumes/Data/backedup/dev/os/llvm/debug") !6 = !DIFile(filename: "<stdin>", directory: "/Volumes/Data/backedup/dev/os/llvm/debug") !7 = !DISubroutineType(types: !8) @@ -76,10 +76,10 @@ attributes #1 = { nounwind readnone } !10 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !9) !11 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned) !12 = !{!13, !14, !15, !16} -!13 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "a", line: 3, arg: 1, scope: !4, file: !6, type: !10) -!14 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "size", line: 3, arg: 2, scope: !4, file: !6, type: !11) -!15 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "sum", line: 4, 
scope: !4, file: !6, type: !11) -!16 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "i", line: 5, scope: !17, file: !6, type: !11) +!13 = !DILocalVariable(name: "a", line: 3, arg: 1, scope: !4, file: !6, type: !10) +!14 = !DILocalVariable(name: "size", line: 3, arg: 2, scope: !4, file: !6, type: !11) +!15 = !DILocalVariable(name: "sum", line: 4, scope: !4, file: !6, type: !11) +!16 = !DILocalVariable(name: "i", line: 5, scope: !17, file: !6, type: !11) !17 = distinct !DILexicalBlock(line: 5, column: 0, file: !5, scope: !4) !18 = !{i32 2, !"Dwarf Version", i32 3} !19 = !DILocation(line: 3, scope: !4) diff --git a/test/Transforms/LoopVectorize/gep_with_bitcast.ll b/test/Transforms/LoopVectorize/gep_with_bitcast.ll new file mode 100644 index 000000000000..ab2fd5e4e1c6 --- /dev/null +++ b/test/Transforms/LoopVectorize/gep_with_bitcast.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Vectorization of loop with bitcast between GEP and load +; Simplified source code: +;void foo (double** __restrict__ in, bool * __restrict__ res) { +; +; for (int i = 0; i < 4096; ++i) +; res[i] = ((unsigned long long)in[i] == 0); +;} + +; CHECK-LABEL: @foo +; CHECK: vector.body +; CHECK: %0 = getelementptr inbounds double*, double** %in, i64 %index +; CHECK: %1 = bitcast double** %0 to <4 x i64>* +; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %1, align 8 +; CHECK: %2 = icmp eq <4 x i64> %wide.load, zeroinitializer +; CHECK: br i1 + +define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double*, double** %in, i64 %indvars.iv + %tmp53 = bitcast double** %arrayidx to i64* + %tmp54 = load i64, i64* %tmp53, align 8 + %cmp1 = icmp eq i64 
%tmp54, 0 + %arrayidx3 = getelementptr inbounds i8, i8* %res, i64 %indvars.iv + %frombool = zext i1 %cmp1 to i8 + store i8 %frombool, i8* %arrayidx3, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +}
\ No newline at end of file diff --git a/test/Transforms/LoopVectorize/if-pred-stores.ll b/test/Transforms/LoopVectorize/if-pred-stores.ll index 991d027ada5c..0d70f557f834 100644 --- a/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -1,5 +1,8 @@ -; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL -; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec < %s | FileCheck %s --check-prefix=VEC +; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL +; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY +; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -simplifycfg < %s | FileCheck %s --check-prefix=VEC +; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -simplifycfg -instcombine < %s | FileCheck %s --check-prefix=VEC-IC + target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" @@ -14,27 +17,49 @@ entry: ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true> ; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0 ; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true +; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 +; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]] ; ; VEC: [[cond]]: -; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 -; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 ; VEC: store i32 %[[v13]], 
i32* %[[v14]], align 4 ; VEC: br label %[[else:.+]] ; ; VEC: [[else]]: ; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1 ; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true +; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 +; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]] ; ; VEC: [[cond2]]: -; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 -; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 ; VEC: store i32 %[[v17]], i32* %[[v18]], align 4 ; VEC: br label %[[else2:.+]] ; ; VEC: [[else2]]: +; VEC-IC-LABEL: test +; VEC-IC: %[[v1:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100> +; VEC-IC: %[[v2:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20> +; VEC-IC: %[[v3:.+]] = extractelement <2 x i1> %[[v1]], i32 0 +; VEC-IC: br i1 %[[v3]], label %[[cond:.+]], label %[[else:.+]] +; +; VEC-IC: [[cond]]: +; VEC-IC: %[[v4:.+]] = extractelement <2 x i32> %[[v2]], i32 0 +; VEC-IC: store i32 %[[v4]], i32* %{{.*}}, align 4 +; VEC-IC: br label %[[else:.+]] +; +; VEC-IC: [[else]]: +; VEC-IC: %[[v5:.+]] = extractelement <2 x i1> %[[v1]], i32 1 +; VEC-IC: br i1 %[[v5]], label %[[cond2:.+]], label %[[else2:.+]] +; +; VEC-IC: [[cond2]]: +; VEC-IC: %[[v6:.+]] = extractelement <2 x i32> %[[v2]], i32 1 +; VEC-IC: store i32 %[[v6]], i32* %{{.*}}, align 4 +; VEC-IC: br label %[[else2:.+]] +; +; VEC-IC: [[else2]]: + ; UNROLL-LABEL: test ; UNROLL: vector.body: ; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0 @@ -90,9 +115,9 @@ for.end: ; vectorized loop body. 
; PR18724 -; UNROLL-LABEL: bug18724 -; UNROLL: store i32 -; UNROLL: store i32 +; UNROLL-NOSIMPLIFY-LABEL: bug18724 +; UNROLL-NOSIMPLIFY: store i32 +; UNROLL-NOSIMPLIFY: store i32 define void @bug18724() { entry: diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 2fbb2de797ae..59ee66a4a35d 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -6,8 +6,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: @multi_int_induction( ; CHECK: vector.body: ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %normalized.idx = sub i64 %index, 0 -; CHECK: %[[VAR:.*]] = trunc i64 %normalized.idx to i32 +; CHECK: %[[VAR:.*]] = trunc i64 %index to i32 ; CHECK: %offset.idx = add i32 190, %[[VAR]] define void @multi_int_induction(i32* %A, i32 %N) { for.body.lr.ph: @@ -113,12 +112,11 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; condition and branch directly to the scalar loop. 
; CHECK-LABEL: max_i32_backedgetaken -; CHECK: %backedge.overflow = icmp eq i32 -1, -1 -; CHECK: br i1 %backedge.overflow, label %scalar.ph, label %overflow.checked +; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked ; CHECK: scalar.ph: -; CHECK: %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ] -; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ %5, %middle.block ] +; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ] +; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ] define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { @@ -142,11 +140,10 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { ; CHECK-LABEL: testoverflowcheck ; CHECK: entry ; CHECK: %[[LOAD:.*]] = load i8 -; CHECK: %[[VAL:.*]] = zext i8 %[[LOAD]] to i32 ; CHECK: br ; CHECK: scalar.ph -; CHECK: phi i32 [ %{{.*}}, %middle.block ], [ %[[VAL]], %entry ] +; CHECK: phi i8 [ %{{.*}}, %middle.block ], [ %[[LOAD]], %entry ] @e = global i8 1, align 1 @d = common global i32 0, align 4 diff --git a/test/Transforms/LoopVectorize/miniters.ll b/test/Transforms/LoopVectorize/miniters.ll new file mode 100644 index 000000000000..81cb2d4ca5a1 --- /dev/null +++ b/test/Transforms/LoopVectorize/miniters.ll @@ -0,0 +1,45 @@ +; RUN: opt %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s +; RUN: opt %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s -check-prefix=UNROLL + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@b = common global [1000 x i32] zeroinitializer, align 16 +@c = common global [1000 x i32] zeroinitializer, align 16 +@a = common global [1000 x i32] zeroinitializer, align 16 + +; Generate min.iters.check to skip the vector loop and jump to scalar.ph directly when loop iteration number is less than VF * UF. 
+; CHECK-LABEL: foo( +; CHECK: %min.iters.check = icmp ult i64 %N, 4 +; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; UNROLL-LABEL: foo( +; UNROLL: %min.iters.check = icmp ult i64 %N, 8 +; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked + +define void @foo(i64 %N) { +entry: + %cmp.8 = icmp sgt i64 %N, 0 + br i1 %cmp.8, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader + %i.09 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @b, i64 0, i64 %i.09 + %tmp = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds [1000 x i32], [1000 x i32]* @c, i64 0, i64 %i.09 + %tmp1 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %tmp1, %tmp + %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %i.09 + store i32 %add, i32* %arrayidx2, align 4 + %inc = add nuw nsw i64 %i.09, 1 + %exitcond = icmp eq i64 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/test/Transforms/LoopVectorize/minmax_reduction.ll b/test/Transforms/LoopVectorize/minmax_reduction.ll index 5a0356fe11a2..19a401213fd5 100644 --- a/test/Transforms/LoopVectorize/minmax_reduction.ll +++ b/test/Transforms/LoopVectorize/minmax_reduction.ll @@ -412,10 +412,10 @@ for.end: ; Turn this into a max reduction in the presence of a no-nans-fp-math attribute. 
; CHECK-LABEL: @max_red_float( -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select i1 define float @max_red_float(float %max) #0 { @@ -427,7 +427,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ogt float %0, %max.red.08 + %cmp3 = fcmp fast ogt float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %0, float %max.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -438,10 +438,10 @@ for.end: } ; CHECK-LABEL: @max_red_float_ge( -; CHECK: fcmp oge <2 x float> +; CHECK: fcmp fast oge <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select i1 define float @max_red_float_ge(float %max) #0 { @@ -453,7 +453,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp oge float %0, %max.red.08 + %cmp3 = fcmp fast oge float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %0, float %max.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -464,10 +464,10 @@ for.end: } ; CHECK-LABEL: @inverted_max_red_float( -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select i1 define float @inverted_max_red_float(float %max) #0 { @@ -479,7 +479,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x 
float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp olt float %0, %max.red.08 + %cmp3 = fcmp fast olt float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %max.red.08, float %0 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -490,10 +490,10 @@ for.end: } ; CHECK-LABEL: @inverted_max_red_float_le( -; CHECK: fcmp ole <2 x float> +; CHECK: fcmp fast ole <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select i1 define float @inverted_max_red_float_le(float %max) #0 { @@ -505,7 +505,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ole float %0, %max.red.08 + %cmp3 = fcmp fast ole float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %max.red.08, float %0 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -516,10 +516,10 @@ for.end: } ; CHECK-LABEL: @unordered_max_red_float( -; CHECK: fcmp ole <2 x float> +; CHECK: fcmp fast ole <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select i1 define float @unordered_max_red_float(float %max) #0 { @@ -531,7 +531,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ugt float %0, %max.red.08 + %cmp3 = fcmp fast ugt float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %0, float %max.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -542,10 +542,10 @@ for.end: } ; CHECK-LABEL: 
@unordered_max_red_float_ge( -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select i1 define float @unordered_max_red_float_ge(float %max) #0 { @@ -557,7 +557,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp uge float %0, %max.red.08 + %cmp3 = fcmp fast uge float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %0, float %max.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -568,10 +568,10 @@ for.end: } ; CHECK-LABEL: @inverted_unordered_max_red_float( -; CHECK: fcmp oge <2 x float> +; CHECK: fcmp fast oge <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select i1 define float @inverted_unordered_max_red_float(float %max) #0 { @@ -583,7 +583,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ult float %0, %max.red.08 + %cmp3 = fcmp fast ult float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %max.red.08, float %0 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -594,10 +594,10 @@ for.end: } ; CHECK-LABEL: @inverted_unordered_max_red_float_le( -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select i1 define float @inverted_unordered_max_red_float_le(float %max) #0 { @@ -609,7 +609,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], 
[ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ule float %0, %max.red.08 + %cmp3 = fcmp fast ule float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %max.red.08, float %0 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -623,10 +623,10 @@ for.end: ; Turn this into a min reduction in the presence of a no-nans-fp-math attribute. ; CHECK-LABEL: @min_red_float( -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select i1 define float @min_red_float(float %min) #0 { @@ -638,7 +638,7 @@ for.body: %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp olt float %0, %min.red.08 + %cmp3 = fcmp fast olt float %0, %min.red.08 %min.red.0 = select i1 %cmp3, float %0, float %min.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -649,10 +649,10 @@ for.end: } ; CHECK-LABEL: @min_red_float_le( -; CHECK: fcmp ole <2 x float> +; CHECK: fcmp fast ole <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select i1 define float @min_red_float_le(float %min) #0 { @@ -664,7 +664,7 @@ for.body: %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ole float %0, %min.red.08 + %cmp3 = fcmp fast ole float %0, %min.red.08 %min.red.0 = select i1 %cmp3, float %0, float %min.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 
%exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -675,10 +675,10 @@ for.end: } ; CHECK-LABEL: @inverted_min_red_float( -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select i1 define float @inverted_min_red_float(float %min) #0 { @@ -690,7 +690,7 @@ for.body: %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ogt float %0, %min.red.08 + %cmp3 = fcmp fast ogt float %0, %min.red.08 %min.red.0 = select i1 %cmp3, float %min.red.08, float %0 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -701,10 +701,10 @@ for.end: } ; CHECK-LABEL: @inverted_min_red_float_ge( -; CHECK: fcmp oge <2 x float> +; CHECK: fcmp fast oge <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select i1 define float @inverted_min_red_float_ge(float %min) #0 { @@ -716,7 +716,7 @@ for.body: %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp oge float %0, %min.red.08 + %cmp3 = fcmp fast oge float %0, %min.red.08 %min.red.0 = select i1 %cmp3, float %min.red.08, float %0 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -727,10 +727,10 @@ for.end: } ; CHECK-LABEL: @unordered_min_red_float( -; CHECK: fcmp oge <2 x float> +; CHECK: fcmp fast oge <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select i1 define float @unordered_min_red_float(float %min) #0 { @@ -742,7 +742,7 @@ 
for.body: %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ult float %0, %min.red.08 + %cmp3 = fcmp fast ult float %0, %min.red.08 %min.red.0 = select i1 %cmp3, float %0, float %min.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -753,10 +753,10 @@ for.end: } ; CHECK-LABEL: @unordered_min_red_float_le( -; CHECK: fcmp ogt <2 x float> +; CHECK: fcmp fast ogt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select i1 define float @unordered_min_red_float_le(float %min) #0 { @@ -768,7 +768,7 @@ for.body: %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ule float %0, %min.red.08 + %cmp3 = fcmp fast ule float %0, %min.red.08 %min.red.0 = select i1 %cmp3, float %0, float %min.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -779,10 +779,10 @@ for.end: } ; CHECK-LABEL: @inverted_unordered_min_red_float( -; CHECK: fcmp ole <2 x float> +; CHECK: fcmp fast ole <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select i1 define float @inverted_unordered_min_red_float(float %min) #0 { @@ -794,7 +794,7 @@ for.body: %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ugt float %0, %min.red.08 + %cmp3 = fcmp fast ugt float %0, %min.red.08 %min.red.0 = select i1 %cmp3, float %min.red.08, float %0 %indvars.iv.next 
= add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -805,10 +805,10 @@ for.end: } ; CHECK-LABEL: @inverted_unordered_min_red_float_ge( -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x float> +; CHECK: fcmp fast olt <2 x float> ; CHECK: select i1 define float @inverted_unordered_min_red_float_ge(float %min) #0 { @@ -820,7 +820,7 @@ for.body: %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp uge float %0, %min.red.08 + %cmp3 = fcmp fast uge float %0, %min.red.08 %min.red.0 = select i1 %cmp3, float %min.red.08, float %0 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -832,10 +832,10 @@ for.end: ; Make sure we handle doubles, too. ; CHECK-LABEL: @min_red_double( -; CHECK: fcmp olt <2 x double> +; CHECK: fcmp fast olt <2 x double> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp olt <2 x double> +; CHECK: fcmp fast olt <2 x double> ; CHECK: select i1 define double @min_red_double(double %min) #0 { @@ -847,7 +847,7 @@ for.body: %min.red.08 = phi double [ %min, %entry ], [ %min.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x double], [1024 x double]* @dA, i64 0, i64 %indvars.iv %0 = load double, double* %arrayidx, align 4 - %cmp3 = fcmp olt double %0, %min.red.08 + %cmp3 = fcmp fast olt double %0, %min.red.08 %min.red.0 = select i1 %cmp3, double %0, double %min.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 @@ -871,7 +871,7 @@ for.body: %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ] %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv %0 = load float, float* %arrayidx, align 4 - %cmp3 = fcmp ogt float %0, 
%max.red.08 + %cmp3 = fcmp fast ogt float %0, %max.red.08 %max.red.0 = select i1 %cmp3, float %0, float %max.red.08 %indvars.iv.next = add i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 diff --git a/test/Transforms/LoopVectorize/no_array_bounds.ll b/test/Transforms/LoopVectorize/no_array_bounds.ll index f7c7ff7732b9..13cec71fc455 100644 --- a/test/Transforms/LoopVectorize/no_array_bounds.ll +++ b/test/Transforms/LoopVectorize/no_array_bounds.ll @@ -17,7 +17,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind ssp uwtable -define void @_Z4testPiS_i(i32* nocapture %A, i32* nocapture %B, i32 %number) #0 { +define void @_Z4testPiS_i(i32* nocapture %A, i32* nocapture %B, i32 %number) #0 !dbg !4 { entry: %cmp25 = icmp sgt i32 %number, 0, !dbg !10 br i1 %cmp25, label %for.body.preheader, label %for.end15, !dbg !10, !llvm.loop !12 @@ -72,11 +72,11 @@ attributes #0 = { nounwind } !llvm.module.flags = !{!7, !8} !llvm.ident = !{!9} -!0 = !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) !1 = !DIFile(filename: "no_array_bounds.cpp", directory: ".") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !5, type: !6, function: void (i32*, i32*, i32)* @_Z4testPiS_i, variables: !2) +!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2) !5 = !DIFile(filename: "no_array_bounds.cpp", 
directory: ".") !6 = !DISubroutineType(types: !2) !7 = !{i32 2, !"Dwarf Version", i32 2} diff --git a/test/Transforms/LoopVectorize/no_outside_user.ll b/test/Transforms/LoopVectorize/no_outside_user.ll index 7030b6b4df2d..2683b42dc717 100644 --- a/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/test/Transforms/LoopVectorize/no_outside_user.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -pass-remarks-analysis=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s 2>&1 | FileCheck %s ; CHECK: remark: {{.*}}: loop not vectorized: value could not be identified as an induction or reduction variable ; CHECK: remark: {{.*}}: loop not vectorized: use of induction value outside of the loop is not handled by vectorizer diff --git a/test/Transforms/LoopVectorize/no_switch.ll b/test/Transforms/LoopVectorize/no_switch.ll index 1f139c26d790..842d262d3192 100644 --- a/test/Transforms/LoopVectorize/no_switch.ll +++ b/test/Transforms/LoopVectorize/no_switch.ll @@ -1,9 +1,17 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO ; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement -; CHECK: remark: source.cpp:4:5: loop not vectorized: use -Rpass-analysis=loop-vectorize for more info (Force=true, Vector Width=4) ; CHECK: warning: source.cpp:4:5: loop not vectorized: failed explicitly specified loop vectorization +; NOANALYSIS-NOT: remark: {{.*}} +; NOANALYSIS: warning: source.cpp:4:5: loop not interleaved: 
failed explicitly specified loop interleaving + +; MOREINFO: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement +; MOREINFO: remark: source.cpp:4:5: loop not vectorized: use -Rpass-analysis=loop-vectorize for more info (Force=true, Vector Width=4) +; MOREINFO: warning: source.cpp:4:5: loop not vectorized: failed explicitly specified loop vectorization + ; CHECK: _Z11test_switchPii ; CHECK-NOT: x i32> ; CHECK: ret @@ -11,7 +19,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; Function Attrs: nounwind optsize ssp uwtable -define void @_Z11test_switchPii(i32* nocapture %A, i32 %Length) #0 { +define void @_Z11test_switchPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 { entry: %cmp18 = icmp sgt i32 %Length, 0, !dbg !10 br i1 %cmp18, label %for.body.preheader, label %for.end, !dbg !10, !llvm.loop !12 @@ -59,11 +67,11 @@ attributes #0 = { nounwind } !llvm.module.flags = !{!7, !8} !llvm.ident = !{!9} -!0 = !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) !1 = !DIFile(filename: "source.cpp", directory: ".") !2 = !{} !3 = !{!4} -!4 = !DISubprogram(name: "test_switch", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !5, type: !6, function: void (i32*, i32)* @_Z11test_switchPii, variables: !2) +!4 = distinct !DISubprogram(name: "test_switch", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2) !5 = !DIFile(filename: "source.cpp", directory: 
".") !6 = !DISubroutineType(types: !2) !7 = !{i32 2, !"Dwarf Version", i32 2} diff --git a/test/Transforms/LoopVectorize/nontemporal.ll b/test/Transforms/LoopVectorize/nontemporal.ll new file mode 100644 index 000000000000..106b19031228 --- /dev/null +++ b/test/Transforms/LoopVectorize/nontemporal.ll @@ -0,0 +1,47 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios5.0.0" + +; CHECK-LABEL: @foo( +define void @foo(float* noalias %a, float* noalias %b, float* noalias %c, i32 %N) { +entry: + %cmp.4 = icmp sgt i32 %N, 0 + br i1 %cmp.4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + +; Check that we don't lose !nontemporal hint when vectorizing loads. +; CHECK: %wide.load{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0 + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4, !nontemporal !0 + +; Check that we don't introduce !nontemporal hint when the original scalar loads didn't have it. +; CHECK: %wide.load{{[0-9]+}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}} + %arrayidx2 = getelementptr inbounds float, float* %c, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4 + %add = fadd float %0, %1 + +; Check that we don't lose !nontemporal hint when vectorizing stores. 
+; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0 + %arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %add, float* %arrayidx4, align 4, !nontemporal !0 + + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry +; CHECK: ret void + ret void +} + +!0 = !{i32 1} diff --git a/test/Transforms/LoopVectorize/optsize.ll b/test/Transforms/LoopVectorize/optsize.ll index e183fda099a2..513657cd3723 100644 --- a/test/Transforms/LoopVectorize/optsize.ll +++ b/test/Transforms/LoopVectorize/optsize.ll @@ -1,18 +1,17 @@ ; This test verifies that the loop vectorizer will NOT produce a tail -; loop with Optimize for size attibute. +; loop with the optimize for size or the minimize size attributes. ; REQUIRES: asserts -; RUN: opt < %s -loop-vectorize -Os -debug -debug-only=loop-vectorize -S 2>&1 | FileCheck %s - -;CHECK-NOT: <2 x i8> -;CHECK-NOT: <4 x i8> -;CHECK: Aborting. A tail loop is required in Os. 
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" @tab = common global [32 x i8] zeroinitializer, align 1 -; Function Attrs: nounwind optsize -define i32 @foo() #0 { +define i32 @foo_optsize() #0 { +; CHECK-LABEL: @foo_optsize( +; CHECK-NOT: <2 x i8> +; CHECK-NOT: <4 x i8> + entry: br label %for.body @@ -31,4 +30,30 @@ for.end: ; preds = %for.body ret i32 0 } -attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { optsize } + +define i32 @foo_minsize() #1 { +; CHECK-LABEL: @foo_minsize( +; CHECK-NOT: <2 x i8> +; CHECK-NOT: <4 x i8> + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +attributes #1 = { minsize } + diff --git a/test/Transforms/LoopVectorize/ptr-induction.ll b/test/Transforms/LoopVectorize/ptr-induction.ll new file mode 100644 index 000000000000..47d33352763d --- /dev/null +++ b/test/Transforms/LoopVectorize/ptr-induction.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; This testcase causes SCEV to return a pointer-typed exit value. + +; CHECK: @f +; Expect that the pointer indvar has been converted into an integer indvar. 
+; CHECK: %index.next = add i64 %index, 4 +define i32 @f(i32* readonly %a, i32* readnone %b) #0 { +entry: + %cmp.6 = icmp ult i32* %a, %b + br i1 %cmp.6, label %while.body.preheader, label %while.end + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %a.pn = phi i32* [ %incdec.ptr8, %while.body ], [ %a, %while.body.preheader ] + %acc.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ] + %incdec.ptr8 = getelementptr inbounds i32, i32* %a.pn, i64 1 + %0 = load i32, i32* %incdec.ptr8, align 1 + %add = add nuw nsw i32 %0, %acc.07 + %exitcond = icmp eq i32* %incdec.ptr8, %b + br i1 %exitcond, label %while.cond.while.end_crit_edge, label %while.body + +while.cond.while.end_crit_edge: ; preds = %while.body + %add.lcssa = phi i32 [ %add, %while.body ] + br label %while.end + +while.end: ; preds = %while.cond.while.end_crit_edge, %entry + %acc.0.lcssa = phi i32 [ %add.lcssa, %while.cond.while.end_crit_edge ], [ 0, %entry ] + ret i32 %acc.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll index 647e58a7e41f..63b138f1d560 100644 --- a/test/Transforms/LoopVectorize/reduction.ll +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -175,8 +175,8 @@ for.end: ; preds = %for.body, %entry } ;CHECK-LABEL: @reduction_and( -;CHECK: and <4 x i32> ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1> +;CHECK: and <4 x i32> ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> ;CHECK: and <4 x i32> ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> diff --git a/test/Transforms/LoopVectorize/reverse_induction.ll b/test/Transforms/LoopVectorize/reverse_induction.ll index 6b63a0d8db6c..88dd2e4d66ca 100644 --- a/test/Transforms/LoopVectorize/reverse_induction.ll +++ b/test/Transforms/LoopVectorize/reverse_induction.ll @@ -96,8 +96,7 @@ loopend: ; 
CHECK-LABEL: @reverse_forward_induction_i64_i8( ; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %normalized.idx = sub i64 %index, 0 -; CHECK: %offset.idx = sub i64 1023, %normalized.idx +; CHECK: %offset.idx = sub i64 1023, %index ; CHECK: trunc i64 %index to i8 define void @reverse_forward_induction_i64_i8() { @@ -122,10 +121,8 @@ while.end: ; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed( ; CHECK: vector.body: -; CHECK: %index = phi i64 [ 129, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %normalized.idx = sub i64 %index, 129 -; CHECK: %offset.idx = sub i64 1023, %normalized.idx -; CHECK: trunc i64 %index to i8 +; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i64 1023, %index define void @reverse_forward_induction_i64_i8_signed() { entry: diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll index 1f07d3f69594..3673b71db30d 100644 --- a/test/Transforms/LoopVectorize/runtime-check.ll +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -11,9 +11,9 @@ target triple = "x86_64-apple-macosx10.9.0" ;CHECK-LABEL: define i32 @foo ;CHECK: for.body.preheader: -;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] +;CHECK: br i1 %cmp.zero, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] ;CHECK: vector.memcheck: -;CHECK: br i1 %memcheck.conflict, label %middle.block, label %vector.ph, !dbg [[BODY_LOC]] +;CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph, !dbg [[BODY_LOC]] ;CHECK: load <4 x float> define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp { entry: @@ -73,7 +73,7 @@ loopexit: !2 = !{} !3 = !DISubroutineType(types: !2) !4 = !DIFile(filename: "test.cpp", directory: "/tmp") -!5 = !DISubprogram(name: "foo", scope: !4, file: !4, line: 99, type: !3, isLocal: false, 
isDefinition: true, scopeLine: 100, flags: DIFlagPrototyped, isOptimized: false, variables: !2) +!5 = distinct !DISubprogram(name: "foo", scope: !4, file: !4, line: 99, type: !3, isLocal: false, isDefinition: true, scopeLine: 100, flags: DIFlagPrototyped, isOptimized: false, variables: !2) !6 = !DILocation(line: 100, column: 1, scope: !5) !7 = !DILocation(line: 101, column: 1, scope: !5) !8 = !DILocation(line: 102, column: 1, scope: !5) diff --git a/test/Transforms/LoopVectorize/runtime-limit.ll b/test/Transforms/LoopVectorize/runtime-limit.ll index 6bc71e160ccd..a7f692cef170 100644 --- a/test/Transforms/LoopVectorize/runtime-limit.ll +++ b/test/Transforms/LoopVectorize/runtime-limit.ll @@ -1,12 +1,25 @@ -; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.8.0" + +; First loop produced diagnostic pass remark. +;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) +; Second loop produces diagnostic analysis remark. +;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations + +; First loop produced diagnostic pass remark. +;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) +; Second loop produces diagnostic pass remark. 
+;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) ; We are vectorizing with 6 runtime checks. ;CHECK-LABEL: func1x6( ;CHECK: <4 x i32> ;CHECK: ret +;OVERRIDE-LABEL: func1x6( +;OVERRIDE: <4 x i32> +;OVERRIDE: ret define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) { entry: br label %for.body @@ -41,6 +54,10 @@ for.end: ; preds = %for.body ;CHECK-LABEL: func2x6( ;CHECK-NOT: <4 x i32> ;CHECK: ret +; We vectorize with 12 checks if a vectorization hint is provided. +;OVERRIDE-LABEL: func2x6( +;OVERRIDE: <4 x i32> +;OVERRIDE: ret define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) { entry: br label %for.body |
