diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2014-11-24 09:08:18 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2014-11-24 09:08:18 +0000 |
| commit | 5ca98fd98791947eba83a1ed3f2c8191ef7afa6c (patch) | |
| tree | f5944309621cee4fe0976be6f9ac619b7ebfc4c2 /test/Transforms/LoopVectorize | |
| parent | 68bcb7db193e4bc81430063148253d30a791023e (diff) | |
Notes
Diffstat (limited to 'test/Transforms/LoopVectorize')
53 files changed, 2118 insertions, 63 deletions
diff --git a/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll b/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll new file mode 100644 index 000000000000..9962c3d76a66 --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Function Attrs: nounwind +define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) { +;CHECK-LABEL: array_add +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +entry: + %cmp10 = icmp sgt i32 %size, 0 + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %b, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32* %c, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32* %c +} diff --git a/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll b/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll new file mode 100644 index 000000000000..f8eb3ed1f35f --- /dev/null +++ 
b/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -loop-vectorize -mtriple=arm64-none-linux-gnu -mattr=+neon -S | FileCheck %s +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Function Attrs: nounwind +define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) { +;CHECK-LABEL: array_add +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +entry: + %cmp10 = icmp sgt i32 %size, 0 + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %b, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32* %c, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret i32* %c +} diff --git a/test/Transforms/LoopVectorize/AArch64/gather-cost.ll b/test/Transforms/LoopVectorize/AArch64/gather-cost.ll new file mode 100644 index 000000000000..bb285382e53c --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/gather-cost.ll @@ -0,0 +1,85 @@ +; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone < %s | FileCheck %s +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" + +@kernel = global [512 x float] zeroinitializer, align 16 +@kernel2 = global [512 x float] zeroinitializer, align 16 +@kernel3 = global [512 x float] zeroinitializer, align 16 +@kernel4 = global [512 x float] zeroinitializer, align 16 +@src_data = global [1536 x float] zeroinitializer, align 16 +@r_ = global i8 0, align 1 +@g_ = global i8 0, align 1 +@b_ = global i8 0, align 1 + +; We don't want to vectorize most loops containing gathers because they are +; expensive. +; Make sure we don't vectorize it. +; CHECK-NOT: x float> + +define void @_Z4testmm(i64 %size, i64 %offset) { +entry: + %cmp53 = icmp eq i64 %size, 0 + br i1 %cmp53, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: + br label %for.body + +for.body: + %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ] + %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ] + %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ] + %add = add i64 %v.055, %offset + %mul = mul i64 %add, 3 + %arrayidx = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %mul + %0 = load float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [512 x float]* @kernel, i64 0, i64 %v.055 + %1 = load float* %arrayidx2, align 4 + %mul3 = fmul fast float %0, %1 + %arrayidx4 = getelementptr inbounds [512 x float]* @kernel2, i64 0, i64 %v.055 + %2 = load float* %arrayidx4, align 4 + %mul5 = fmul fast float %mul3, %2 + %arrayidx6 = getelementptr inbounds [512 x float]* @kernel3, i64 0, i64 %v.055 + %3 = load float* %arrayidx6, align 4 + %mul7 = fmul fast float %mul5, %3 + %arrayidx8 = getelementptr inbounds [512 x float]* @kernel4, i64 0, i64 %v.055 + %4 = load float* %arrayidx8, align 4 + %mul9 = fmul fast float %mul7, %4 + %add10 = fadd fast float %r.057, %mul9 + 
%arrayidx.sum = add i64 %mul, 1 + %arrayidx11 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum + %5 = load float* %arrayidx11, align 4 + %mul13 = fmul fast float %1, %5 + %mul15 = fmul fast float %2, %mul13 + %mul17 = fmul fast float %3, %mul15 + %mul19 = fmul fast float %4, %mul17 + %add20 = fadd fast float %g.056, %mul19 + %arrayidx.sum52 = add i64 %mul, 2 + %arrayidx21 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52 + %6 = load float* %arrayidx21, align 4 + %mul23 = fmul fast float %1, %6 + %mul25 = fmul fast float %2, %mul23 + %mul27 = fmul fast float %3, %mul25 + %mul29 = fmul fast float %4, %mul27 + %add30 = fadd fast float %b.054, %mul29 + %inc = add i64 %v.055, 1 + %exitcond = icmp ne i64 %inc, %size + br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: + %add30.lcssa = phi float [ %add30, %for.body ] + %add20.lcssa = phi float [ %add20, %for.body ] + %add10.lcssa = phi float [ %add10, %for.body ] + %phitmp = fptoui float %add10.lcssa to i8 + %phitmp60 = fptoui float %add20.lcssa to i8 + %phitmp61 = fptoui float %add30.lcssa to i8 + br label %for.end + +for.end: + %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ] + %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ] + store i8 %r.0.lcssa, i8* @r_, align 1 + store i8 %g.0.lcssa, i8* @g_, align 1 + store i8 %b.0.lcssa, i8* @b_, align 1 + ret void +} diff --git a/test/Transforms/LoopVectorize/AArch64/lit.local.cfg b/test/Transforms/LoopVectorize/AArch64/lit.local.cfg new file mode 100644 index 000000000000..937cffb2c119 --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/lit.local.cfg @@ -0,0 +1,5 @@ +config.suffixes = ['.ll'] + +if not 'AArch64' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/LoopVectorize/ARM/arm-unroll.ll 
b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll index 39363ab2d802..8843fc2d2b1a 100644 --- a/test/Transforms/LoopVectorize/ARM/arm-unroll.ll +++ b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s ; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target triple = "thumbv7-apple-ios3.0.0" @@ -30,3 +31,41 @@ define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp { %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] ret i32 %sum.0.lcssa } + +; Verify the register limit. On arm we don't have 16 allocatable registers. +;SWIFTUNROLL-LABEL: @register_limit( +;SWIFTUNROLL: load i32 +;SWIFTUNROLL-NOT: load i32 +define i32 @register_limit(i32* nocapture %A, i32 %n) { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: + %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] + %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ] + %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ] + %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ] + %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i32 %i.02 + %3 = load i32* %2, align 4 + %4 = add nsw i32 %3, %sum.01 + %5 = add nsw i32 %i.02, 1 + %6 = add nsw i32 %3, %sum.02 + %7 = add nsw i32 %3, %sum.03 + %8 = add nsw i32 %3, %sum.04 + %9 = add nsw i32 %3, %sum.05 + %10 = add nsw i32 %3, %sum.05 + %exitcond = icmp eq i32 %5, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] 
+ %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ] + %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ] + %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ] + %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ] + ret i32 %sum.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/test/Transforms/LoopVectorize/ARM/lit.local.cfg index 8a3ba96497e7..98c6700c209d 100644 --- a/test/Transforms/LoopVectorize/ARM/lit.local.cfg +++ b/test/Transforms/LoopVectorize/ARM/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'ARM' in targets: +if not 'ARM' in config.root.targets: config.unsupported = True diff --git a/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg b/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg new file mode 100644 index 000000000000..5d33887ff0a4 --- /dev/null +++ b/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'PowerPC' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll b/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll new file mode 100644 index 000000000000..6cd9c4d610b5 --- /dev/null +++ b/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll @@ -0,0 +1,51 @@ +; RUN: opt < %s -mcpu=pwr7 -mattr=+vsx -loop-vectorize -instcombine -S | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.GlobalData = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float], [5 x i32], [12 x i8], [32000 x float], [7 x i32], [4 x i8], [32000 x float], [11 x i32], [4 x i8], [32000 x float], [13 x i32], [12 x i8], [256 x [256 x float]], [17 x i32], [12 x i8], [256 x [256 x float]], [19 x i32], [4 x i8], [256 x [256 x float]], [23 x i32], [4 x i8], [256 x [256 x float]] } + +@global_data = external global %struct.GlobalData, align 16 +@ntimes = external hidden unnamed_addr global i32, align 4 + 
+define signext i32 @s173() #0 { +entry: + %0 = load i32* @ntimes, align 4 + %cmp21 = icmp sgt i32 %0, 0 + br i1 %cmp21, label %for.cond1.preheader, label %for.end12 + +for.cond1.preheader: ; preds = %for.end, %entry + %nl.022 = phi i32 [ %inc11, %for.end ], [ 0, %entry ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx = getelementptr inbounds %struct.GlobalData* @global_data, i64 0, i32 0, i64 %indvars.iv + %1 = load float* %arrayidx, align 4 + %arrayidx5 = getelementptr inbounds %struct.GlobalData* @global_data, i64 0, i32 3, i64 %indvars.iv + %2 = load float* %arrayidx5, align 4 + %add = fadd float %1, %2 + %3 = add nsw i64 %indvars.iv, 16000 + %arrayidx8 = getelementptr inbounds %struct.GlobalData* @global_data, i64 0, i32 0, i64 %3 + store float %add, float* %arrayidx8, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 16000 + br i1 %exitcond, label %for.end, label %for.body3 + +for.end: ; preds = %for.body3 + %inc11 = add nsw i32 %nl.022, 1 + %4 = load i32* @ntimes, align 4 + %mul = mul nsw i32 %4, 10 + %cmp = icmp slt i32 %inc11, %mul + br i1 %cmp, label %for.cond1.preheader, label %for.end12 + +for.end12: ; preds = %for.end, %entry + ret i32 0 + +; CHECK-LABEL: @s173 +; CHECK: load <4 x float>* +; CHECK: add i64 %index, 16000 +; CHECK: ret i32 0 +} + +attributes #0 = { nounwind } + diff --git a/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/test/Transforms/LoopVectorize/X86/already-vectorized.ll index 885418c0fdd9..9c69ba87f392 100644 --- a/test/Transforms/LoopVectorize/X86/already-vectorized.ll +++ b/test/Transforms/LoopVectorize/X86/already-vectorized.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s +; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s ; REQUIRES: asserts ; We want to 
make sure that we don't even try to vectorize loops again ; The vectorizer used to mark the un-vectorized loop only as already vectorized @@ -40,7 +40,7 @@ for.end: ; preds = %for.body ; Now, we check for the Hint metadata ; CHECK: [[vect]] = metadata !{metadata [[vect]], metadata [[width:![0-9]+]], metadata [[unroll:![0-9]+]]} -; CHECK: [[width]] = metadata !{metadata !"llvm.vectorizer.width", i32 1} -; CHECK: [[unroll]] = metadata !{metadata !"llvm.vectorizer.unroll", i32 1} +; CHECK: [[width]] = metadata !{metadata !"llvm.loop.vectorize.width", i32 1} +; CHECK: [[unroll]] = metadata !{metadata !"llvm.loop.interleave.count", i32 1} ; CHECK: [[scalar]] = metadata !{metadata [[scalar]], metadata [[width]], metadata [[unroll]]} diff --git a/test/Transforms/LoopVectorize/X86/avx512.ll b/test/Transforms/LoopVectorize/X86/avx512.ll new file mode 100644 index 000000000000..a2208668177d --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/avx512.ll @@ -0,0 +1,35 @@ +; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +; Verify that we generate 512-bit wide vectors for a basic integer memset +; loop. 
+ +; CHECK-LABEL: f: +; CHECK: vmovdqu32 %zmm{{.}}, ( +; CHECK-NOT: %ymm + +define void @f(i32* %a, i32 %n) { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %n, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll b/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll new file mode 100644 index 000000000000..529ed883c3b4 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx" + +@float_array = common global [10000 x float] zeroinitializer, align 16 +@unsigned_array = common global [10000 x i32] zeroinitializer, align 16 + +; If we need to scalarize the fptoui and then use inserts to build up the +; vector again, then there is certainly no value in going 256-bit wide. 
+; CHECK-NOT: vinserti128 + +define void @convert(i32 %N) { +entry: + %0 = icmp eq i32 %N, 0 + br i1 %0, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds [10000 x float]* @float_array, i64 0, i64 %indvars.iv + %1 = load float* %arrayidx, align 4 + %conv = fptoui float %1 to i32 + %arrayidx2 = getelementptr inbounds [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + diff --git a/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll b/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll new file mode 100644 index 000000000000..ef3e3bec793a --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx" + +@n = global i32 10000, align 4 +@double_array = common global [10000 x double] zeroinitializer, align 16 +@unsigned_array = common global [10000 x i32] zeroinitializer, align 16 + +; If we need to scalarize the fptoui and then use inserts to build up the +; vector again, then there is certainly no value in going 256-bit wide. 
+; CHECK-NOT: vpinsrd + +define void @convert() { +entry: + %0 = load i32* @n, align 4 + %cmp4 = icmp eq i32 %0, 0 + br i1 %cmp4, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds [10000 x double]* @double_array, i64 0, i64 %indvars.iv + %1 = load double* %arrayidx, align 8 + %conv = fptoui double %1 to i32 + %arrayidx2 = getelementptr inbounds [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %2 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp ult i32 %2, %0 + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll b/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll new file mode 100644 index 000000000000..23e62275ce04 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + + +; CHECK: cost of 7 for VF 8 For instruction: %conv = fptosi float %tmp to i8 +define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind { +entry: + br label %for.body +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv + 
%tmp = load float* %arrayidx, align 4 + %conv = fptosi float %tmp to i8 + %arrayidx2 = getelementptr inbounds i8* %a, i64 %indvars.iv + store i8 %conv, i8* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/lit.local.cfg b/test/Transforms/LoopVectorize/X86/lit.local.cfg index ba763cf03ffc..e71f3cc4c41e 100644 --- a/test/Transforms/LoopVectorize/X86/lit.local.cfg +++ b/test/Transforms/LoopVectorize/X86/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'X86' in targets: +if not 'X86' in config.root.targets: config.unsupported = True diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll new file mode 100644 index 000000000000..8e0ca417b404 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -0,0 +1,176 @@ +; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1 +; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2 +; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3 +; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os +; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz +; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC +; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC +; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2 +; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2 +; RUN: opt < %s -mcpu=corei7 -O3 
-disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS + +; This file tests the llvm.loop.vectorize.enable metadata forcing +; vectorization even when optimization levels are too low, or when +; vectorization is disabled. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; O1-LABEL: @enabled( +; O1: store <4 x i32> +; O1: ret i32 +; O2-LABEL: @enabled( +; O2: store <4 x i32> +; O2: ret i32 +; O3-LABEL: @enabled( +; O3: store <4 x i32> +; O3: ret i32 +; Pragma always wins! +; O3DIS-LABEL: @enabled( +; O3DIS: store <4 x i32> +; O3DIS: ret i32 +; Os-LABEL: @enabled( +; Os: store <4 x i32> +; Os: ret i32 +; Oz-LABEL: @enabled( +; Oz: store <4 x i32> +; Oz: ret i32 +; O1VEC-LABEL: @enabled( +; O1VEC: store <4 x i32> +; O1VEC: ret i32 +; OzVEC-LABEL: @enabled( +; OzVEC: store <4 x i32> +; OzVEC: ret i32 +; O1VEC2-LABEL: @enabled( +; O1VEC2: store <4 x i32> +; O1VEC2: ret i32 +; OzVEC2-LABEL: @enabled( +; OzVEC2: store <4 x i32> +; OzVEC2: ret i32 + +define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 + +for.end: ; preds = %for.body + %1 = load i32* %a, align 4 + ret i32 %1 +} + +; O1-LABEL: @nopragma( +; O1-NOT: store <4 x i32> +; O1: ret i32 +; O2-LABEL: @nopragma( +; O2: store <4 x i32> +; O2: ret i32 
+; O3-LABEL: @nopragma( +; O3: store <4 x i32> +; O3: ret i32 +; O3DIS-LABEL: @nopragma( +; O3DIS-NOT: store <4 x i32> +; O3DIS: ret i32 +; Os-LABEL: @nopragma( +; Os: store <4 x i32> +; Os: ret i32 +; Oz-LABEL: @nopragma( +; Oz-NOT: store <4 x i32> +; Oz: ret i32 +; O1VEC-LABEL: @nopragma( +; O1VEC: store <4 x i32> +; O1VEC: ret i32 +; OzVEC-LABEL: @nopragma( +; OzVEC: store <4 x i32> +; OzVEC: ret i32 +; O1VEC2-LABEL: @nopragma( +; O1VEC2: store <4 x i32> +; O1VEC2: ret i32 +; OzVEC2-LABEL: @nopragma( +; OzVEC2: store <4 x i32> +; OzVEC2: ret i32 + +define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + %1 = load i32* %a, align 4 + ret i32 %1 +} + +; O1-LABEL: @disabled( +; O1-NOT: store <4 x i32> +; O1: ret i32 +; O2-LABEL: @disabled( +; O2-NOT: store <4 x i32> +; O2: ret i32 +; O3-LABEL: @disabled( +; O3-NOT: store <4 x i32> +; O3: ret i32 +; O3DIS-LABEL: @disabled( +; O3DIS-NOT: store <4 x i32> +; O3DIS: ret i32 +; Os-LABEL: @disabled( +; Os-NOT: store <4 x i32> +; Os: ret i32 +; Oz-LABEL: @disabled( +; Oz-NOT: store <4 x i32> +; Oz: ret i32 +; O1VEC-LABEL: @disabled( +; O1VEC-NOT: store <4 x i32> +; O1VEC: ret i32 +; OzVEC-LABEL: @disabled( +; OzVEC-NOT: store <4 x i32> +; OzVEC: ret i32 +; O1VEC2-LABEL: @disabled( +; O1VEC2-NOT: store <4 x i32> +; O1VEC2: ret i32 +; OzVEC2-LABEL: @disabled( +; OzVEC2-NOT: store <4 x i32> +; OzVEC2: ret i32 + +define i32 @disabled(i32* 
noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 32 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 + +for.end: ; preds = %for.body + %1 = load i32* %a, align 4 + ret i32 %1 +} + +!0 = metadata !{metadata !0, metadata !1} +!1 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 1} +!2 = metadata !{metadata !2, metadata !3} +!3 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 0} diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll index 14ac417bb573..bcf16aa5db13 100644 --- a/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/test/Transforms/LoopVectorize/X86/small-size.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -115,6 +115,31 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture ret void } +; N is unknown, we need a tail. Can't vectorize because the loop is cold. 
+;CHECK-LABEL: @example4( +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) { + %1 = icmp eq i32 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0 + +.lr.ph: ; preds = %0, %.lr.ph + %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ] + %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ] + %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ] + %2 = add nsw i32 %.05, -1 + %3 = getelementptr inbounds i32* %.023, i64 1 + %4 = load i32* %.023, align 16 + %5 = getelementptr inbounds i32* %.014, i64 1 + store i32 %4, i32* %.014, align 16 + %6 = icmp eq i32 %2, 0 + br i1 %6, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +!0 = metadata !{metadata !"branch_weights", i32 64, i32 4} ; We can't vectorize this one because we need a runtime ptr check. ;CHECK-LABEL: @example23( diff --git a/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll new file mode 100644 index 000000000000..86c32b2d2ee4 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll @@ -0,0 +1,26 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + + +; CHECK: cost of 20 for VF 2 For instruction: %conv = uitofp i64 %tmp to double +; CHECK: cost of 40 for VF 4 For instruction: %conv = uitofp i64 %tmp to double +define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind { +entry: + br label %for.body +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i64* %a, i64 %indvars.iv + 
%tmp = load i64* %arrayidx, align 4 + %conv = uitofp i64 %tmp to double + %arrayidx2 = getelementptr inbounds double* %b, i64 %indvars.iv + store double %conv, double* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll index ea107dc4dc51..d5024bb13210 100644 --- a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll +++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll @@ -1,13 +1,26 @@ -; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S \ +; RUN: | FileCheck %s --check-prefix=CHECK-VECTOR +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-unroll=0 -dce -S \ +; RUN: | FileCheck %s --check-prefix=CHECK-SCALAR target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -;CHECK-LABEL: @foo( -;CHECK: load <4 x i32> -;CHECK-NOT: load <4 x i32> -;CHECK: store <4 x i32> -;CHECK-NOT: store <4 x i32> -;CHECK: ret + +; We don't unroll this loop because it has a small constant trip count. 
+; +; CHECK-VECTOR-LABEL: @foo( +; CHECK-VECTOR: load <4 x i32> +; CHECK-VECTOR-NOT: load <4 x i32> +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR-NOT: store <4 x i32> +; CHECK-VECTOR: ret +; +; CHECK-SCALAR-LABEL: @foo( +; CHECK-SCALAR: load i32* +; CHECK-SCALAR-NOT: load i32* +; CHECK-SCALAR: store i32 +; CHECK-SCALAR-NOT: store i32 +; CHECK-SCALAR: ret define i32 @foo(i32* nocapture %A) nounwind uwtable ssp { br label %1 @@ -26,10 +39,18 @@ define i32 @foo(i32* nocapture %A) nounwind uwtable ssp { ret i32 undef } -;CHECK-LABEL: @bar( -;CHECK: store <4 x i32> -;CHECK: store <4 x i32> -;CHECK: ret +; But this is a good small loop to unroll as we don't know of a bound on its +; trip count. +; +; CHECK-VECTOR-LABEL: @bar( +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR: store <4 x i32> +; CHECK-VECTOR: ret +; +; CHECK-SCALAR-LABEL: @bar( +; CHECK-SCALAR: store i32 +; CHECK-SCALAR: store i32 +; CHECK-SCALAR: ret define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -48,3 +69,32 @@ define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp { ._crit_edge: ; preds = %.lr.ph, %0 ret i32 undef } + +; Also unroll if we need a runtime check but it was going to be added for +; vectorization anyways. +; CHECK-VECTOR-LABEL: @runtime_chk( +; CHECK-VECTOR: store <4 x float> +; CHECK-VECTOR: store <4 x float> +; +; But not if the unrolling would introduce the runtime check. 
+; CHECK-SCALAR-LABEL: @runtime_chk( +; CHECK-SCALAR: store float +; CHECK-SCALAR-NOT: store float +define void @runtime_chk(float* %A, float* %B, float %N) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4 + %mul = fmul float %0, %N + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.ll new file mode 100644 index 000000000000..074313bde6f5 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.ll @@ -0,0 +1,93 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: LV: Loop hints: force=enabled +; CHECK: LV: Loop hints: force=? +; No more loops in the module +; CHECK-NOT: LV: Loop hints: force= +; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 1 loop-vectorize - Number of loops vectorized + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; +; The source code for the test: +; +; #include <math.h> +; void foo(float* restrict A, float * restrict B, int size) +; { +; for (int i = 0; i < size; ++i) A[i] = sinf(B[i]); +; } +; + +; +; This loop will be vectorized, although the scalar cost is lower than any of vector costs, but vectorization is explicitly forced in metadata. 
+; + +define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) { +entry: + %cmp6 = icmp sgt i32 %size, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1 + %call = tail call float @llvm.sin.f32(float %0) + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1 + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +!1 = metadata !{metadata !1, metadata !2} +!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true} + +; +; This method will not be vectorized, as scalar cost is lower than any of vector costs. 
+; + +define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) { +entry: + %cmp6 = icmp sgt i32 %size, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3 + %call = tail call float @llvm.sin.f32(float %0) + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3 + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +declare float @llvm.sin.f32(float) nounwind readnone + +; Dummy metadata +!3 = metadata !{metadata !3} + diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll new file mode 100644 index 000000000000..97c31a148e3a --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -0,0 +1,73 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S -vectorizer-min-trip-count=21 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: LV: Loop hints: force=enabled +; CHECK: LV: Loop hints: force=? 
+; No more loops in the module +; CHECK-NOT: LV: Loop hints: force= +; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 1 loop-vectorize - Number of loops vectorized + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; +; The source code for the test: +; +; void foo(float* restrict A, float* restrict B) +; { +; for (int i = 0; i < 20; ++i) A[i] += B[i]; +; } +; + +; +; This loop will be vectorized, although the trip count is below the threshold, but vectorization is explicitly forced in metadata. +; +define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1 + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + %1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret void +} + +!1 = metadata !{metadata !1, metadata !2} +!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true} + +; +; This loop will not be vectorized as the trip count is below the threshold. 
+; +define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3 + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + %1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3 + +for.end: + ret void +} + +!3 = metadata !{metadata !3} + diff --git a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll index 59bb8d0054c5..e57cfefec07c 100644 --- a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll +++ b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s +; RUN: opt -basicaa -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll new file mode 100644 index 000000000000..93f24cbf4909 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll @@ -0,0 +1,161 @@ +; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s + +; C/C++ code for tests +; void test(int *A, int 
Length) { +; #pragma clang loop vectorize(enable) interleave(enable) +; for (int i = 0; i < Length; i++) { +; A[i] = i; +; if (A[i] > Length) +; break; +; } +; } + +; void test_disabled(int *A, int Length) { +; #pragma clang loop vectorize(disable) interleave(disable) +; for (int i = 0; i < Length; i++) +; A[i] = i; +; } + +; void test_array_bounds(int *A, int *B, int Length) { +; #pragma clang loop vectorize(enable) +; for (int i = 0; i < Length; i++) +; A[i] = A[B[i]]; +; } + +; File, line, and column should match those specified in the metadata +; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations +; CHECK: remark: source.cpp:4:5: loop not vectorized: vectorization was not specified +; CHECK: remark: source.cpp:13:5: loop not vectorized: vector width and interleave count are explicitly set to 1 +; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds +; CHECK: remark: source.cpp:19:5: loop not vectorized: vectorization is explicitly enabled +; CHECK: warning: source.cpp:19:5: loop not vectorized: failed explicitly specified loop vectorization + +; CHECK: _Z4testPii +; CHECK-NOT: x i32> +; CHECK: ret + +; CHECK: _Z13test_disabledPii +; CHECK-NOT: x i32> +; CHECK: ret + +; CHECK: _Z17test_array_boundsPiS_i +; CHECK-NOT: x i32> +; CHECK: ret + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Function Attrs: nounwind optsize ssp uwtable +define void @_Z4testPii(i32* nocapture %A, i32 %Length) #0 { +entry: + %cmp10 = icmp sgt i32 %Length, 0, !dbg !12 + br i1 %cmp10, label %for.body, label %for.end, !dbg !12, !llvm.loop !14 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !16 + %0 = trunc i64 %indvars.iv to i32, !dbg !16 + store i32 %0, i32* %arrayidx, align 4, !dbg !16, !tbaa !18 + %cmp3 = icmp sle i32 %0, %Length, !dbg !22 + %indvars.iv.next = add 
nuw nsw i64 %indvars.iv, 1, !dbg !12 + %1 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %1, %Length, !dbg !12 + %or.cond = and i1 %cmp3, %cmp, !dbg !22 + br i1 %or.cond, label %for.body, label %for.end, !dbg !22 + +for.end: ; preds = %for.body, %entry + ret void, !dbg !24 +} + +; Function Attrs: nounwind optsize ssp uwtable +define void @_Z13test_disabledPii(i32* nocapture %A, i32 %Length) #0 { +entry: + %cmp4 = icmp sgt i32 %Length, 0, !dbg !25 + br i1 %cmp4, label %for.body, label %for.end, !dbg !25, !llvm.loop !27 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !30 + %0 = trunc i64 %indvars.iv to i32, !dbg !30 + store i32 %0, i32* %arrayidx, align 4, !dbg !30, !tbaa !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !25 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !25 + %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !25 + br i1 %exitcond, label %for.end, label %for.body, !dbg !25, !llvm.loop !27 + +for.end: ; preds = %for.body, %entry + ret void, !dbg !31 +} + +; Function Attrs: nounwind optsize ssp uwtable +define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 { +entry: + %cmp9 = icmp sgt i32 %Length, 0, !dbg !32 + br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32, !llvm.loop !34 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !35 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv, !dbg !35 + %0 = load i32* %arrayidx, align 4, !dbg !35, !tbaa !18 + %idxprom1 = sext i32 %0 to i64, !dbg !35 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1, !dbg !35 + %1 = load i32* %arrayidx2, align 4, !dbg !35, !tbaa !18 + %arrayidx4 = getelementptr inbounds i32* %A, i64 
%indvars.iv, !dbg !35 + store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32 + %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !34 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void, !dbg !36 +} + +attributes #0 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!9, !10} +!llvm.ident = !{!11} + +!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} +!1 = metadata !{metadata !"source.cpp", metadata !"."} +!2 = metadata !{} +!3 = metadata !{metadata !4, metadata !7, metadata !8} +!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @_Z4testPii, null, null, metadata !2, i32 1} +!5 = metadata !{i32 786473, metadata !1} +!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} +!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_disabled", metadata !"test_disabled", metadata !"", i32 10, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @_Z13test_disabledPii, null, null, metadata !2, i32 10} +!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_array_bounds", metadata !"test_array_bounds", metadata !"", i32 16, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, null, null, metadata !2, i32 16} +!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2} +!10 = metadata !{i32 
2, metadata !"Debug Info Version", i32 1} +!11 = metadata !{metadata !"clang version 3.5.0"} +!12 = metadata !{i32 3, i32 8, metadata !13, null} +!13 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0} +!14 = metadata !{metadata !14, metadata !15, metadata !15} +!15 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true} +!16 = metadata !{i32 4, i32 5, metadata !17, null} +!17 = metadata !{i32 786443, metadata !1, metadata !13, i32 3, i32 36, i32 0, i32 1} +!18 = metadata !{metadata !19, metadata !19, i64 0} +!19 = metadata !{metadata !"int", metadata !20, i64 0} +!20 = metadata !{metadata !"omnipotent char", metadata !21, i64 0} +!21 = metadata !{metadata !"Simple C/C++ TBAA"} +!22 = metadata !{i32 5, i32 9, metadata !23, null} +!23 = metadata !{i32 786443, metadata !1, metadata !17, i32 5, i32 9, i32 0, i32 2} +!24 = metadata !{i32 8, i32 1, metadata !4, null} +!25 = metadata !{i32 12, i32 8, metadata !26, null} +!26 = metadata !{i32 786443, metadata !1, metadata !7, i32 12, i32 3, i32 0, i32 3} +!27 = metadata !{metadata !27, metadata !28, metadata !29} +!28 = metadata !{metadata !"llvm.loop.interleave.count", i32 1} +!29 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1} +!30 = metadata !{i32 13, i32 5, metadata !26, null} +!31 = metadata !{i32 14, i32 1, metadata !7, null} +!32 = metadata !{i32 18, i32 8, metadata !33, null} +!33 = metadata !{i32 786443, metadata !1, metadata !8, i32 18, i32 3, i32 0, i32 4} +!34 = metadata !{metadata !34, metadata !15} +!35 = metadata !{i32 19, i32 5, metadata !33, null} +!36 = metadata !{i32 20, i32 1, metadata !8, null} diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll new file mode 100644 index 000000000000..f6834477ff51 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S 
-pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s + +; This code has all the !dbg annotations needed to track source line information, +; but is missing the llvm.dbg.cu annotation. This prevents code generation from +; emitting debug info in the final output. +; RUN: llc -mtriple x86_64-pc-linux-gnu %s -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s +; DEBUG-OUTPUT-NOT: .loc +; DEBUG-OUTPUT-NOT: {{.*}}.debug_info + +; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization factor: 4, unrolling interleave factor: 1) +; UNROLLED: remark: vectorization-remarks.c:17:8: unrolled with interleaving factor 4 (vectorization not beneficial) +; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vector width and interleave count are explicitly set to 1 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define i32 @foo(i32 %n) #0 { +entry: + %diff = alloca i32, align 4 + %cb = alloca [16 x i8], align 16 + %cc = alloca [16 x i8], align 16 + store i32 0, i32* %diff, align 4, !dbg !10, !tbaa !11 + br label %for.body, !dbg !15 + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ], !dbg !19 + %arrayidx = getelementptr inbounds [16 x i8]* %cb, i64 0, i64 %indvars.iv, !dbg !19 + %0 = load i8* %arrayidx, align 1, !dbg !19, !tbaa !21 + %conv = sext i8 %0 to i32, !dbg !19 + %arrayidx2 = getelementptr inbounds [16 x i8]* %cc, i64 0, i64 %indvars.iv, !dbg !19 + %1 = load i8* %arrayidx2, align 1, !dbg !19, !tbaa !21 + %conv3 = sext i8 
%1 to i32, !dbg !19 + %sub = sub i32 %conv, %conv3, !dbg !19 + %add = add nsw i32 %sub, %add8, !dbg !19 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !15 + %exitcond = icmp eq i64 %indvars.iv.next, 16, !dbg !15 + br i1 %exitcond, label %for.end, label %for.body, !dbg !15 + +for.end: ; preds = %for.body + store i32 %add, i32* %diff, align 4, !dbg !19, !tbaa !11 + call void @ibar(i32* %diff) #2, !dbg !22 + ret i32 0, !dbg !23 +} + +declare void @ibar(i32*) #1 + +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} + +!1 = metadata !{metadata !"vectorization-remarks.c", metadata !"."} +!2 = metadata !{} +!3 = metadata !{metadata !4} +!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @foo, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [foo] +!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [./vectorization-remarks.c] +!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4} +!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!9 = metadata !{metadata !"clang version 3.5.0 "} +!10 = metadata !{i32 8, i32 3, metadata !4, null} ; [ DW_TAG_imported_declaration ] +!11 = metadata !{metadata !12, metadata !12, i64 0} +!12 = metadata !{metadata !"int", metadata !13, i64 0} +!13 = metadata !{metadata !"omnipotent char", metadata !14, i64 0} +!14 = metadata !{metadata !"Simple C/C++ TBAA"} +!15 = metadata !{i32 17, i32 8, metadata !16, null} +!16 = metadata !{i32 786443, metadata !1, metadata !17, i32 17, i32 8, i32 2, i32 3} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c] +!17 = metadata !{i32 786443, metadata !1, metadata !18, i32 17, i32 8, i32 1, i32 2} ; 
[ DW_TAG_lexical_block ] [./vectorization-remarks.c] +!18 = metadata !{i32 786443, metadata !1, metadata !4, i32 17, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c] +!19 = metadata !{i32 18, i32 5, metadata !20, null} +!20 = metadata !{i32 786443, metadata !1, metadata !18, i32 17, i32 27, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c] +!21 = metadata !{metadata !13, metadata !13, i64 0} +!22 = metadata !{i32 20, i32 3, metadata !4, null} +!23 = metadata !{i32 21, i32 3, metadata !4, null} diff --git a/test/Transforms/LoopVectorize/XCore/lit.local.cfg b/test/Transforms/LoopVectorize/XCore/lit.local.cfg index 4d17d4642045..bb48713fe33e 100644 --- a/test/Transforms/LoopVectorize/XCore/lit.local.cfg +++ b/test/Transforms/LoopVectorize/XCore/lit.local.cfg @@ -1,3 +1,2 @@ -targets = set(config.root.targets_to_build.split()) -if not 'XCore' in targets: +if not 'XCore' in config.root.targets: config.unsupported = True diff --git a/test/Transforms/LoopVectorize/calloc.ll b/test/Transforms/LoopVectorize/calloc.ll index 7e7991616459..55c0a605450f 100644 --- a/test/Transforms/LoopVectorize/calloc.ll +++ b/test/Transforms/LoopVectorize/calloc.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" diff --git a/test/Transforms/LoopVectorize/control-flow.ll b/test/Transforms/LoopVectorize/control-flow.ll new file mode 100644 index 000000000000..e4ba77fa3daa --- /dev/null +++ b/test/Transforms/LoopVectorize/control-flow.ll @@ -0,0 +1,78 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S 
-pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s + +; C/C++ code for control flow test +; int test(int *A, int Length) { +; for (int i = 0; i < Length; i++) { +; if (A[i] > 10.0) goto end; +; A[i] = 0; +; } +; end: +; return 0; +; } + +; CHECK: remark: source.cpp:5:9: loop not vectorized: loop control flow is not understood by vectorizer +; CHECK: remark: source.cpp:5:9: loop not vectorized: vectorization was not specified + +; CHECK: _Z4testPii +; CHECK-NOT: x i32> +; CHECK: ret + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Function Attrs: nounwind optsize ssp uwtable +define i32 @_Z4testPii(i32* nocapture %A, i32 %Length) #0 { +entry: + %cmp8 = icmp sgt i32 %Length, 0, !dbg !10 + br i1 %cmp8, label %for.body.preheader, label %end, !dbg !10 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !12 + +for.body: ; preds = %for.body.preheader, %if.else + %indvars.iv = phi i64 [ %indvars.iv.next, %if.else ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !12 + %0 = load i32* %arrayidx, align 4, !dbg !12, !tbaa !15 + %cmp1 = icmp sgt i32 %0, 10, !dbg !12 + br i1 %cmp1, label %end.loopexit, label %if.else, !dbg !12 + +if.else: ; preds = %for.body + store i32 0, i32* %arrayidx, align 4, !dbg !19, !tbaa !15 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10 + %1 = trunc i64 %indvars.iv.next to i32, !dbg !10 + %cmp = icmp slt i32 %1, %Length, !dbg !10 + br i1 %cmp, label %for.body, label %end.loopexit, !dbg !10 + +end.loopexit: ; preds = %if.else, %for.body + br label %end + +end: ; preds = %end.loopexit, %entry + ret i32 0, !dbg !20 +} + +attributes #0 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} + +!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", 
i32 2} +!1 = metadata !{metadata !"source.cpp", metadata !"."} +!2 = metadata !{} +!3 = metadata !{metadata !4} +!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32*, i32)* @_Z4testPii, null, null, metadata !2, i32 2} +!5 = metadata !{i32 786473, metadata !1} +!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} +!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2} +!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 1} +!9 = metadata !{metadata !"clang version 3.5.0"} +!10 = metadata !{i32 3, i32 8, metadata !11, null} +!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0} +!12 = metadata !{i32 5, i32 9, metadata !13, null} +!13 = metadata !{i32 786443, metadata !1, metadata !14, i32 5, i32 9, i32 0, i32 2} +!14 = metadata !{i32 786443, metadata !1, metadata !11, i32 4, i32 3, i32 0, i32 1} +!15 = metadata !{metadata !16, metadata !16, i64 0} +!16 = metadata !{metadata !"int", metadata !17, i64 0} +!17 = metadata !{metadata !"omnipotent char", metadata !18, i64 0} +!18 = metadata !{metadata !"Simple C/C++ TBAA"} +!19 = metadata !{i32 8, i32 7, metadata !13, null} +!20 = metadata !{i32 12, i32 3, metadata !4, null} diff --git a/test/Transforms/LoopVectorize/flags.ll b/test/Transforms/LoopVectorize/flags.ll index a4ebb4284881..21d09372d546 100644 --- a/test/Transforms/LoopVectorize/flags.ll +++ b/test/Transforms/LoopVectorize/flags.ll @@ -51,3 +51,29 @@ define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp { ._crit_edge: ; preds = %.lr.ph, %0 ret i32 undef } + +; Make sure we copy fast math flags and use them for the final reduction. 
+; CHECK-LABEL: fast_math +; CHECK: load <4 x float> +; CHECK: fadd fast <4 x float> +; CHECK: br +; CHECK: fadd fast <4 x float> +; CHECK: fadd fast <4 x float> +define float @fast_math(float* noalias %s) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %q.04 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float* %s, i64 %indvars.iv + %0 = load float* %arrayidx, align 4 + %add = fadd fast float %q.04, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + %add.lcssa = phi float [ %add, %for.body ] + ret float %add.lcssa +} diff --git a/test/Transforms/LoopVectorize/float-reduction.ll b/test/Transforms/LoopVectorize/float-reduction.ll index c45098dd2c3b..0dfbab07279a 100644 --- a/test/Transforms/LoopVectorize/float-reduction.ll +++ b/test/Transforms/LoopVectorize/float-reduction.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" ;CHECK-LABEL: @foo( -;CHECK: fadd <4 x float> +;CHECK: fadd fast <4 x float> ;CHECK: ret define float @foo(float* nocapture %A, i32* nocapture %n) nounwind uwtable readonly ssp { entry: diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll index d8959d4c106a..b6cde5d00f5e 100644 --- a/test/Transforms/LoopVectorize/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine -S | FileCheck %s -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL +; RUN: opt < %s -basicaa -loop-vectorize 
-force-vector-width=4 -force-vector-unroll=1 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -basicaa -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/global_alias.ll b/test/Transforms/LoopVectorize/global_alias.ll index 0118fb47412a..d64d67f6a5b1 100644 --- a/test/Transforms/LoopVectorize/global_alias.ll +++ b/test/Transforms/LoopVectorize/global_alias.ll @@ -387,7 +387,7 @@ for.end: ; preds = %for.cond ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias08( -; CHECK: sub nsw <4 x i32> +; CHECK: sub <4 x i32> ; CHECK: ret define i32 @noAlias08(i32 %a) #0 { @@ -439,7 +439,7 @@ for.end: ; preds = %for.cond ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias09( -; CHECK: sub nsw <4 x i32> +; CHECK: sub <4 x i32> ; CHECK: ret define i32 @noAlias09(i32 %a) #0 { @@ -491,7 +491,7 @@ for.end: ; preds = %for.cond ; return *(PA+a); ; } ; CHECK-LABEL: define i32 @noAlias10( -; CHECK-NOT: sub nsw <4 x i32> +; CHECK-NOT: sub {{.*}} <4 x i32> ; CHECK: ret ; ; TODO: This test vectorizes (with run-time check) on real targets with -O3) @@ -721,7 +721,7 @@ for.end: ; preds = %for.cond ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @noAlias14( -; CHECK: sub nsw <4 x i32> +; CHECK: sub <4 x i32> ; CHECK: ret define i32 @noAlias14(i32 %a) #0 { diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll index dbe0243a8110..6e3e8ed27853 100644 --- a/test/Transforms/LoopVectorize/if-conversion.ll +++ b/test/Transforms/LoopVectorize/if-conversion.ll @@ -156,7 +156,7 @@ for.body: br i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 0), i32** @c), label %cond.false, label %cond.end 
cond.false: - %cond.1 = or i32 %inc3, sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 0), i32** @c) to i32)) + %cond.1 = or i32 %inc3, sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)) br label %cond.end cond.end: diff --git a/test/Transforms/LoopVectorize/if-pred-stores.ll b/test/Transforms/LoopVectorize/if-pred-stores.ll new file mode 100644 index 000000000000..7b0e181c845f --- /dev/null +++ b/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -0,0 +1,126 @@ +; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-unroll=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL +; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-unroll=1 -loop-vectorize -enable-cond-stores-vec < %s | FileCheck %s --check-prefix=VEC +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +; Test predication of stores. 
+define i32 @test(i32* nocapture %f) #0 { +entry: + br label %for.body + +; VEC-LABEL: test +; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100> +; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20> +; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true> +; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0 +; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true +; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]] +; +; VEC: [[cond]]: +; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 +; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0 +; VEC: store i32 %[[v13]], i32* %[[v14]], align 4 +; VEC: br label %[[else:.+]] +; +; VEC: [[else]]: +; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1 +; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true +; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]] +; +; VEC: [[cond2]]: +; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 +; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1 +; VEC: store i32 %[[v17]], i32* %[[v18]], align 4 +; VEC: br label %[[else2:.+]] +; +; VEC: [[else2]]: + +; UNROLL-LABEL: test +; UNROLL: vector.body: +; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0 +; UNROLL: %[[IND1:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 1 +; UNROLL: %[[v0:[a-zA-Z0-9]+]] = getelementptr inbounds i32* %f, i64 %[[IND]] +; UNROLL: %[[v1:[a-zA-Z0-9]+]] = getelementptr inbounds i32* %f, i64 %[[IND1]] +; UNROLL: %[[v2:[a-zA-Z0-9]+]] = load i32* %[[v0]], align 4 +; UNROLL: %[[v3:[a-zA-Z0-9]+]] = load i32* %[[v1]], align 4 +; UNROLL: %[[v4:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v2]], 100 +; UNROLL: %[[v5:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v3]], 100 +; UNROLL: %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20 +; UNROLL: %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20 +; UNROLL: %[[v8:[a-zA-Z0-9]+]] = icmp eq i1 %[[v4]], true +; UNROLL: br i1 %[[v8]], label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]] +; +; UNROLL: [[cond]]: +; UNROLL: store i32 
%[[v6]], i32* %[[v0]], align 4 +; UNROLL: br label %[[else]] +; +; UNROLL: [[else]]: +; UNROLL: %[[v9:[a-zA-Z0-9]+]] = icmp eq i1 %[[v5]], true +; UNROLL: br i1 %[[v9]], label %[[cond2:[a-zA-Z0-9.]+]], label %[[else2:[a-zA-Z0-9.]+]] +; +; UNROLL: [[cond2]]: +; UNROLL: store i32 %[[v7]], i32* %[[v1]], align 4 +; UNROLL: br label %[[else2]] +; +; UNROLL: [[else2]]: + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32* %f, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %0, 100 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %add = add nsw i32 %0, 20 + store i32 %add, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 128 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret i32 0 +} + +; Track basic blocks when unrolling conditional blocks. This code used to assert +; because we did not update the phi nodes with the proper predecessor in the +; vectorized loop body. 
+; PR18724 + +; UNROLL-LABEL: bug18724 +; UNROLL: store i32 +; UNROLL: store i32 + +define void @bug18724() { +entry: + br label %for.body9 + +for.body9: + br i1 undef, label %for.inc26, label %for.body14 + +for.body14: + %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc23 ], [ undef, %for.body9 ] + %iNewChunks.120 = phi i32 [ %iNewChunks.2, %for.inc23 ], [ undef, %for.body9 ] + %arrayidx16 = getelementptr inbounds [768 x i32]* undef, i64 0, i64 %indvars.iv3 + %tmp = load i32* %arrayidx16, align 4 + br i1 undef, label %if.then18, label %for.inc23 + +if.then18: + store i32 2, i32* %arrayidx16, align 4 + %inc21 = add nsw i32 %iNewChunks.120, 1 + br label %for.inc23 + +for.inc23: + %iNewChunks.2 = phi i32 [ %inc21, %if.then18 ], [ %iNewChunks.120, %for.body14 ] + %indvars.iv.next4 = add nsw i64 %indvars.iv3, 1 + %tmp1 = trunc i64 %indvars.iv3 to i32 + %cmp13 = icmp slt i32 %tmp1, 0 + br i1 %cmp13, label %for.body14, label %for.inc26 + +for.inc26: + %iNewChunks.1.lcssa = phi i32 [ undef, %for.body9 ], [ %iNewChunks.2, %for.inc23 ] + unreachable +} diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll index d35bd58a0281..71bedb7334ac 100644 --- a/test/Transforms/LoopVectorize/increment.ll +++ b/test/Transforms/LoopVectorize/increment.ll @@ -34,7 +34,7 @@ define void @inc(i32 %n) nounwind uwtable noinline ssp { ret void } -; Can't vectorize this loop because the access to A[X] is non linear. +; Can't vectorize this loop because the access to A[X] is non-linear. 
; ; for (i = 0; i < n; ++i) { ; A[B[i]]++; diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 50c3b6b6e79b..7dabcb2ba04f 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -75,7 +75,7 @@ loopexit: ; PR17532 ; CHECK-LABEL: i8_loop -; CHECK; icmp eq i32 {{.*}}, 256 +; CHECK: icmp eq i32 {{.*}}, 256 define i32 @i8_loop() nounwind readnone ssp uwtable { br label %1 @@ -92,7 +92,7 @@ define i32 @i8_loop() nounwind readnone ssp uwtable { } ; CHECK-LABEL: i16_loop -; CHECK; icmp eq i32 {{.*}}, 65536 +; CHECK: icmp eq i32 {{.*}}, 65536 define i32 @i16_loop() nounwind readnone ssp uwtable { br label %1 @@ -108,3 +108,64 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; <label>:5 ; preds = %1 ret i32 %2 } + +; This loop has a backedge taken count of i32_max. We need to check for this +; condition and branch directly to the scalar loop. + +; CHECK-LABEL: max_i32_backedgetaken +; CHECK: %backedge.overflow = icmp eq i32 -1, -1 +; CHECK: br i1 %backedge.overflow, label %scalar.ph, label %overflow.checked + +; CHECK: scalar.ph: +; CHECK: %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ] +; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ %5, %middle.block ] + +define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { + + br label %1 + +; <label>:1 ; preds = %1, %0 + %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ] + %b.0 = phi i32 [ 0, %0 ], [ %3, %1 ] + %2 = and i32 %a.0, 4 + %3 = add i32 %b.0, -1 + %4 = icmp eq i32 %3, 0 + br i1 %4, label %5, label %1 + +; <label>:5 ; preds = %1 + ret i32 %2 +} + +; When generating the overflow check we must make sure that the induction start value +; is defined before the branch to the scalar preheader. 
+ +; CHECK-LABEL: testoverflowcheck +; CHECK: entry +; CHECK: %[[LOAD:.*]] = load i8 +; CHECK: %[[VAL:.*]] = zext i8 %[[LOAD]] to i32 +; CHECK: br + +; CHECK: scalar.ph +; CHECK: phi i32 [ %{{.*}}, %middle.block ], [ %[[VAL]], %entry ] + +@e = global i8 1, align 1 +@d = common global i32 0, align 4 +@c = common global i32 0, align 4 +define i32 @testoverflowcheck() { +entry: + %.pr.i = load i8* @e, align 1 + %0 = load i32* @d, align 4 + %c.promoted.i = load i32* @c, align 4 + br label %cond.end.i + +cond.end.i: + %inc4.i = phi i8 [ %.pr.i, %entry ], [ %inc.i, %cond.end.i ] + %and3.i = phi i32 [ %c.promoted.i, %entry ], [ %and.i, %cond.end.i ] + %and.i = and i32 %0, %and3.i + %inc.i = add i8 %inc4.i, 1 + %tobool.i = icmp eq i8 %inc.i, 0 + br i1 %tobool.i, label %loopexit, label %cond.end.i + +loopexit: + ret i32 %and.i +} diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll index c3d570c03a77..7dfaf03b0f2d 100644 --- a/test/Transforms/LoopVectorize/intrinsic.ll +++ b/test/Transforms/LoopVectorize/intrinsic.ll @@ -1090,3 +1090,105 @@ for.end: ; preds = %for.body ret void } +declare double @llvm.powi.f64(double %Val, i32 %power) nounwind readnone + +;CHECK-LABEL: @powi_f64( +;CHECK: llvm.powi.v4f64 +;CHECK: ret void +define void @powi_f64(i32 %n, double* noalias %y, double* noalias %x, i32 %P) nounwind uwtable { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv + %0 = load double* %arrayidx, align 8 + %call = tail call double @llvm.powi.f64(double %0, i32 %P) nounwind readnone + %arrayidx4 = getelementptr inbounds double* %x, i64 %indvars.iv + store double %call, double* %arrayidx4, align 8 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 
%lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @powi_f64_neg( +;CHECK-NOT: llvm.powi.v4f64 +;CHECK: ret void +define void @powi_f64_neg(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv + %0 = load double* %arrayidx, align 8 + %1 = trunc i64 %indvars.iv to i32 + %call = tail call double @llvm.powi.f64(double %0, i32 %1) nounwind readnone + %arrayidx4 = getelementptr inbounds double* %x, i64 %indvars.iv + store double %call, double* %arrayidx4, align 8 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i64 @llvm.cttz.i64 (i64, i1) nounwind readnone + +;CHECK-LABEL: @cttz_f64( +;CHECK: llvm.cttz.v4i64 +;CHECK: ret void +define void @cttz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i64* %y, i64 %indvars.iv + %0 = load i64* %arrayidx, align 8 + %call = tail call i64 @llvm.cttz.i64(i64 %0, i1 true) nounwind readnone + %arrayidx4 = getelementptr inbounds i64* %x, i64 %indvars.iv + store i64 %call, i64* %arrayidx4, align 8 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} 
+ +declare i64 @llvm.ctlz.i64 (i64, i1) nounwind readnone + +;CHECK-LABEL: @ctlz_f64( +;CHECK: llvm.ctlz.v4i64 +;CHECK: ret void +define void @ctlz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i64* %y, i64 %indvars.iv + %0 = load i64* %arrayidx, align 8 + %call = tail call i64 @llvm.ctlz.i64(i64 %0, i1 true) nounwind readnone + %arrayidx4 = getelementptr inbounds i64* %x, i64 %indvars.iv + store i64 %call, i64* %arrayidx4, align 8 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} diff --git a/test/Transforms/LoopVectorize/metadata-unroll.ll b/test/Transforms/LoopVectorize/metadata-unroll.ll index 7f1037200659..848f1f9601b9 100644 --- a/test/Transforms/LoopVectorize/metadata-unroll.ll +++ b/test/Transforms/LoopVectorize/metadata-unroll.ll @@ -38,4 +38,4 @@ define void @inc(i32 %n) nounwind uwtable noinline ssp { } !0 = metadata !{metadata !0, metadata !1} -!1 = metadata !{metadata !"llvm.vectorizer.unroll", i32 2} +!1 = metadata !{metadata !"llvm.loop.interleave.count", i32 2} diff --git a/test/Transforms/LoopVectorize/metadata-width.ll b/test/Transforms/LoopVectorize/metadata-width.ll index 1960c0bad6bc..87de655da6f2 100644 --- a/test/Transforms/LoopVectorize/metadata-width.ll +++ b/test/Transforms/LoopVectorize/metadata-width.ll @@ -28,4 +28,4 @@ for.end: ; preds = %for.body, %entry attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } !0 = 
metadata !{metadata !0, metadata !1} -!1 = metadata !{metadata !"llvm.vectorizer.width", i32 8} +!1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 8} diff --git a/test/Transforms/LoopVectorize/metadata.ll b/test/Transforms/LoopVectorize/metadata.ll new file mode 100644 index 000000000000..bdcf1c9fb229 --- /dev/null +++ b/test/Transforms/LoopVectorize/metadata.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !tbaa !0 + %conv = fptosi float %0 to i32 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %conv, i32* %arrayidx2, align 4, !tbaa !4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1600 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +; CHECK-LABEL: @test1 +; CHECK: load <4 x float>* %{{.*}}, align 4, !tbaa ![[TFLT:[0-9]+]] +; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa ![[TINT:[0-9]+]] +; CHECK: ret i32 0 + +; CHECK-DAG: ![[TFLT]] = metadata !{metadata ![[TFLT1:[0-9]+]] +; CHECK-DAG: ![[TFLT1]] = metadata !{metadata !"float" + +; CHECK-DAG: ![[TINT]] = metadata !{metadata ![[TINT1:[0-9]+]] +; CHECK-DAG: ![[TINT1]] = metadata !{metadata !"int" + +attributes #0 = { nounwind uwtable } + +!0 = metadata !{metadata !1, metadata !1, i64 0} +!1 = metadata !{metadata !"float", metadata !2, i64 0} +!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0} +!3 = metadata !{metadata !"Simple 
C/C++ TBAA"} +!4 = metadata !{metadata !5, metadata !5, i64 0} +!5 = metadata !{metadata !"int", metadata !2, i64 0} + diff --git a/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll b/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll index 5fc5ed55a99d..88a29c50df5a 100644 --- a/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll +++ b/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll @@ -1,6 +1,6 @@ ; RUN: opt -indvars -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S < %s | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" ; We must not vectorize this loop. %add55 is not reduction. Its value is used diff --git a/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/test/Transforms/LoopVectorize/multiple-address-spaces.ll index 7d836dedbdbb..d64662838e00 100644 --- a/test/Transforms/LoopVectorize/multiple-address-spaces.ll +++ b/test/Transforms/LoopVectorize/multiple-address-spaces.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s ; From a simple program with two address spaces: ; char Y[4*10000] __attribute__((address_space(1))); diff --git a/test/Transforms/LoopVectorize/no_array_bounds.ll b/test/Transforms/LoopVectorize/no_array_bounds.ll new file mode 100644 index 000000000000..240b1b5d49dc --- /dev/null +++ b/test/Transforms/LoopVectorize/no_array_bounds.ll @@ -0,0 +1,101 @@ +; RUN: opt < %s -loop-vectorize -S 2>&1 | FileCheck %s + +; Verify warning is generated when vectorization/interleaving is explicitly specified and fails to occur. 
+; CHECK: warning: no_array_bounds.cpp:5:5: loop not vectorized: failed explicitly specified loop vectorization +; CHECK: warning: no_array_bounds.cpp:10:5: loop not interleaved: failed explicitly specified loop interleaving + +; #pragma clang loop vectorize(enable) +; for (int i = 0; i < number; i++) { +; A[B[i]]++; +; } + +; #pragma clang loop vectorize(disable) interleave(enable) +; for (int i = 0; i < number; i++) { +; B[A[i]]++; +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Function Attrs: nounwind ssp uwtable +define void @_Z4testPiS_i(i32* nocapture %A, i32* nocapture %B, i32 %number) #0 { +entry: + %cmp25 = icmp sgt i32 %number, 0, !dbg !10 + br i1 %cmp25, label %for.body.preheader, label %for.end15, !dbg !10, !llvm.loop !12 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !14 + +for.cond5.preheader: ; preds = %for.body + br i1 %cmp25, label %for.body7.preheader, label %for.end15, !dbg !16, !llvm.loop !18 + +for.body7.preheader: ; preds = %for.cond5.preheader + br label %for.body7, !dbg !20 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv27 = phi i64 [ %indvars.iv.next28, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv27, !dbg !14 + %0 = load i32* %arrayidx, align 4, !dbg !14, !tbaa !22 + %idxprom1 = sext i32 %0 to i64, !dbg !14 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1, !dbg !14 + %1 = load i32* %arrayidx2, align 4, !dbg !14, !tbaa !22 + %inc = add nsw i32 %1, 1, !dbg !14 + store i32 %inc, i32* %arrayidx2, align 4, !dbg !14, !tbaa !22 + %indvars.iv.next28 = add nuw nsw i64 %indvars.iv27, 1, !dbg !10 + %lftr.wideiv29 = trunc i64 %indvars.iv.next28 to i32, !dbg !10 + %exitcond30 = icmp eq i32 %lftr.wideiv29, %number, !dbg !10 + br i1 %exitcond30, label %for.cond5.preheader, label %for.body, !dbg !10, !llvm.loop !12 + +for.body7: ; preds = %for.body7.preheader, %for.body7 + %indvars.iv = phi i64 [ %indvars.iv.next, 
%for.body7 ], [ 0, %for.body7.preheader ] + %arrayidx9 = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !20 + %2 = load i32* %arrayidx9, align 4, !dbg !20, !tbaa !22 + %idxprom10 = sext i32 %2 to i64, !dbg !20 + %arrayidx11 = getelementptr inbounds i32* %B, i64 %idxprom10, !dbg !20 + %3 = load i32* %arrayidx11, align 4, !dbg !20, !tbaa !22 + %inc12 = add nsw i32 %3, 1, !dbg !20 + store i32 %inc12, i32* %arrayidx11, align 4, !dbg !20, !tbaa !22 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !16 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !16 + %exitcond = icmp eq i32 %lftr.wideiv, %number, !dbg !16 + br i1 %exitcond, label %for.end15.loopexit, label %for.body7, !dbg !16, !llvm.loop !18 + +for.end15.loopexit: ; preds = %for.body7 + br label %for.end15 + +for.end15: ; preds = %for.end15.loopexit, %entry, %for.cond5.preheader + ret void, !dbg !26 +} + +attributes #0 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} + +!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} +!1 = metadata !{metadata !"no_array_bounds.cpp", metadata !"."} +!2 = metadata !{} +!3 = metadata !{metadata !4} +!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*, i32)* @_Z4testPiS_i, null, null, metadata !2, i32 2} +!5 = metadata !{i32 786473, metadata !1} +!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} +!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2} +!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 1} +!9 = metadata !{metadata !"clang version 3.5.0"} +!10 = metadata !{i32 4, i32 8, metadata !11, null} +!11 = metadata !{i32 786443, 
metadata !1, metadata !4, i32 4, i32 3, i32 0, i32 0} +!12 = metadata !{metadata !12, metadata !13} +!13 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true} +!14 = metadata !{i32 5, i32 5, metadata !15, null} +!15 = metadata !{i32 786443, metadata !1, metadata !11, i32 4, i32 36, i32 0, i32 1} +!16 = metadata !{i32 9, i32 8, metadata !17, null} +!17 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 3, i32 0, i32 2} +!18 = metadata !{metadata !18, metadata !13, metadata !19} +!19 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1} +!20 = metadata !{i32 10, i32 5, metadata !21, null} +!21 = metadata !{i32 786443, metadata !1, metadata !17, i32 9, i32 36, i32 0, i32 3} +!22 = metadata !{metadata !23, metadata !23, i64 0} +!23 = metadata !{metadata !"int", metadata !24, i64 0} +!24 = metadata !{metadata !"omnipotent char", metadata !25, i64 0} +!25 = metadata !{metadata !"Simple C/C++ TBAA"} +!26 = metadata !{i32 12, i32 1, metadata !4, null} diff --git a/test/Transforms/LoopVectorize/no_switch.ll b/test/Transforms/LoopVectorize/no_switch.ll new file mode 100644 index 000000000000..8f654e41d4c8 --- /dev/null +++ b/test/Transforms/LoopVectorize/no_switch.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s + +; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement +; CHECK: remark: source.cpp:4:5: loop not vectorized: vectorization is explicitly enabled with width 4 +; CHECK: warning: source.cpp:4:5: loop not vectorized: failed explicitly specified loop vectorization + +; CHECK: _Z11test_switchPii +; CHECK-NOT: x i32> +; CHECK: ret + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Function Attrs: nounwind optsize ssp uwtable +define void @_Z11test_switchPii(i32* nocapture %A, i32 %Length) #0 { +entry: + %cmp18 = icmp sgt i32 %Length, 0, !dbg !10 + br i1 %cmp18, label 
%for.body.preheader, label %for.end, !dbg !10, !llvm.loop !12 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !14 + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !14 + %0 = load i32* %arrayidx, align 4, !dbg !14, !tbaa !16 + switch i32 %0, label %for.inc [ + i32 0, label %sw.bb + i32 1, label %sw.bb3 + ], !dbg !14 + +sw.bb: ; preds = %for.body + %1 = trunc i64 %indvars.iv to i32, !dbg !20 + %mul = shl nsw i32 %1, 1, !dbg !20 + br label %for.inc, !dbg !22 + +sw.bb3: ; preds = %for.body + %2 = trunc i64 %indvars.iv to i32, !dbg !23 + store i32 %2, i32* %arrayidx, align 4, !dbg !23, !tbaa !16 + br label %for.inc, !dbg !23 + +for.inc: ; preds = %sw.bb3, %for.body, %sw.bb + %storemerge = phi i32 [ %mul, %sw.bb ], [ 0, %for.body ], [ 0, %sw.bb3 ] + store i32 %storemerge, i32* %arrayidx, align 4, !dbg !20, !tbaa !16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10 + %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !10 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !10, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void, !dbg !24 +} + +attributes #0 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} + +!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} +!1 = metadata !{metadata !"source.cpp", metadata !"."} +!2 = metadata !{} +!3 = metadata !{metadata !4} +!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_switch", metadata !"test_switch", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 
true, void (i32*, i32)* @_Z11test_switchPii, null, null, metadata !2, i32 1} +!5 = metadata !{i32 786473, metadata !1} +!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} +!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2} +!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 1} +!9 = metadata !{metadata !"clang version 3.5.0"} +!10 = metadata !{i32 3, i32 8, metadata !11, null} +!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0} +!12 = metadata !{metadata !12, metadata !13, metadata !13} +!13 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true} +!14 = metadata !{i32 4, i32 5, metadata !15, null} +!15 = metadata !{i32 786443, metadata !1, metadata !11, i32 3, i32 36, i32 0, i32 1} +!16 = metadata !{metadata !17, metadata !17, i64 0} +!17 = metadata !{metadata !"int", metadata !18, i64 0} +!18 = metadata !{metadata !"omnipotent char", metadata !19, i64 0} +!19 = metadata !{metadata !"Simple C/C++ TBAA"} +!20 = metadata !{i32 6, i32 7, metadata !21, null} +!21 = metadata !{i32 786443, metadata !1, metadata !15, i32 4, i32 18, i32 0, i32 2} +!22 = metadata !{i32 7, i32 5, metadata !21, null} +!23 = metadata !{i32 9, i32 7, metadata !21, null} +!24 = metadata !{i32 14, i32 1, metadata !4, null} diff --git a/test/Transforms/LoopVectorize/ptr_loops.ll b/test/Transforms/LoopVectorize/ptr_loops.ll index 15983f068556..1259e21ebf2e 100644 --- a/test/Transforms/LoopVectorize/ptr_loops.ll +++ b/test/Transforms/LoopVectorize/ptr_loops.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s +; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/test/Transforms/LoopVectorize/runtime-check-address-space.ll index 6c86561a1c7e..5bf7020a475a 100644 --- a/test/Transforms/LoopVectorize/runtime-check-address-space.ll +++ b/test/Transforms/LoopVectorize/runtime-check-address-space.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s +; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s ; Check vectorization that would ordinarily require a runtime bounds ; check on the pointers when mixing address spaces. For now we cannot diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll index a2b9ad94c837..73b28301b7e7 100644 --- a/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ b/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -5,13 +5,16 @@ target triple = "x86_64-apple-macosx10.8.0" ;CHECK-LABEL: @add_ints( ;CHECK: br +;CHECK: br ;CHECK: getelementptr -;CHECK-NEXT: getelementptr -;CHECK-NEXT: icmp uge -;CHECK-NEXT: icmp uge -;CHECK-NEXT: icmp uge -;CHECK-NEXT: icmp uge -;CHECK-NEXT: and +;CHECK-DAG: getelementptr +;CHECK-DAG: icmp uge +;CHECK-DAG: icmp uge +;CHECK-DAG: icmp uge +;CHECK-DAG: icmp uge +;CHECK-DAG: and +;CHECK-DAG: and +;CHECK: br ;CHECK: ret define void @add_ints(i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) { entry: diff --git a/test/Transforms/LoopVectorize/store-shuffle-bug.ll b/test/Transforms/LoopVectorize/store-shuffle-bug.ll index 0ec8010756d1..26f4d156df61 100644 --- a/test/Transforms/LoopVectorize/store-shuffle-bug.ll +++ 
b/test/Transforms/LoopVectorize/store-shuffle-bug.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s +; RUN: opt -S -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -19,18 +19,13 @@ entry: ; CHECK-LABEL: @t( ; CHECK: vector.body: -; CHECK: load <4 x i32> -; CHECK: [[VAR1:%[a-zA-Z0-9]+]] = shufflevector -; CHECK: load <4 x i32> -; CHECK: [[VAR2:%[a-zA-Z0-9]+]] = shufflevector +; CHECK: [[VAR1:%[a-zA-Z0-9.]+]] = load <4 x i32> +; CHECK: [[VAR2:%[a-zA-Z0-9.]+]] = load <4 x i32> ; CHECK: [[VAR3:%[a-zA-Z0-9]+]] = add nsw <4 x i32> [[VAR2]], [[VAR1]] -; CHECK: [[VAR4:%[a-zA-Z0-9]+]] = shufflevector <4 x i32> [[VAR3]] -; CHECK: store <4 x i32> [[VAR4]] -; CHECK: load <4 x i32> -; CHECK: [[VAR5:%[a-zA-Z0-9]+]] = shufflevector -; CHECK-NOT: add nsw <4 x i32> [[VAR4]], [[VAR5]] -; CHECK-NOT: add nsw <4 x i32> [[VAR5]], [[VAR4]] -; CHECK: add nsw <4 x i32> [[VAR3]], [[VAR5]] +; CHECK: store <4 x i32> [[VAR3]] +; CHECK: [[VAR4:%[a-zA-Z0-9.]+]] = load <4 x i32> +; CHECK: add nsw <4 x i32> [[VAR3]], [[VAR4]] +; CHECK-NOT: shufflevector for.body: %indvars.iv = phi i64 [ 93, %entry ], [ %indvars.iv.next, %for.body ] diff --git a/test/Transforms/LoopVectorize/tbaa-nodep.ll b/test/Transforms/LoopVectorize/tbaa-nodep.ll new file mode 100644 index 000000000000..f31b3072bc6c --- /dev/null +++ b/test/Transforms/LoopVectorize/tbaa-nodep.ll @@ -0,0 +1,102 @@ +; RUN: opt < %s -tbaa -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -simplifycfg -S | FileCheck %s +; RUN: opt < %s -basicaa -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -simplifycfg -S | FileCheck %s 
--check-prefix=CHECK-NOTBAA +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !tbaa !0 + %conv = fptosi float %0 to i32 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + store i32 %conv, i32* %arrayidx2, align 4, !tbaa !4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1600 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 + +; TBAA partitions the accesses in this loop, so it can be vectorized without +; runtime checks. + +; CHECK-LABEL: @test1 +; CHECK: entry: +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: + +; CHECK: load <4 x float>* %{{.*}}, align 4, !tbaa +; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa + +; CHECK: ret i32 0 + +; CHECK-NOTBAA-LABEL: @test1 +; CHECK-NOTBAA: icmp uge i32* + +; CHECK-NOTBAA: load <4 x float>* %{{.*}}, align 4, !tbaa +; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa + +; CHECK-NOTBAA: ret i32 0 +} + +; Function Attrs: nounwind uwtable +define i32 @test2(i32* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !tbaa !0 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4, !tbaa !4 + %conv = sitofp i32 %1 to float + %mul = fmul float %0, %conv + 
%arrayidx4 = getelementptr inbounds float* %c, i64 %indvars.iv + store float %mul, float* %arrayidx4, align 4, !tbaa !0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1600 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 + +; This test is like the first, except here there is still one runtime check +; required. Without TBAA, however, two checks are required. + +; CHECK-LABEL: @test2 +; CHECK: icmp uge float* +; CHECK: icmp uge float* +; CHECK-NOT: icmp uge i32* + +; CHECK: load <4 x float>* %{{.*}}, align 4, !tbaa +; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa + +; CHECK: ret i32 0 + +; CHECK-NOTBAA-LABEL: @test2 +; CHECK-NOTBAA: icmp uge float* +; CHECK-NOTBAA: icmp uge float* +; CHECK-NOTBAA-DAG: icmp uge float* +; CHECK-NOTBAA-DAG: icmp uge i32* + +; CHECK-NOTBAA: load <4 x float>* %{{.*}}, align 4, !tbaa +; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa + +; CHECK-NOTBAA: ret i32 0 +} + +attributes #0 = { nounwind uwtable } + +!0 = metadata !{metadata !1, metadata !1, i64 0} +!1 = metadata !{metadata !"float", metadata !2, i64 0} +!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0} +!3 = metadata !{metadata !"Simple C/C++ TBAA"} +!4 = metadata !{metadata !5, metadata !5, i64 0} +!5 = metadata !{metadata !"int", metadata !2, i64 0} + diff --git a/test/Transforms/LoopVectorize/unroll_novec.ll b/test/Transforms/LoopVectorize/unroll_novec.ll index 33f128da905d..89f4678526de 100644 --- a/test/Transforms/LoopVectorize/unroll_novec.ll +++ b/test/Transforms/LoopVectorize/unroll_novec.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-unroll=8 -force-target-instruction-cost=1 -small-loop-cost=40 -dce -instcombine -S 
| FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -12,10 +12,20 @@ target triple = "x86_64-apple-macosx10.8.0" ;CHECK-LABEL: @inc( ;CHECK: load i32* ;CHECK: load i32* +;CHECK: load i32* +;CHECK: load i32* +;CHECK-NOT: load i32* +;CHECK: add nsw i32 ;CHECK: add nsw i32 ;CHECK: add nsw i32 +;CHECK: add nsw i32 +;CHECK-NOT: add nsw i32 +;CHECK: store i32 +;CHECK: store i32 ;CHECK: store i32 ;CHECK: store i32 +;CHECK-NOT: store i32 +;CHECK: add i64 %{{.*}}, 4 ;CHECK: ret void define void @inc(i32 %n) nounwind uwtable noinline ssp { %1 = icmp sgt i32 %n, 0 diff --git a/test/Transforms/LoopVectorize/value-ptr-bug.ll b/test/Transforms/LoopVectorize/value-ptr-bug.ll index e8d37285f803..6b06afaf0de2 100644 --- a/test/Transforms/LoopVectorize/value-ptr-bug.ll +++ b/test/Transforms/LoopVectorize/value-ptr-bug.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; PR16073 -; Because we were caching value pointers accross a function call that could RAUW +; Because we were caching value pointers across a function call that could RAUW ; we would generate an undefined value store below: ; SCEVExpander::expandCodeFor would change a value (the start value of an ; induction) that we cached in the induction variable list. 
diff --git a/test/Transforms/LoopVectorize/vect.omp.persistence.ll b/test/Transforms/LoopVectorize/vect.omp.persistence.ll new file mode 100644 index 000000000000..f6465677839e --- /dev/null +++ b/test/Transforms/LoopVectorize/vect.omp.persistence.ll @@ -0,0 +1,88 @@ +; RUN: opt < %s -O2 -force-vector-unroll=2 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Loop from "rotated" +; CHECK: LV: Loop hints: force=enabled +; Loop from "nonrotated" +; CHECK: LV: Loop hints: force=enabled +; No more loops in the module +; CHECK-NOT: LV: Loop hints: force= +; In total only 1 loop should be rotated. +; CHECK: 1 loop-rotate + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; See http://reviews.llvm.org/D3348 for details. + +; +; Test #1 +; +; Ensure that "llvm.loop.vectorize.enable" metadata was not lost prior to LoopVectorize pass. +; In past LoopRotate was clearing that metadata. 
+; +; The source C code is: +; void rotated(float *a, int size) +; { +; int t = 0; +; #pragma omp simd +; for (int i = 0; i < size; ++i) { +; a[i] = a[i-5] * a[i+2]; +; ++t; +; } +;} + +define void @rotated(float* nocapture %a, i64 %size) { +entry: + %cmp1 = icmp sgt i64 %size, 0 + br i1 %cmp1, label %for.header, label %for.end + +for.header: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %cmp2 = icmp sgt i64 %indvars.iv, %size + br i1 %cmp2, label %for.end, label %for.body + +for.body: + + %0 = add nsw i64 %indvars.iv, -5 + %arrayidx = getelementptr inbounds float* %a, i64 %0 + %1 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1 + %2 = add nsw i64 %indvars.iv, 2 + %arrayidx2 = getelementptr inbounds float* %a, i64 %2 + %3 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1 + %mul = fmul float %1, %3 + %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !1 + + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.header, !llvm.loop !1 + +for.end: + ret void +} + +!1 = metadata !{metadata !1, metadata !2} +!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true} + +; +; Test #2 +; +; Ensure that "llvm.loop.vectorize.enable" metadata was not lost even +; if loop was not rotated (see http://reviews.llvm.org/D3348#comment-4). 
+; +define i32 @nonrotated(i32 %a) { +entry: + br label %loop_cond +loop_cond: + %indx = phi i32 [ 1, %entry ], [ %inc, %loop_inc ] + %cmp = icmp ne i32 %indx, %a + br i1 %cmp, label %return, label %loop_inc +loop_inc: + %inc = add i32 %indx, 1 + br label %loop_cond, !llvm.loop !3 +return: + ret i32 0 +} + +!3 = metadata !{metadata !3, metadata !4} +!4 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true} diff --git a/test/Transforms/LoopVectorize/vect.stats.ll b/test/Transforms/LoopVectorize/vect.stats.ll new file mode 100644 index 000000000000..92ec24f726ee --- /dev/null +++ b/test/Transforms/LoopVectorize/vect.stats.ll @@ -0,0 +1,65 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=4 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; +; We have 2 loops, one of them is vectorizable and the second one is not. +; + +; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 1 loop-vectorize - Number of loops vectorized + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @vectorized(float* nocapture %a, i64 %size) { +entry: + %cmp1 = icmp sgt i64 %size, 0 + br i1 %cmp1, label %for.header, label %for.end + +for.header: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %cmp2 = icmp sgt i64 %indvars.iv, %size + br i1 %cmp2, label %for.end, label %for.body + +for.body: + + %arrayidx = getelementptr inbounds float* %a, i64 %indvars.iv + %0 = load float* %arrayidx, align 4 + %mul = fmul float %0, %0 + store float %mul, float* %arrayidx, align 4 + + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.header + +for.end: + ret void +} + +define void @not_vectorized(float* nocapture %a, i64 %size) { +entry: + %cmp1 = icmp sgt i64 %size, 0 + br i1 %cmp1, label %for.header, 
label %for.end + +for.header: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %cmp2 = icmp sgt i64 %indvars.iv, %size + br i1 %cmp2, label %for.end, label %for.body + +for.body: + + %0 = add nsw i64 %indvars.iv, -5 + %arrayidx = getelementptr inbounds float* %a, i64 %0 + %1 = load float* %arrayidx, align 4 + %2 = add nsw i64 %indvars.iv, 2 + %arrayidx2 = getelementptr inbounds float* %a, i64 %2 + %3 = load float* %arrayidx2, align 4 + %mul = fmul float %1, %3 + %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx4, align 4 + + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.header + +for.end: + ret void +}
\ No newline at end of file diff --git a/test/Transforms/LoopVectorize/vectorize-once.ll b/test/Transforms/LoopVectorize/vectorize-once.ll index 780046930e1b..97654f495018 100644 --- a/test/Transforms/LoopVectorize/vectorize-once.ll +++ b/test/Transforms/LoopVectorize/vectorize-once.ll @@ -69,9 +69,9 @@ _ZSt10accumulateIPiiET0_T_S2_S1_.exit: ; preds = %for.body.i, %entry attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" } ; CHECK: !0 = metadata !{metadata !0, metadata !1, metadata !2} -; CHECK: !1 = metadata !{metadata !"llvm.vectorizer.width", i32 1} -; CHECK: !2 = metadata !{metadata !"llvm.vectorizer.unroll", i32 1} +; CHECK: !1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1} +; CHECK: !2 = metadata !{metadata !"llvm.loop.interleave.count", i32 1} ; CHECK: !3 = metadata !{metadata !3, metadata !1, metadata !2} !0 = metadata !{metadata !0, metadata !1} -!1 = metadata !{metadata !"llvm.vectorizer.width", i32 1} +!1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1} diff --git a/test/Transforms/LoopVectorize/version-mem-access.ll b/test/Transforms/LoopVectorize/version-mem-access.ll new file mode 100644 index 000000000000..51d20e227ddf --- /dev/null +++ b/test/Transforms/LoopVectorize/version-mem-access.ll @@ -0,0 +1,87 @@ +; RUN: opt -basicaa -loop-vectorize -enable-mem-access-versioning -force-vector-width=2 -force-vector-unroll=1 < %s -S | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: test +define void @test(i32* noalias %A, i64 %AStride, + i32* noalias %B, i32 %BStride, + i32* noalias %C, i64 %CStride, i32 %N) { +entry: + %cmp13 = icmp eq i32 %N, 0 + br i1 %cmp13, label %for.end, label %for.body.preheader + +; CHECK-DAG: icmp ne i64 %AStride, 1 +; CHECK-DAG: icmp ne i32 %BStride, 1 +; CHECK-DAG: icmp ne i64 %CStride, 1 +; CHECK: or +; CHECK: or +; 
CHECK: br + +; CHECK: vector.body +; CHECK: load <2 x i32> + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %iv.trunc = trunc i64 %indvars.iv to i32 + %mul = mul i32 %iv.trunc, %BStride + %mul64 = zext i32 %mul to i64 + %arrayidx = getelementptr inbounds i32* %B, i64 %mul64 + %0 = load i32* %arrayidx, align 4 + %mul2 = mul nsw i64 %indvars.iv, %CStride + %arrayidx3 = getelementptr inbounds i32* %C, i64 %mul2 + %1 = load i32* %arrayidx3, align 4 + %mul4 = mul nsw i32 %1, %0 + %mul3 = mul nsw i64 %indvars.iv, %AStride + %arrayidx7 = getelementptr inbounds i32* %A, i64 %mul3 + store i32 %mul4, i32* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; We used to crash on this function because we removed the fptosi cast when +; replacing the symbolic stride '%conv'. 
+; PR18480 + +; CHECK-LABEL: fn1 +; CHECK: load <2 x double> + +define void @fn1(double* noalias %x, double* noalias %c, double %a) { +entry: + %conv = fptosi double %a to i32 + %cmp8 = icmp sgt i32 %conv, 0 + br i1 %cmp8, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %0 = trunc i64 %indvars.iv to i32 + %mul = mul nsw i32 %0, %conv + %idxprom = sext i32 %mul to i64 + %arrayidx = getelementptr inbounds double* %x, i64 %idxprom + %1 = load double* %arrayidx, align 8 + %arrayidx3 = getelementptr inbounds double* %c, i64 %indvars.iv + store double %1, double* %arrayidx3, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %conv + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} |
