Diffstat (limited to 'test/Transforms/LoopVectorize/X86')
17 files changed, 262 insertions, 29 deletions
diff --git a/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
new file mode 100644
index 0000000000000..885418c0fdd9e
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; We want to make sure that we don't even try to vectorize loops again
+; The vectorizer used to mark the un-vectorized loop only as already vectorized
+; thus, trying to vectorize the vectorized loop again
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global [255 x i32]
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @vect() {
+; CHECK: LV: Checking a loop in "vect"
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+; We need to make sure we did vectorize the loop
+; CHECK: LV: Found a loop: for.body
+; CHECK: LV: We can vectorize this loop!
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [255 x i32]* @a, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %red.05
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+; If it did, we have two loops:
+; CHECK: vector.body:
+; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]]
+; CHECK: for.body:
+; CHECK: br {{.*}} label %for.body, !llvm.loop [[scalar:![0-9]+]]
+
+for.end: ; preds = %for.body
+  ret i32 %add
+}
+
+; Now, we check for the Hint metadata
+; CHECK: [[vect]] = metadata !{metadata [[vect]], metadata [[width:![0-9]+]], metadata [[unroll:![0-9]+]]}
+; CHECK: [[width]] = metadata !{metadata !"llvm.vectorizer.width", i32 1}
+; CHECK: [[unroll]] = metadata !{metadata !"llvm.vectorizer.unroll", i32 1}
+; CHECK: [[scalar]] = metadata !{metadata [[scalar]], metadata [[width]], metadata [[unroll]]}
+
diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll
index 6c0366eae9731..01c912567b61b 100644
--- a/test/Transforms/LoopVectorize/X86/avx1.ll
+++ b/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-;CHECK: @read_mod_write_single_ptr
+;CHECK-LABEL: @read_mod_write_single_ptr(
 ;CHECK: load <8 x float>
 ;CHECK: ret i32
 define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
@@ -26,7 +26,7 @@ define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwta
 }
 
 
-;CHECK: @read_mod_i64
+;CHECK-LABEL: @read_mod_i64(
 ;CHECK: load <2 x i64>
 ;CHECK: ret i32
 define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 760d28deaf275..0af562db84793 100644
--- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-;CHECK: @conversion_cost1
+;CHECK-LABEL: @conversion_cost1(
 ;CHECK: store <32 x i8>
 ;CHECK: ret
 define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
@@ -24,7 +24,7 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
   ret i32 undef
 }
 
-;CHECK: @conversion_cost2
+;CHECK-LABEL: @conversion_cost2(
 ;CHECK: <2 x float>
 ;CHECK: ret
 define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
index b7f479acf9625..98718e1e97083 100644
--- a/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 @a = common global [2048 x i32] zeroinitializer, align 16
 
 ; The program below gathers and scatters data. We better not vectorize it.
-;CHECK: cost_model_1
+;CHECK-LABEL: @cost_model_1(
 ;CHECK-NOT: <2 x i32>
 ;CHECK-NOT: <4 x i32>
 ;CHECK-NOT: <8 x i32>
diff --git a/test/Transforms/LoopVectorize/X86/gather-cost.ll b/test/Transforms/LoopVectorize/X86/gather-cost.ll
new file mode 100644
index 0000000000000..09363d65eefcc
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/gather-cost.ll
@@ -0,0 +1,86 @@
+; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@kernel = global [512 x float] zeroinitializer, align 16
+@kernel2 = global [512 x float] zeroinitializer, align 16
+@kernel3 = global [512 x float] zeroinitializer, align 16
+@kernel4 = global [512 x float] zeroinitializer, align 16
+@src_data = global [1536 x float] zeroinitializer, align 16
+@r_ = global i8 0, align 1
+@g_ = global i8 0, align 1
+@b_ = global i8 0, align 1
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive. This function represents a point where vectorization starts to
+; become beneficial.
+; Make sure we are conservative and don't vectorize it.
+; CHECK-NOT: x float>
+
+define void @_Z4testmm(i64 %size, i64 %offset) {
+entry:
+  %cmp53 = icmp eq i64 %size, 0
+  br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+  %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+  %add = add i64 %v.055, %offset
+  %mul = mul i64 %add, 3
+  %arrayidx = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %mul
+  %0 = load float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float]* @kernel, i64 0, i64 %v.055
+  %1 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %1
+  %arrayidx4 = getelementptr inbounds [512 x float]* @kernel2, i64 0, i64 %v.055
+  %2 = load float* %arrayidx4, align 4
+  %mul5 = fmul fast float %mul3, %2
+  %arrayidx6 = getelementptr inbounds [512 x float]* @kernel3, i64 0, i64 %v.055
+  %3 = load float* %arrayidx6, align 4
+  %mul7 = fmul fast float %mul5, %3
+  %arrayidx8 = getelementptr inbounds [512 x float]* @kernel4, i64 0, i64 %v.055
+  %4 = load float* %arrayidx8, align 4
+  %mul9 = fmul fast float %mul7, %4
+  %add10 = fadd fast float %r.057, %mul9
+  %arrayidx.sum = add i64 %mul, 1
+  %arrayidx11 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
+  %5 = load float* %arrayidx11, align 4
+  %mul13 = fmul fast float %1, %5
+  %mul15 = fmul fast float %2, %mul13
+  %mul17 = fmul fast float %3, %mul15
+  %mul19 = fmul fast float %4, %mul17
+  %add20 = fadd fast float %g.056, %mul19
+  %arrayidx.sum52 = add i64 %mul, 2
+  %arrayidx21 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
+  %6 = load float* %arrayidx21, align 4
+  %mul23 = fmul fast float %1, %6
+  %mul25 = fmul fast float %2, %mul23
+  %mul27 = fmul fast float %3, %mul25
+  %mul29 = fmul fast float %4, %mul27
+  %add30 = fadd fast float %b.054, %mul29
+  %inc = add i64 %v.055, 1
+  %exitcond = icmp ne i64 %inc, %size
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+  %add30.lcssa = phi float [ %add30, %for.body ]
+  %add20.lcssa = phi float [ %add20, %for.body ]
+  %add10.lcssa = phi float [ %add10, %for.body ]
+  %phitmp = fptoui float %add10.lcssa to i8
+  %phitmp60 = fptoui float %add20.lcssa to i8
+  %phitmp61 = fptoui float %add30.lcssa to i8
+  br label %for.end
+
+for.end:
+  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  store i8 %r.0.lcssa, i8* @r_, align 1
+  store i8 %g.0.lcssa, i8* @g_, align 1
+  store i8 %b.0.lcssa, i8* @b_, align 1
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index d2d0eac305f56..e1113fdd911c0 100644
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -9,13 +9,13 @@ target triple = "x86_64-apple-macosx10.8.0"
 @a = common global [2048 x i32] zeroinitializer, align 16
 
 ; Select VF = 8;
-;CHECK: @example1
+;CHECK-LABEL: @example1(
 ;CHECK: load <4 x i32>
 ;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
 
-;UNROLL: @example1
+;UNROLL-LABEL: @example1(
 ;UNROLL: load <4 x i32>
 ;UNROLL: load <4 x i32>
 ;UNROLL: add nsw <4 x i32>
@@ -45,12 +45,12 @@ define void @example1() nounwind uwtable ssp {
 }
 
 ; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
-;CHECK: @example10b
+;CHECK-LABEL: @example10b(
 ;CHECK: load <4 x i16>
 ;CHECK: sext <4 x i16>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
-;UNROLL: @example10b
+;UNROLL-LABEL: @example10b(
 ;UNROLL: load <4 x i16>
 ;UNROLL: load <4 x i16>
 ;UNROLL: store <4 x i32>
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 47a5e7aee4c1b..d6120e76cc0b7 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-;CHECK: @foo
+;CHECK-LABEL: @foo(
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 
@@ -21,7 +21,7 @@ for.end.us: ; preds = %for.body3.us
   %indvars.iv.next34 = add i64 %indvars.iv33, 1
   %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
   %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
-  br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop.parallel !5
+  br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5
 
 for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us
   %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
@@ -35,7 +35,7 @@ for.body3.us: ; preds = %for.body3.us, %for.
   %indvars.iv.next30 = add i64 %indvars.iv29, 1
   %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
   %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
-  br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop.parallel !4
+  br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4
 
 for.body3.lr.ph.us: ; preds = %for.end.us, %entry
   %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
diff --git a/test/Transforms/LoopVectorize/X86/lit.local.cfg b/test/Transforms/LoopVectorize/X86/lit.local.cfg
index a8ad0f1a28b23..ba763cf03ffcc 100644
--- a/test/Transforms/LoopVectorize/X86/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
index f904a8e0b1173..2c47fcb4d3890 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
@@ -35,7 +35,7 @@ for.body: ; preds = %for.body.for.body_c
   %indvars.iv.next.reload = load i64* %indvars.iv.next.reg2mem
   %lftr.wideiv = trunc i64 %indvars.iv.next.reload to i32
   %exitcond = icmp eq i32 %lftr.wideiv, 512
-  br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop.parallel !3
+  br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop !3
 
 for.body.for.body_crit_edge: ; preds = %for.body
   %indvars.iv.next.reload2 = load i64* %indvars.iv.next.reg2mem
diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index 3f1a071e69fa8..7e156a9edad4d 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -12,7 +12,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ;    }
 ;}
 
-;CHECK: @loop
+;CHECK-LABEL: @loop(
 ;CHECK-NOT: <4 x i32>
 define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
 entry:
@@ -42,7 +42,7 @@ for.end: ; preds = %for.body
 ; The same loop with parallel loop metadata added to the loop branch
 ; and the memory instructions.
 
-;CHECK: @parallel_loop
+;CHECK-LABEL: @parallel_loop(
 ;CHECK: <4 x i32>
 define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
 entry:
@@ -65,7 +65,7 @@ for.body: ; preds = %for.body, %entry
   store i32 %2, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, 512
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !3
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
 
 for.end: ; preds = %for.body
   ret void
@@ -74,7 +74,7 @@ for.end: ; preds = %for.body
 ; The same loop with an illegal parallel loop metadata: the memory
 ; accesses refer to a different loop's identifier.
 
-;CHECK: @mixed_metadata
+;CHECK-LABEL: @mixed_metadata(
 ;CHECK-NOT: <4 x i32>
 
 define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
@@ -98,7 +98,7 @@ for.body: ; preds = %for.body, %entry
   store i32 %2, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !6
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, 512
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !6
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
 
 for.end: ; preds = %for.body
   ret void
diff --git a/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/test/Transforms/LoopVectorize/X86/rauw-bug.ll
new file mode 100644
index 0000000000000..4284fbacfa7e5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/rauw-bug.ll
@@ -0,0 +1,33 @@
+; RUN: opt -slp-vectorizer -S %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; This test used to fail under libgmalloc. Because we would try to access a
+; pointer that was already deleted.
+;
+; llvm-lit -v --param use_gmalloc=1 --param
+; gmalloc_path=/usr/lib/libgmalloc.dylib
+; test/Transforms/LoopVectorize/X86/rauw-bug.ll
+;
+; radar://15498655
+
+; CHECK: reduced
+define void @reduced() {
+entry:
+  br i1 undef, label %while.body, label %while.cond63.preheader.while.end76_crit_edge
+
+while.cond63.preheader.while.end76_crit_edge:
+  ret void
+
+while.body:
+  %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ]
+  %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ]
+  %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ]
+  %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ]
+  %div50 = fmul double %d3_fx.012, 1.250000e-01
+  %sub52 = fsub double 0.000000e+00, %div50
+  %div56 = fmul double %d3_fy.013, 1.250000e-01
+  %sub58 = fsub double 0.000000e+00, %div56
+  br label %while.body
+}
diff --git a/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
index f580846a0228b..3957a55414227 100644
--- a/test/Transforms/LoopVectorize/X86/reduction-crash.ll
+++ b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -5,7 +5,7 @@ target triple = "i386-apple-darwin"
 
 ; PR15344
 define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
-; CHECK: @test1
+; CHECK-LABEL: @test1(
 ; CHECK: preheader
 ; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
 ; CHECK: vector.memcheck
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index f390b33c03884..14ac417bb573d 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -20,7 +20,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 @dj = common global [1024 x i32] zeroinitializer, align 16
 
 ; We can optimize this test without a tail.
-;CHECK: @example1
+;CHECK-LABEL: @example1(
 ;CHECK: load <4 x i32>
 ;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
@@ -47,7 +47,7 @@ define void @example1() optsize {
 }
 
 ; Can't vectorize in 'optsize' mode because we need a tail.
-;CHECK: @example2
+;CHECK-LABEL: @example2(
 ;CHECK-NOT: store <4 x i32>
 ;CHECK: ret void
 define void @example2(i32 %n, i32 %x) optsize {
@@ -92,7 +92,7 @@ define void @example2(i32 %n, i32 %x) optsize {
 }
 
 ; N is unknown, we need a tail. Can't vectorize.
-;CHECK: @example3
+;CHECK-LABEL: @example3(
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
@@ -117,7 +117,7 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
 
 
 ; We can't vectorize this one because we need a runtime ptr check.
-;CHECK: @example23
+;CHECK-LABEL: @example23(
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
@@ -143,7 +143,7 @@ define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
 
 
 ; We CAN vectorize this example because the pointers are marked as noalias.
-;CHECK: @example23b
+;CHECK-LABEL: @example23b(
 ;CHECK: <4 x i32>
 ;CHECK: ret void
 define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
diff --git a/test/Transforms/LoopVectorize/X86/tripcount.ll b/test/Transforms/LoopVectorize/X86/tripcount.ll
new file mode 100644
index 0000000000000..6b38bacf88884
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/tripcount.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -mcpu=prescott < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd11.0"
+
+@big = external global [0 x i32]
+
+; PR18049
+; We need to truncate the exit count to i32. This is legal because the
+; arithmetic is signed (%inc is nsw).
+
+; CHECK-LABEL: tripcount
+; CHECK: trunc i64 %count to i32
+
+define void @tripcount(i64 %count) {
+entry:
+  %cmp6 = icmp sgt i64 %count, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [0 x i32]* @big, i32 0, i32 %i.07
+  %0 = load i32* %arrayidx, align 4
+  %neg = xor i32 %0, -1
+  store i32 %neg, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.07, 1
+  %conv = sext i32 %inc to i64
+  %cmp = icmp slt i64 %conv, %count
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/test/Transforms/LoopVectorize/X86/unroll-pm.ll
new file mode 100644
index 0000000000000..5064fec286ce0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -disable-loop-unrolling -S | FileCheck %s -check-prefix=CHECK-NOUNRL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK-LABEL: @bar(
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+;CHECK-NOUNRL-LABEL: @bar(
+;CHECK-NOUNRL: store <4 x i32>
+;CHECK-NOUNRL-NOT: store <4 x i32>
+;CHECK-NOUNRL: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
index ef63a145d0c1e..ea107dc4dc51e 100644
--- a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
@@ -2,7 +2,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-;CHECK: @foo
+;CHECK-LABEL: @foo(
 ;CHECK: load <4 x i32>
 ;CHECK-NOT: load <4 x i32>
 ;CHECK: store <4 x i32>
@@ -26,7 +26,7 @@ define i32 @foo(i32* nocapture %A) nounwind uwtable ssp {
   ret i32 undef
 }
 
-;CHECK: @bar
+;CHECK-LABEL: @bar(
 ;CHECK: store <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
diff --git a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
index b66119f4ef59e..efc93d94a7c51 100644
--- a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
+++ b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.7.0"
 
 @x = common global [1024 x x86_fp80] zeroinitializer, align 16
 
-;CHECK: @example
+;CHECK-LABEL: @example(
 ;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>*
 ;CHECK: store
 ;CHECK: ret void