summaryrefslogtreecommitdiff
path: root/test/Transforms/LoopVectorize/X86
diff options
context:
space:
mode:
Diffstat (limited to 'test/Transforms/LoopVectorize/X86')
-rw-r--r--test/Transforms/LoopVectorize/X86/already-vectorized.ll46
-rw-r--r--test/Transforms/LoopVectorize/X86/avx1.ll4
-rw-r--r--test/Transforms/LoopVectorize/X86/conversion-cost.ll4
-rw-r--r--test/Transforms/LoopVectorize/X86/cost-model.ll2
-rw-r--r--test/Transforms/LoopVectorize/X86/gather-cost.ll86
-rw-r--r--test/Transforms/LoopVectorize/X86/gcc-examples.ll8
-rw-r--r--test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll6
-rw-r--r--test/Transforms/LoopVectorize/X86/lit.local.cfg2
-rw-r--r--test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll2
-rw-r--r--test/Transforms/LoopVectorize/X86/parallel-loops.ll10
-rw-r--r--test/Transforms/LoopVectorize/X86/rauw-bug.ll33
-rw-r--r--test/Transforms/LoopVectorize/X86/reduction-crash.ll2
-rw-r--r--test/Transforms/LoopVectorize/X86/small-size.ll10
-rw-r--r--test/Transforms/LoopVectorize/X86/tripcount.ll39
-rw-r--r--test/Transforms/LoopVectorize/X86/unroll-pm.ll31
-rw-r--r--test/Transforms/LoopVectorize/X86/unroll-small-loops.ll4
-rw-r--r--test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll2
17 files changed, 262 insertions, 29 deletions
diff --git a/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
new file mode 100644
index 0000000000000..885418c0fdd9e
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; Make sure that we never try to vectorize an already-vectorized loop again.
+; The vectorizer used to mark only the un-vectorized (scalar) loop as already
+; vectorized, and so it would try to vectorize the vectorized loop a second time.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global [255 x i32]
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @vect() {
+; CHECK: LV: Checking a loop in "vect"
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+; First make sure that the loop really was vectorized.
+; CHECK: LV: Found a loop: for.body
+; CHECK: LV: We can vectorize this loop!
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; loop counter, starts at 0
+ %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ] ; running sum of the loaded elements
+ %arrayidx = getelementptr inbounds [255 x i32]* @a, i64 0, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %red.05
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 255 ; stop after all 255 elements of @a
+ br i1 %exitcond, label %for.end, label %for.body
+
+; If it was, the output has two loops, each carrying its own !llvm.loop node:
+; CHECK: vector.body:
+; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]]
+; CHECK: for.body:
+; CHECK: br {{.*}} label %for.body, !llvm.loop [[scalar:![0-9]+]]
+
+for.end: ; preds = %for.body
+ ret i32 %add
+}
+
+; Now, we check for the Hint metadata
+; CHECK: [[vect]] = metadata !{metadata [[vect]], metadata [[width:![0-9]+]], metadata [[unroll:![0-9]+]]}
+; CHECK: [[width]] = metadata !{metadata !"llvm.vectorizer.width", i32 1}
+; CHECK: [[unroll]] = metadata !{metadata !"llvm.vectorizer.unroll", i32 1}
+; CHECK: [[scalar]] = metadata !{metadata [[scalar]], metadata [[width]], metadata [[unroll]]}
+
diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll
index 6c0366eae9731..01c912567b61b 100644
--- a/test/Transforms/LoopVectorize/X86/avx1.ll
+++ b/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -3,7 +3,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
-;CHECK: @read_mod_write_single_ptr
+;CHECK-LABEL: @read_mod_write_single_ptr(
;CHECK: load <8 x float>
;CHECK: ret i32
define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
@@ -26,7 +26,7 @@ define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwta
}
-;CHECK: @read_mod_i64
+;CHECK-LABEL: @read_mod_i64(
;CHECK: load <2 x i64>
;CHECK: ret i32
define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 760d28deaf275..0af562db84793 100644
--- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -3,7 +3,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
-;CHECK: @conversion_cost1
+;CHECK-LABEL: @conversion_cost1(
;CHECK: store <32 x i8>
;CHECK: ret
define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
@@ -24,7 +24,7 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
ret i32 undef
}
-;CHECK: @conversion_cost2
+;CHECK-LABEL: @conversion_cost2(
;CHECK: <2 x float>
;CHECK: ret
define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
index b7f479acf9625..98718e1e97083 100644
--- a/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
@a = common global [2048 x i32] zeroinitializer, align 16
; The program below gathers and scatters data. We better not vectorize it.
-;CHECK: cost_model_1
+;CHECK-LABEL: @cost_model_1(
;CHECK-NOT: <2 x i32>
;CHECK-NOT: <4 x i32>
;CHECK-NOT: <8 x i32>
diff --git a/test/Transforms/LoopVectorize/X86/gather-cost.ll b/test/Transforms/LoopVectorize/X86/gather-cost.ll
new file mode 100644
index 0000000000000..09363d65eefcc
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/gather-cost.ll
@@ -0,0 +1,86 @@
+; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@kernel = global [512 x float] zeroinitializer, align 16
+@kernel2 = global [512 x float] zeroinitializer, align 16
+@kernel3 = global [512 x float] zeroinitializer, align 16
+@kernel4 = global [512 x float] zeroinitializer, align 16
+@src_data = global [1536 x float] zeroinitializer, align 16
+@r_ = global i8 0, align 1
+@g_ = global i8 0, align 1
+@b_ = global i8 0, align 1
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive. This function represents a point where vectorization starts to
+; become beneficial.
+; Make sure we are conservative and don't vectorize it.
+; CHECK-NOT: x float>
+
+define void @_Z4testmm(i64 %size, i64 %offset) {
+entry:
+ %cmp53 = icmp eq i64 %size, 0 ; nothing to do for an empty range
+ br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ] ; sum later stored to @r_
+ %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ] ; sum later stored to @g_
+ %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] ; loop counter, starts at 0
+ %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ] ; sum later stored to @b_
+ %add = add i64 %v.055, %offset
+ %mul = mul i64 %add, 3 ; @src_data is indexed with stride 3: 3 * (v + offset)
+ %arrayidx = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %mul
+ %0 = load float* %arrayidx, align 4 ; non-consecutive load (element 3*(v+offset) + 0)
+ %arrayidx2 = getelementptr inbounds [512 x float]* @kernel, i64 0, i64 %v.055
+ %1 = load float* %arrayidx2, align 4 ; the four kernel loads are indexed directly by %v.055
+ %mul3 = fmul fast float %0, %1
+ %arrayidx4 = getelementptr inbounds [512 x float]* @kernel2, i64 0, i64 %v.055
+ %2 = load float* %arrayidx4, align 4
+ %mul5 = fmul fast float %mul3, %2
+ %arrayidx6 = getelementptr inbounds [512 x float]* @kernel3, i64 0, i64 %v.055
+ %3 = load float* %arrayidx6, align 4
+ %mul7 = fmul fast float %mul5, %3
+ %arrayidx8 = getelementptr inbounds [512 x float]* @kernel4, i64 0, i64 %v.055
+ %4 = load float* %arrayidx8, align 4
+ %mul9 = fmul fast float %mul7, %4
+ %add10 = fadd fast float %r.057, %mul9
+ %arrayidx.sum = add i64 %mul, 1
+ %arrayidx11 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
+ %5 = load float* %arrayidx11, align 4 ; non-consecutive load (element 3*(v+offset) + 1)
+ %mul13 = fmul fast float %1, %5
+ %mul15 = fmul fast float %2, %mul13
+ %mul17 = fmul fast float %3, %mul15
+ %mul19 = fmul fast float %4, %mul17
+ %add20 = fadd fast float %g.056, %mul19
+ %arrayidx.sum52 = add i64 %mul, 2
+ %arrayidx21 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
+ %6 = load float* %arrayidx21, align 4 ; non-consecutive load (element 3*(v+offset) + 2)
+ %mul23 = fmul fast float %1, %6
+ %mul25 = fmul fast float %2, %mul23
+ %mul27 = fmul fast float %3, %mul25
+ %mul29 = fmul fast float %4, %mul27
+ %add30 = fadd fast float %b.054, %mul29
+ %inc = add i64 %v.055, 1
+ %exitcond = icmp ne i64 %inc, %size ; keep looping until the counter reaches %size
+ br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+ %add30.lcssa = phi float [ %add30, %for.body ]
+ %add20.lcssa = phi float [ %add20, %for.body ]
+ %add10.lcssa = phi float [ %add10, %for.body ]
+ %phitmp = fptoui float %add10.lcssa to i8 ; each float sum is converted to an unsigned i8
+ %phitmp60 = fptoui float %add20.lcssa to i8
+ %phitmp61 = fptoui float %add30.lcssa to i8
+ br label %for.end
+
+for.end:
+ %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] ; 0 when the loop was skipped
+ %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ store i8 %r.0.lcssa, i8* @r_, align 1
+ store i8 %g.0.lcssa, i8* @g_, align 1
+ store i8 %b.0.lcssa, i8* @b_, align 1
+ ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index d2d0eac305f56..e1113fdd911c0 100644
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -9,13 +9,13 @@ target triple = "x86_64-apple-macosx10.8.0"
@a = common global [2048 x i32] zeroinitializer, align 16
; Select VF = 8;
-;CHECK: @example1
+;CHECK-LABEL: @example1(
;CHECK: load <4 x i32>
;CHECK: add nsw <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
-;UNROLL: @example1
+;UNROLL-LABEL: @example1(
;UNROLL: load <4 x i32>
;UNROLL: load <4 x i32>
;UNROLL: add nsw <4 x i32>
@@ -45,12 +45,12 @@ define void @example1() nounwind uwtable ssp {
}
; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
-;CHECK: @example10b
+;CHECK-LABEL: @example10b(
;CHECK: load <4 x i16>
;CHECK: sext <4 x i16>
;CHECK: store <4 x i32>
;CHECK: ret void
-;UNROLL: @example10b
+;UNROLL-LABEL: @example10b(
;UNROLL: load <4 x i16>
;UNROLL: load <4 x i16>
;UNROLL: store <4 x i32>
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 47a5e7aee4c1b..d6120e76cc0b7 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -3,7 +3,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-;CHECK: @foo
+;CHECK-LABEL: @foo(
;CHECK-NOT: <4 x i32>
;CHECK: ret void
@@ -21,7 +21,7 @@ for.end.us: ; preds = %for.body3.us
%indvars.iv.next34 = add i64 %indvars.iv33, 1
%lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
%exitcond36 = icmp eq i32 %lftr.wideiv35, %m
- br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop.parallel !5
+ br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5
for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us
%indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
@@ -35,7 +35,7 @@ for.body3.us: ; preds = %for.body3.us, %for.
%indvars.iv.next30 = add i64 %indvars.iv29, 1
%lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
%exitcond32 = icmp eq i32 %lftr.wideiv31, %m
- br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop.parallel !4
+ br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4
for.body3.lr.ph.us: ; preds = %for.end.us, %entry
%indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
diff --git a/test/Transforms/LoopVectorize/X86/lit.local.cfg b/test/Transforms/LoopVectorize/X86/lit.local.cfg
index a8ad0f1a28b23..ba763cf03ffcc 100644
--- a/test/Transforms/LoopVectorize/X86/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
targets = set(config.root.targets_to_build.split())
if not 'X86' in targets:
config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
index f904a8e0b1173..2c47fcb4d3890 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
@@ -35,7 +35,7 @@ for.body: ; preds = %for.body.for.body_c
%indvars.iv.next.reload = load i64* %indvars.iv.next.reg2mem
%lftr.wideiv = trunc i64 %indvars.iv.next.reload to i32
%exitcond = icmp eq i32 %lftr.wideiv, 512
- br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop.parallel !3
+ br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop !3
for.body.for.body_crit_edge: ; preds = %for.body
%indvars.iv.next.reload2 = load i64* %indvars.iv.next.reg2mem
diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index 3f1a071e69fa8..7e156a9edad4d 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -12,7 +12,7 @@ target triple = "x86_64-unknown-linux-gnu"
; }
;}
-;CHECK: @loop
+;CHECK-LABEL: @loop(
;CHECK-NOT: <4 x i32>
define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
entry:
@@ -42,7 +42,7 @@ for.end: ; preds = %for.body
; The same loop with parallel loop metadata added to the loop branch
; and the memory instructions.
-;CHECK: @parallel_loop
+;CHECK-LABEL: @parallel_loop(
;CHECK: <4 x i32>
define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
entry:
@@ -65,7 +65,7 @@ for.body: ; preds = %for.body, %entry
store i32 %2, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 512
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !3
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
for.end: ; preds = %for.body
ret void
@@ -74,7 +74,7 @@ for.end: ; preds = %for.body
; The same loop with an illegal parallel loop metadata: the memory
; accesses refer to a different loop's identifier.
-;CHECK: @mixed_metadata
+;CHECK-LABEL: @mixed_metadata(
;CHECK-NOT: <4 x i32>
define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
@@ -98,7 +98,7 @@ for.body: ; preds = %for.body, %entry
store i32 %2, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !6
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, 512
- br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !6
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
for.end: ; preds = %for.body
ret void
diff --git a/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/test/Transforms/LoopVectorize/X86/rauw-bug.ll
new file mode 100644
index 0000000000000..4284fbacfa7e5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/rauw-bug.ll
@@ -0,0 +1,33 @@
+; RUN: opt -slp-vectorizer -S %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; This test used to fail under libgmalloc because we would try to access a
+; pointer that had already been deleted. To reproduce, run:
+;
+; llvm-lit -v --param use_gmalloc=1 --param
+; gmalloc_path=/usr/lib/libgmalloc.dylib
+; test/Transforms/LoopVectorize/X86/rauw-bug.ll
+;
+; radar://15498655
+
+; CHECK-LABEL: @reduced(
+define void @reduced() {
+entry:
+ br i1 undef, label %while.body, label %while.cond63.preheader.while.end76_crit_edge
+
+while.cond63.preheader.while.end76_crit_edge:
+ ret void
+
+while.body: ; infinite loop: its only terminator branches back to itself
+ %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ]
+ %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ]
+ %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ]
+ %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ]
+ %div50 = fmul double %d3_fx.012, 1.250000e-01 ; x chain: scale by 1/8 ...
+ %sub52 = fsub double 0.000000e+00, %div50 ; ... then negate
+ %div56 = fmul double %d3_fy.013, 1.250000e-01 ; y chain: same two operations
+ %sub58 = fsub double 0.000000e+00, %div56
+ br label %while.body
+}
diff --git a/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
index f580846a0228b..3957a55414227 100644
--- a/test/Transforms/LoopVectorize/X86/reduction-crash.ll
+++ b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -5,7 +5,7 @@ target triple = "i386-apple-darwin"
; PR15344
define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
-; CHECK: @test1
+; CHECK-LABEL: @test1(
; CHECK: preheader
; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
; CHECK: vector.memcheck
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index f390b33c03884..14ac417bb573d 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -20,7 +20,7 @@ target triple = "x86_64-apple-macosx10.8.0"
@dj = common global [1024 x i32] zeroinitializer, align 16
; We can optimize this test without a tail.
-;CHECK: @example1
+;CHECK-LABEL: @example1(
;CHECK: load <4 x i32>
;CHECK: add nsw <4 x i32>
;CHECK: store <4 x i32>
@@ -47,7 +47,7 @@ define void @example1() optsize {
}
; Can't vectorize in 'optsize' mode because we need a tail.
-;CHECK: @example2
+;CHECK-LABEL: @example2(
;CHECK-NOT: store <4 x i32>
;CHECK: ret void
define void @example2(i32 %n, i32 %x) optsize {
@@ -92,7 +92,7 @@ define void @example2(i32 %n, i32 %x) optsize {
}
; N is unknown, we need a tail. Can't vectorize.
-;CHECK: @example3
+;CHECK-LABEL: @example3(
;CHECK-NOT: <4 x i32>
;CHECK: ret void
define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
@@ -117,7 +117,7 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
; We can't vectorize this one because we need a runtime ptr check.
-;CHECK: @example23
+;CHECK-LABEL: @example23(
;CHECK-NOT: <4 x i32>
;CHECK: ret void
define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
@@ -143,7 +143,7 @@ define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
; We CAN vectorize this example because the pointers are marked as noalias.
-;CHECK: @example23b
+;CHECK-LABEL: @example23b(
;CHECK: <4 x i32>
;CHECK: ret void
define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
diff --git a/test/Transforms/LoopVectorize/X86/tripcount.ll b/test/Transforms/LoopVectorize/X86/tripcount.ll
new file mode 100644
index 0000000000000..6b38bacf88884
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/tripcount.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -mcpu=prescott < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd11.0"
+
+@big = external global [0 x i32]
+
+; PR18049
+; We need to truncate the exit count to i32. This is legal because the
+; arithmetic is signed (%inc is nsw).
+
+; CHECK-LABEL: @tripcount(
+; CHECK: trunc i64 %count to i32
+
+define void @tripcount(i64 %count) {
+entry:
+ %cmp6 = icmp sgt i64 %count, 0 ; skip the loop when %count <= 0
+ br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; 32-bit induction variable
+ %arrayidx = getelementptr inbounds [0 x i32]* @big, i32 0, i32 %i.07
+ %0 = load i32* %arrayidx, align 4
+ %neg = xor i32 %0, -1 ; bitwise NOT of the loaded element
+ store i32 %neg, i32* %arrayidx, align 4 ; written back to the same element
+ %inc = add nsw i32 %i.07, 1
+ %conv = sext i32 %inc to i64 ; widened only to compare against the 64-bit %count
+ %cmp = icmp slt i64 %conv, %count
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/test/Transforms/LoopVectorize/X86/unroll-pm.ll
new file mode 100644
index 0000000000000..5064fec286ce0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -disable-loop-unrolling -S | FileCheck %s -check-prefix=CHECK-NOUNRL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK-LABEL: @bar(
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+;CHECK-NOUNRL-LABEL: @bar(
+;CHECK-NOUNRL: store <4 x i32>
+;CHECK-NOUNRL-NOT: store <4 x i32>
+;CHECK-NOUNRL: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0 ; skip the loop entirely when %n <= 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] ; loop counter, starts at 0
+ %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = add nsw i32 %3, 6 ; loaded element plus 6 ...
+ store i32 %4, i32* %2, align 4 ; ... stored back through the same pointer
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; narrowed to compare against the i32 %n
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
index ef63a145d0c1e..ea107dc4dc51e 100644
--- a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
@@ -2,7 +2,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
-;CHECK: @foo
+;CHECK-LABEL: @foo(
;CHECK: load <4 x i32>
;CHECK-NOT: load <4 x i32>
;CHECK: store <4 x i32>
@@ -26,7 +26,7 @@ define i32 @foo(i32* nocapture %A) nounwind uwtable ssp {
ret i32 undef
}
-;CHECK: @bar
+;CHECK-LABEL: @bar(
;CHECK: store <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret
diff --git a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
index b66119f4ef59e..efc93d94a7c51 100644
--- a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
+++ b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.7.0"
@x = common global [1024 x x86_fp80] zeroinitializer, align 16
-;CHECK: @example
+;CHECK-LABEL: @example(
;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>*
;CHECK: store
;CHECK: ret void