From 71d5a2540a98c81f5bcaeb48805e0e2881f530ef Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Sun, 16 Apr 2017 16:01:22 +0000
Subject: Vendor import of llvm trunk r300422:
  https://llvm.org/svn/llvm-project/llvm/trunk@300422

---
 test/Transforms/LoopVectorize/X86/avx512.ll        |   2 +-
 .../LoopVectorize/X86/consecutive-ptr-uniforms.ll  |  43 +++++----
 .../LoopVectorize/X86/gather-vs-interleave.ll      |  41 ++++++++
 .../LoopVectorize/X86/int128_no_gather.ll          |   4 +-
 test/Transforms/LoopVectorize/X86/interleaving.ll  |   1 +
 .../LoopVectorize/X86/metadata-enable.ll           |  14 ++-
 test/Transforms/LoopVectorize/X86/scatter_crash.ll | 106 ++++-----------------
 .../X86/vectorization-remarks-missed.ll            |  57 +++++++++++
 .../X86/vectorization-remarks-profitable.ll        |   2 +-
 9 files changed, 158 insertions(+), 112 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll

(limited to 'test/Transforms/LoopVectorize/X86')

diff --git a/test/Transforms/LoopVectorize/X86/avx512.ll b/test/Transforms/LoopVectorize/X86/avx512.ll
index fb01454c253b..1eb1cd3f5d7a 100644
--- a/test/Transforms/LoopVectorize/X86/avx512.ll
+++ b/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; loop.
 
 ; CHECK-LABEL: f:
-; CHECK: vmovups %zmm{{.}},
+; CHECK: vmovdqu32 %zmm{{.}},
 ; CHECK-NOT: %ymm
 
 define void @f(i32* %a, i32 %n) {
diff --git a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index 32bfcd2275ac..82f2e064a581 100644
--- a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -13,22 +13,33 @@ target triple = "x86_64-unknown-linux-gnu"
 ; scatter operation. %tmp3 (and the induction variable) should not be marked
 ; uniform-after-vectorization.
 ;
-; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
-; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
-; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
-; CHECK: vector.body:
-; CHECK:   %vec.ind = phi <16 x i64>
-; CHECK:   %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0
-; CHECK:   %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
-; CHECK:   %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
-; CHECK:   load <80 x float>, <80 x float>* %[[T2]], align 4
-; CHECK:   %[[T3:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %[[T0]]
-; CHECK:   %[[T4:.+]] = bitcast float* %[[T3]] to <80 x float>*
-; CHECK:   load <80 x float>, <80 x float>* %[[T4]], align 4
-; CHECK:   %VectorGep = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> %vec.ind
-; CHECK:   call void @llvm.masked.scatter.v16f32({{.*}}, <16 x float*> %VectorGep, {{.*}})
-; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
+; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
+; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
+; CHECK: vector.ph:
+; CHECK-NEXT:   [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0
+; CHECK-NEXT:   [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:   br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT:   [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:   [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:   [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT:   [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:   [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>*
+; CHECK-NEXT:   [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4
+; CHECK-NEXT:   [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT:   [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT:   [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT:   [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*>
+; CHECK-NEXT:   [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0
+; CHECK-NEXT:   [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
+; CHECK-NEXT:   [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT:   [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
+; CHECK-NEXT:   call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:   [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:   [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
+; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
 
 %data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
 
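For reference, the CHECK lines above vectorize a scalar loop over %data of
roughly the following shape (a minimal sketch; the function body sits below
the hunk shown here, so the value names and trip count are assumptions, not
the file's exact IR):

    define void @f(float %x, %data* %d) {
    entry:
      br label %for.body

    for.body:
      %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
      ; Consecutive-like use: pointer operand of an interleaved load.
      %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
      %tmp1 = load float, float* %tmp0, align 4
      %tmp2 = fmul float %x, %tmp1
      ; %tmp3 also feeds the store below, which becomes a masked scatter,
      ; so it must not be marked uniform-after-vectorization.
      %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
      %tmp4 = load float, float* %tmp3, align 4
      %tmp5 = fadd float %tmp4, %tmp2
      store float %tmp5, float* %tmp3, align 4
      %i.next = add nuw nsw i64 %i, 5
      %cond = icmp slt i64 %i.next, 32000
      br i1 %cond, label %for.body, label %for.end

    for.end:
      ret void
    }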
diff --git a/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
new file mode 100644
index 000000000000..76b6cae5c3b4
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the "gather" operation is chosen, since its cost is
+; better than the interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i<SIZE; i+=8)
+;    B[i] = A[i] + 5;
+;}
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+; CHECK: llvm.masked.gather
+define void @foo() {
+  br label %1
+
+; <label>:1:                                      ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+; <label>:7:                                      ; preds = %1
+  ret void
+}
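The new test only greps for the gather intrinsic by name; assuming a
vectorization factor of 8 (eight i64 lanes in one 512-bit register on
skylake-avx512), the IR that the CHECK line would match looks roughly like
this sketch (value names are illustrative, not taken from the test):

    ; Addresses of A[i], A[i+8], ..., A[i+56] for one vector iteration.
    %gep = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, <8 x i64> %vec.ind
    ; All-true mask: every lane is loaded; the final operand is the
    ; pass-through value for masked-off lanes.
    %gather = call <8 x i64> @llvm.masked.gather.v8i64(<8 x i64*> %gep, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef)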