; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; This test verifies that the loop vectorizer will NOT vectorize loops that
; will produce a tail loop with the optimize for size or the minimize size
; attributes. This is a target-dependent version of the test.
; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s
; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF

target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"

@tab = common global [32 x i8] zeroinitializer, align 1

define i32 @foo_optsize() #0 {
; CHECK-LABEL: @foo_optsize(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]],
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]],
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8>
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 0
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %cmp1 = icmp eq i8 %0, 0
  %. = select i1 %cmp1, i8 2, i8 1
  store i8 %., i8* %arrayidx, align 1
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %i.08, 202
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 0
}

attributes #0 = { optsize }

define i32 @foo_minsize() #1 {
; CHECK-LABEL: @foo_minsize(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]],
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]],
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8>
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 0
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %cmp1 = icmp eq i8 %0, 0
  %. = select i1 %cmp1, i8 2, i8 1
  store i8 %., i8* %arrayidx, align 1
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %i.08, 202
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 0
}

attributes #1 = { minsize }

; We can't vectorize this one because we version for stride==1; even having TC
; a multiple of VF.
; CHECK-LABEL: @scev4stride1
; CHECK-NOT: vector.scevcheck
; CHECK-NOT: vector.body:
; CHECK-LABEL: for.body:
; AUTOVF-LABEL: @scev4stride1
; AUTOVF-NOT: vector.scevcheck
; AUTOVF-NOT: vector.body:
; AUTOVF-LABEL: for.body:
define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
for.body.preheader:
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %mul = mul nsw i32 %i.07, %k
  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
  store i32 %0, i32* %arrayidx1, align 4
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, 256
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  ret void
}

attributes #2 = { optsize }

; PR39497
; We can't vectorize this one because we version for overflow check and tiny
; trip count leads to opt-for-size (which otherwise could fold the tail by
; masking).
; CHECK-LABEL: @main
; CHECK-NOT: vector.scevcheck
; CHECK-NOT: vector.body:
; CHECK-LABEL: for.cond:
; AUTOVF-LABEL: @main
; AUTOVF-NOT: vector.scevcheck
; AUTOVF-NOT: vector.body:
; AUTOVF-LABEL: for.cond:
define i32 @main() local_unnamed_addr {
while.cond:
  br label %for.cond

for.cond:                                         ; preds = %for.cond, %while.cond
  %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ]
  %conv = and i32 %d.0, 65535
  %cmp = icmp ult i32 %conv, 4
  %add = add nuw nsw i32 %conv, 1
  br i1 %cmp, label %for.cond, label %while.cond.loopexit

while.cond.loopexit:                              ; preds = %for.cond
  ret i32 0
}