diff options
Diffstat (limited to 'test/Transforms')
48 files changed, 4907 insertions, 1099 deletions
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll index 4b9e7c3956f58..1dfc087619653 100644 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -23,9 +23,63 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp3( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] +; X32-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X32-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X32-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp3( +; 
X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) ret i32 %call @@ -50,27 +104,225 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp5( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: 
[[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X32-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X32-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp5( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64-NEXT: 
[[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) ret i32 %call } define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp6( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load 
i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp6( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: 
[[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) ret i32 %call } define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp7( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load 
i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X32-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp7( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = 
select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP22:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X64-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = zext i8 [[TMP24]] to i32 +; X64-NEXT: [[TMP27:%.*]] = zext i8 [[TMP25]] to i32 +; X64-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP27]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP28]], [[LOADBB2]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) ret i32 %call @@ -78,8 +330,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: ret i32 [[CALL]] +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: 
[[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] ; ; X64-LABEL: @cmp8( ; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* @@ -99,72 +378,691 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* 
[[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP18]] +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = zext i8 [[TMP20]] to i32 +; X32-NEXT: [[TMP23:%.*]] = zext i8 [[TMP21]] to i32 +; X32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP22]], [[TMP23]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP24]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 
[[PHI_RES]] +; +; X64-LABEL: @cmp9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) ret i32 %call } define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) 
+; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; 
X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) ret i32 %call } define 
i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr 
i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP29:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP31:%.*]] = load i8, i8* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i8, i8* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = zext i8 [[TMP31]] to i32 +; X32-NEXT: [[TMP34:%.*]] = zext i8 [[TMP32]] to i32 +; X32-NEXT: [[TMP35:%.*]] = sub i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP35]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; 
X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) ret i32 %call } define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; 
X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: 
endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) ret i32 
%call } define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = 
getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP28:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP27]] +; X32-NEXT: [[TMP30:%.*]] = load i8, i8* [[TMP28]] +; X32-NEXT: [[TMP31:%.*]] = zext i8 [[TMP29]] to i32 +; X32-NEXT: [[TMP32:%.*]] = zext i8 [[TMP30]] to i32 +; X32-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP32]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP33]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label 
[[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) ret i32 %call } define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: 
[[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP35:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP36:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: 
+; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i16, i16* [[TMP27]], i16 6 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i16, i16* [[TMP28]], i16 6 +; X32-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP31]]) +; X32-NEXT: [[TMP34:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP32]]) +; X32-NEXT: [[TMP35]] = zext i16 [[TMP33]] to i32 +; X32-NEXT: [[TMP36]] = zext i16 [[TMP34]] to i32 +; X32-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP35]], [[TMP36]] +; X32-NEXT: br i1 [[TMP37]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: 
[[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 +; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) ret i32 %call } define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast 
i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 +; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] 
= load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X64: loadbb3: +; X64-NEXT: [[TMP31:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP32:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP33:%.*]] = load i8, i8* [[TMP31]] +; X64-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP32]] +; X64-NEXT: [[TMP35:%.*]] = zext i8 [[TMP33]] to i32 +; X64-NEXT: [[TMP36:%.*]] = zext i8 [[TMP34]] to i32 +; X64-NEXT: [[TMP37:%.*]] = sub i32 [[TMP35]], [[TMP36]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP37]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) ret i32 %call } define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP33:%.*]], 
[[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP34:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP27]], i32 3 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP28]], i32 3 +; X32-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP30]] +; 
X32-NEXT: [[TMP33]] = call i32 @llvm.bswap.i32(i32 [[TMP31]]) +; X32-NEXT: [[TMP34]] = call i32 @llvm.bswap.i32(i32 [[TMP32]]) +; X32-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br i1 [[TMP35]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] +; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) +; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] +; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] 
= phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) ret i32 %call @@ -190,8 +1088,25 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -221,8 +1136,25 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to 
i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -234,8 +1166,27 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: 
[[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -247,8 +1198,34 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; ALL: loadbb2: +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; ALL-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; ALL-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; ALL-NEXT: br i1 
[[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -260,8 +1237,27 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; @@ -283,11 +1279,60 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq9( -; ALL-NEXT: 
[[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X32-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X32-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; 
X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) %cmp = icmp eq i32 %call, 0 @@ -296,11 +1341,64 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 
1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label 
[[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) %cmp = icmp eq i32 %call, 0 @@ -309,11 +1407,78 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: 
[[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* 
[[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) %cmp = icmp eq i32 %call, 0 @@ -322,11 +1487,64 @@ define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = 
bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) %cmp = icmp eq i32 %call, 0 @@ -335,11 +1553,78 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture 
readonly %y) { } define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* 
[[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 
[[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) %cmp = icmp eq i32 %call, 0 @@ -348,11 +1633,82 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: 
loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 6 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i16 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, 
i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) %cmp = icmp eq i32 %call, 0 @@ -361,11 +1717,52 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load 
i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X64: loadbb3: +; X64-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X64-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) %cmp = icmp eq i32 %call, 0 @@ -374,11 +1771,73 @@ define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] 
+; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 3 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[TMP20]], i32 3 +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; 
+; X64-LABEL: @cmp_eq16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1 +; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) %cmp = icmp eq i32 %call, 0 diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll index b6b7757978263..088b177c2e11a 100644 --- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll +++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll @@ -194,7 +194,6 @@ rare.2: br label %fallthrough } - declare void @slowpath(i32, i32*) ; Make sure we don't end up in an infinite loop after we fail to sink. @@ -218,3 +217,37 @@ load.i145: pl_loop.i.i122: br label %pl_loop.i.i122 } + +; Make sure we can sink address computation even +; if there is a cycle in phi nodes. 
+define void @test9(i1 %cond, i64* %base) { +; CHECK-LABEL: @test9 +entry: + %addr = getelementptr inbounds i64, i64* %base, i64 5 + %casted = bitcast i64* %addr to i32* + br label %header + +header: + %iv = phi i32 [0, %entry], [%iv.inc, %backedge] + %casted.loop = phi i32* [%casted, %entry], [%casted.merged, %backedge] + br i1 %cond, label %if.then, label %backedge + +if.then: + call void @foo(i32 %iv) + %addr.1 = getelementptr inbounds i64, i64* %base, i64 5 + %casted.1 = bitcast i64* %addr.1 to i32* + br label %backedge + +backedge: +; CHECK-LABEL: backedge: +; CHECK: getelementptr i8, {{.+}} 40 + %casted.merged = phi i32* [%casted.loop, %header], [%casted.1, %if.then] + %v = load i32, i32* %casted.merged, align 4 + call void @foo(i32 %v) + %iv.inc = add i32 %iv, 1 + %cmp = icmp slt i32 %iv.inc, 1000 + br i1 %cmp, label %header, label %exit + +exit: + ret void +} diff --git a/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll b/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll new file mode 100644 index 0000000000000..57dbdd8831902 --- /dev/null +++ b/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -globals-aa -early-cse-memssa | FileCheck %s + +define i16 @f1() readonly { + ret i16 0 +} + +declare void @f2() + +; Check that EarlyCSE correctly handles function calls that don't have +; a MemoryAccess. In this case the calls to @f1 have no +; MemoryAccesses since globals-aa determines that @f1 doesn't +; read/write memory at all. 
+ +define void @f3() { +; CHECK-LABEL: @f3( +; CHECK-NEXT: [[CALL1:%.*]] = call i16 @f1() +; CHECK-NEXT: call void @f2() +; CHECK-NEXT: ret void +; + %call1 = call i16 @f1() + call void @f2() + %call2 = call i16 @f1() + ret void +} diff --git a/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll b/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll new file mode 100644 index 0000000000000..513379d0bd017 --- /dev/null +++ b/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll @@ -0,0 +1,79 @@ +; This test checks if debug loc is propagated to load/store created by GVN/Instcombine. +; RUN: opt < %s -gvn -S | FileCheck %s --check-prefixes=ALL,GVN +; RUN: opt < %s -gvn -instcombine -S | FileCheck %s --check-prefixes=ALL,INSTCOMBINE + +; struct node { +; int *v; +; struct desc *descs; +; }; + +; struct desc { +; struct node *node; +; }; + +; extern int bar(void *v, void* n); + +; int test(struct desc *desc) +; { +; void *v, *n; +; v = !desc ? ((void *)0) : desc->node->v; // Line 15 +; n = &desc->node->descs[0]; // Line 16 +; return bar(v, n); +; } + +; Line 16, Column 13: +; n = &desc->node->descs[0]; +; ^ + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%struct.desc = type { %struct.node* } +%struct.node = type { i32*, %struct.desc* } + +define i32 @test(%struct.desc* readonly %desc) local_unnamed_addr #0 !dbg !4 { +entry: + %tobool = icmp eq %struct.desc* %desc, null + br i1 %tobool, label %cond.end, label %cond.false, !dbg !9 +; ALL: br i1 %tobool, label %entry.cond.end_crit_edge, label %cond.false, !dbg [[LOC_15_6:![0-9]+]] +; ALL: entry.cond.end_crit_edge: +; GVN: %.pre = load %struct.node*, %struct.node** null, align 8, !dbg [[LOC_16_13:![0-9]+]] +; INSTCOMBINE:store %struct.node* undef, %struct.node** null, align 536870912, !dbg [[LOC_16_13:![0-9]+]] + +cond.false: + %0 = bitcast %struct.desc* %desc to i8***, !dbg !11 + %1 = load i8**, i8*** %0, align 8, !dbg !11 + %2 = load i8*, i8** %1, 
align 8 + br label %cond.end, !dbg !9 + +cond.end: + %3 = phi i8* [ %2, %cond.false ], [ null, %entry ], !dbg !9 + %node2 = getelementptr inbounds %struct.desc, %struct.desc* %desc, i64 0, i32 0 + %4 = load %struct.node*, %struct.node** %node2, align 8, !dbg !10 + %descs = getelementptr inbounds %struct.node, %struct.node* %4, i64 0, i32 1 + %5 = bitcast %struct.desc** %descs to i8** + %6 = load i8*, i8** %5, align 8 + %call = tail call i32 @bar(i8* %3, i8* %6) + ret i32 %call +} + +declare i32 @bar(i8*, i8*) local_unnamed_addr #1 +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.c", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 12, type: !5, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{!7} +!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!8 = !{} +!9 = !DILocation(line: 15, column: 6, scope: !4) +!10 = !DILocation(line: 16, column: 13, scope: !4) +!11 = !DILocation(line: 15, column: 34, scope: !4) + +;ALL: [[SCOPE:![0-9]+]] = distinct !DISubprogram(name: "test",{{.*}} +;ALL: [[LOC_15_6]] = !DILocation(line: 15, column: 6, scope: [[SCOPE]]) +;ALL: [[LOC_16_13]] = !DILocation(line: 16, column: 13, scope: [[SCOPE]]) diff --git a/test/Transforms/GVN/PRE/phi-translate.ll b/test/Transforms/GVN/PRE/phi-translate.ll index 1f6c7c8d33ea7..55f5fd6465b68 100644 --- a/test/Transforms/GVN/PRE/phi-translate.ll +++ b/test/Transforms/GVN/PRE/phi-translate.ll @@ -6,12 +6,12 @@ target datalayout = "e-p:64:64:64" ; CHECK: entry.end_crit_edge: ; CHECK: %[[INDEX:[a-z0-9.]+]] = sext i32 %x to i64{{$}} ; CHECK: %[[ADDRESS:[a-z0-9.]+]] = getelementptr [100 x i32], [100 x i32]* @G, i64 0, i64 %[[INDEX]]{{$}} -; 
CHECK: %n.pre = load i32, i32* %[[ADDRESS]]{{$}} +; CHECK: %n.pre = load i32, i32* %[[ADDRESS]], !dbg [[N_LOC:![0-9]+]] ; CHECK: br label %end ; CHECK: then: ; CHECK: store i32 %z ; CHECK: end: -; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC:![0-9]+]] +; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC]] ; CHECK: ret i32 %n ; CHECK: [[N_LOC]] = !DILocation(line: 47, column: 1, scope: !{{.*}}) diff --git a/test/Transforms/GlobalOpt/pr33686.ll b/test/Transforms/GlobalOpt/pr33686.ll new file mode 100644 index 0000000000000..d6bb98735f4e8 --- /dev/null +++ b/test/Transforms/GlobalOpt/pr33686.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -globalopt %s | FileCheck %s + +@glob = external global i16, align 1 + +define void @beth() { +; CHECK-LABEL: @beth( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void + +notreachable: + %patatino = select i1 undef, i16* @glob, i16* %patatino + br label %notreachable +} diff --git a/test/Transforms/IRCE/eq_ne.ll b/test/Transforms/IRCE/eq_ne.ll new file mode 100644 index 0000000000000..1b1ffe6b94ba7 --- /dev/null +++ b/test/Transforms/IRCE/eq_ne.ll @@ -0,0 +1,257 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s + +; CHECK: irce: in function test_01: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> +; CHECK-NOT: irce: in function test_02: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> +; CHECK: irce: in function test_03: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> +; CHECK-NOT: irce: in function test_04: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> +; CHECK: irce: in function test_05: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> +; 
CHECK-NOT: irce: in function test_06: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> +; CHECK: irce: in function test_07: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> +; CHECK-NOT: irce: in function test_08: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> + +; Show that IRCE can turn 'ne' condition to 'slt' in increasing IV. +define void @test_01(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_01 +; CHECK: main.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next, %in.bounds ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], 100 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 100 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that if n is not known to be greater than the starting value, IRCE +; doesn't apply. +define void @test_02(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_02( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, -100 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'eq' condition to 'sge' in increasing IV. 
+define void @test_03(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_03( +; CHECK: main.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next, %in.bounds ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], 100 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 100 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that if n is not known to be greater than the starting value, IRCE +; doesn't apply. +define void @test_04(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_04( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, -100 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'ne' condition to 'sgt' in decreasing IV. 
+define void @test_05(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_05( +; CHECK: preloop.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next.preloop, %in.bounds.preloop ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], 0 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 0 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE cannot turn 'ne' condition to 'sgt' in decreasing IV if the end +; value is not proved to be less than the start value. +define void @test_06(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_06( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 120 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'eq' condition to 'slt' in decreasing IV. 
+define void @test_07(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_07( +; CHECK: preloop.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next.preloop, %in.bounds.preloop ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], 0 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 0 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE cannot turn 'eq' condition to 'slt' in decreasing IV if the end +; value is not proved to be less than the start value. +define void @test_08(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_08( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 120 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +!0 = !{i32 0, i32 50} +!1 = !{!"branch_weights", i32 64, i32 4} diff --git a/test/Transforms/IRCE/pre_post_loops.ll b/test/Transforms/IRCE/pre_post_loops.ll new file mode 100644 index 0000000000000..2cd2e29104fe9 --- /dev/null +++ b/test/Transforms/IRCE/pre_post_loops.ll @@ -0,0 +1,117 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s + +; CHECK: irce: in function test_01: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> +; 
CHECK: irce: in function test_02: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting> + +; Iterate from 0 to SINT_MAX, check that the post-loop is generated. +define void @test_01(i32* %arr, i32* %a_len_ptr) { + +; CHECK: test_01( +; CHECK: entry: +; CHECK-NEXT: %exit.mainloop.at = load i32, i32* %a_len_ptr +; CHECK: loop: +; CHECK-NEXT: %idx = phi i32 [ %idx.next, %in.bounds ], [ 0, %loop.preheader ] +; CHECK-NEXT: %idx.next = add i32 %idx, 1 +; CHECK-NEXT: %abc = icmp slt i32 %idx, %exit.mainloop.at +; CHECK-NEXT: br i1 true, label %in.bounds, +; CHECK: in.bounds: +; CHECK-NEXT: %addr = getelementptr i32, i32* %arr, i32 %idx +; CHECK-NEXT: store i32 0, i32* %addr +; CHECK-NEXT: %next = icmp slt i32 %idx.next, 2147483647 +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 %idx.next, %exit.mainloop.at +; CHECK-NEXT: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: main.pseudo.exit: +; CHECK-NEXT: %idx.copy = phi i32 [ 0, %entry ], [ %idx.next.lcssa, %main.exit.selector ] +; CHECK-NEXT: %indvar.end = phi i32 [ 0, %entry ], [ %idx.next.lcssa, %main.exit.selector ] +; CHECK-NEXT: br label %postloop +; CHECK: postloop: +; CHECK-NEXT: br label %loop.postloop +; CHECK: loop.postloop: +; CHECK-NEXT: %idx.postloop = phi i32 [ %idx.copy, %postloop ], [ %idx.next.postloop, %in.bounds.postloop ] +; CHECK-NEXT: %idx.next.postloop = add i32 %idx.postloop, 1 +; CHECK-NEXT: %abc.postloop = icmp slt i32 %idx.postloop, %exit.mainloop.at +; CHECK-NEXT: br i1 %abc.postloop, label %in.bounds.postloop, label %out.of.bounds.loopexit +; CHECK: in.bounds.postloop: +; CHECK-NEXT: %addr.postloop = getelementptr i32, i32* %arr, i32 %idx.postloop +; CHECK-NEXT: store i32 0, i32* %addr.postloop +; CHECK-NEXT: %next.postloop = icmp slt i32 %idx.next.postloop, 2147483647 +; CHECK-NEXT: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 
0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp slt i32 %idx.next, 2147483647 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Iterate from SINT_MAX to 0, check that the pre-loop is generated. +define void @test_02(i32* %arr, i32* %a_len_ptr) { + +; CHECK: test_02( +; CHECK: entry: +; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +; CHECK-NEXT: br i1 true, label %loop.preloop.preheader +; CHECK: mainloop: +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: %idx = phi i32 [ %idx.preloop.copy, %mainloop ], [ %idx.next, %in.bounds ] +; CHECK-NEXT: %idx.next = add i32 %idx, -1 +; CHECK-NEXT: %abc = icmp slt i32 %idx, %len +; CHECK-NEXT: br i1 true, label %in.bounds +; CHECK: in.bounds: +; CHECK-NEXT: %addr = getelementptr i32, i32* %arr, i32 %idx +; CHECK-NEXT: store i32 0, i32* %addr +; CHECK-NEXT: %next = icmp sgt i32 %idx.next, -1 +; CHECK-NEXT: br i1 %next, label %loop, label %exit.loopexit +; CHECK: loop.preloop: +; CHECK-NEXT: %idx.preloop = phi i32 [ %idx.next.preloop, %in.bounds.preloop ], [ 2147483647, %loop.preloop.preheader ] +; CHECK-NEXT: %idx.next.preloop = add i32 %idx.preloop, -1 +; CHECK-NEXT: %abc.preloop = icmp slt i32 %idx.preloop, %len +; CHECK-NEXT: br i1 %abc.preloop, label %in.bounds.preloop, label %out.of.bounds.loopexit +; CHECK: in.bounds.preloop: +; CHECK-NEXT: %addr.preloop = getelementptr i32, i32* %arr, i32 %idx.preloop +; CHECK-NEXT: store i32 0, i32* %addr.preloop +; CHECK-NEXT: %next.preloop = icmp sgt i32 %idx.next.preloop, -1 +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 %idx.next.preloop, -1 +; CHECK-NEXT: br i1 [[COND]], label %loop.preloop, label %preloop.exit.selector + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = 
phi i32 [ 2147483647, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp sgt i32 %idx.next, -1 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +!0 = !{i32 0, i32 50} diff --git a/test/Transforms/Inline/AArch64/ext.ll b/test/Transforms/Inline/AArch64/ext.ll new file mode 100644 index 0000000000000..04095c04ee869 --- /dev/null +++ b/test/Transforms/Inline/AArch64/ext.ll @@ -0,0 +1,249 @@ +; REQUIRES: asserts +; RUN: opt -inline -mtriple=aarch64--linux-gnu -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define i32 @outer1(i32* %ptr, i32 %i) { + %C = call i32 @inner1(i32* %ptr, i32 %i) + ret i32 %C +} + +; sext can be folded into gep. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner1(i32* %ptr, i32 %i) { + %E = sext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i32 @outer2(i32* %ptr, i32 %i) { + %C = call i32 @inner2(i32* %ptr, i32 %i) + ret i32 %C +} + +; zext from i32 to i64 is free. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner2(i32* %ptr, i32 %i) { + %E = zext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i32 @outer3(i32* %ptr, i16 %i) { + %C = call i32 @inner3(i32* %ptr, i16 %i) + ret i32 %C +} + +; zext can be folded into gep. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner3(i32* %ptr, i16 %i) { + %E = zext i16 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i16 @outer4(i8* %ptr) { + %C = call i16 @inner4(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner4(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i16 @outer5(i8* %ptr) { + %C = call i16 @inner5(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i16 + ret i16 %E +} + +define i32 @outer6(i8* %ptr) { + %C = call i32 @inner6(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner6(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer7(i8* %ptr) { + %C = call i32 @inner7(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner7(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i32 + ret i32 %E +} + +define i32 @outer8(i16* %ptr) { + %C = call i32 @inner8(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner8(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer9(i16* %ptr) { + %C = call i32 @inner9(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner9(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer10(i8* %ptr) { + %C = call i64 @inner10(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner10 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner10(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer11(i8* %ptr) { + %C = call i64 @inner11(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner11 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner11(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i64 + ret i64 %E +} + +define i64 @outer12(i16* %ptr) { + %C = call i64 @inner12(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner12 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner12(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer13(i16* %ptr) { + %C = call i64 @inner13(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner13 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner13(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer14(i32* %ptr) { + %C = call i64 @inner14(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner14 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner14(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer15(i32* %ptr) { + %C = call i64 @inner15(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner15 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner15(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} + +define i64 @outer16(i32 %V1, i64 %V2) { + %C = call i64 @inner16(i32 %V1, i64 %V2) + ret i64 %C +} + +; sext can be folded into shl. +; CHECK: Analyzing call of inner16 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 4 +define i64 @inner16(i32 %V1, i64 %V2) { + %E = sext i32 %V1 to i64 + %S = shl i64 %E, 3 + %A = add i64 %V2, %S + ret i64 %A +} diff --git a/test/Transforms/Inline/PowerPC/ext.ll b/test/Transforms/Inline/PowerPC/ext.ll new file mode 100644 index 0000000000000..f7a409467b2c0 --- /dev/null +++ b/test/Transforms/Inline/PowerPC/ext.ll @@ -0,0 +1,140 @@ +; REQUIRES: asserts +; RUN: opt -inline -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64le-ibm-linux-gnu" + +define i16 @outer1(i8* %ptr) { + %C = call i16 @inner1(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner1(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i32 @outer2(i8* %ptr) { + %C = call i32 @inner2(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner2(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer3(i16* %ptr) { + %C = call i32 @inner3(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner3(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer4(i16* %ptr) { + %C = call i32 @inner4(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner4(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer5(i8* %ptr) { + %C = call i64 @inner5(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer6(i16* %ptr) { + %C = call i64 @inner6(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner6(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer7(i16* %ptr) { + %C = call i64 @inner7(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner7(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer8(i32* %ptr) { + %C = call i64 @inner8(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner8(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer9(i32* %ptr) { + %C = call i64 @inner9(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner9(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} diff --git a/test/Transforms/Inline/PowerPC/lit.local.cfg b/test/Transforms/Inline/PowerPC/lit.local.cfg new file mode 100644 index 0000000000000..5d33887ff0a48 --- /dev/null +++ b/test/Transforms/Inline/PowerPC/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'PowerPC' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/Inline/X86/ext.ll b/test/Transforms/Inline/X86/ext.ll new file mode 100644 index 0000000000000..bffda38527998 --- /dev/null +++ b/test/Transforms/Inline/X86/ext.ll @@ -0,0 +1,201 @@ +; REQUIRES: asserts +; RUN: opt -inline -mtriple=x86_64-unknown-unknown -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +define i32 @outer1(i32* %ptr, i32 %i) { + %C = call i32 @inner1(i32* %ptr, i32 %i) + ret i32 %C +} + +; zext from i32 to i64 is free. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner1(i32* %ptr, i32 %i) { + %E = zext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i16 @outer2(i8* %ptr) { + %C = call i16 @inner2(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner2(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i16 @outer3(i8* %ptr) { + %C = call i16 @inner3(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner3(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i16 + ret i16 %E +} + +define i32 @outer4(i8* %ptr) { + %C = call i32 @inner4(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner4(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer5(i8* %ptr) { + %C = call i32 @inner5(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i32 + ret i32 %E +} + +define i32 @outer6(i16* %ptr) { + %C = call i32 @inner6(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner6(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer7(i16* %ptr) { + %C = call i32 @inner7(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner7(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer8(i8* %ptr) { + %C = call i64 @inner8(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner8(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer9(i8* %ptr) { + %C = call i64 @inner9(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner9(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i64 + ret i64 %E +} + +define i64 @outer10(i16* %ptr) { + %C = call i64 @inner10(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner10 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner10(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer11(i16* %ptr) { + %C = call i64 @inner11(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner11 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner11(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer12(i32* %ptr) { + %C = call i64 @inner12(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner12 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner12(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer13(i32* %ptr) { + %C = call i64 @inner13(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner13 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner13(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} diff --git a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll index 3c4e08b5b515c..9053578175094 100644 --- a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll +++ b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll @@ -1,7 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -; CHECK: llvm.umul.with.overflow define i32 @sterix(i32, i8, i64) { +; CHECK-LABEL: @sterix( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP0:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i8 [[TMP1:%.*]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV1]], 1945964878 +; CHECK-NEXT: [[SH_PROM:%.*]] = trunc i64 [[TMP2:%.*]] to i32 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], [[SH_PROM]] +; CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[SHR]] to i64 +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw nsw i64 [[CONV]], [[CONV2]] +; CHECK-NEXT: [[CONV6:%.*]] = and i64 [[MUL3]], 4294967295 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[CONV6]], [[MUL3]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[LOR_RHS:%.*]], label [[LOR_END:%.*]] +; CHECK: lor.rhs: +; CHECK-NEXT: [[AND:%.*]] = and i64 [[MUL3]], [[TMP2]] +; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[AND]] to i32 +; CHECK-NEXT: [[TOBOOL7:%.*]] = icmp eq i32 [[CONV4]], 0 +; CHECK-NEXT: [[PHITMP:%.*]] = zext i1 [[TOBOOL7]] to i32 +; CHECK-NEXT: br label [[LOR_END]] +; CHECK: lor.end: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[PHITMP]], [[LOR_RHS]] ] +; CHECK-NEXT: ret i32 [[TMP3]] +; entry: %conv = zext i32 %0 to i64 %conv1 = sext i8 %1 to i32 diff --git a/test/Transforms/InstCombine/and-not-or.ll b/test/Transforms/InstCombine/and-not-or.ll deleted file mode 100644 index 
a42140be28052..0000000000000 --- a/test/Transforms/InstCombine/and-not-or.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: opt < %s -instcombine -S | grep "and i32 %x, %y" | count 4 -; RUN: opt < %s -instcombine -S | not grep "or" - -define i32 @func1(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %n, %x - %a = and i32 %o, %y - ret i32 %a -} - -define i32 @func2(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %x, %n - %a = and i32 %o, %y - ret i32 %a -} - -define i32 @func3(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %n, %x - %a = and i32 %y, %o - ret i32 %a -} - -define i32 @func4(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %x, %n - %a = and i32 %y, %o - ret i32 %a -} diff --git a/test/Transforms/InstCombine/and.ll b/test/Transforms/InstCombine/and.ll index 7bb9b95b31797..c12662d4db0e1 100644 --- a/test/Transforms/InstCombine/and.ll +++ b/test/Transforms/InstCombine/and.ll @@ -628,3 +628,195 @@ define i32 @test43(i32 %a, i32 %c, i32 %d) { %and = and i32 %or, %xor ret i32 %and } + +; (~y | x) & y -> x & y +define i32 @test44(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test44( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %n, %x + %a = and i32 %o, %y + ret i32 %a +} + +; (x | ~y) & y -> x & y +define i32 @test45(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test45( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %x, %n + %a = and i32 %o, %y + ret i32 %a +} + +; y & (~y | x) -> y & x +define i32 @test46(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test46( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %n, %x + %a = and i32 %y, %o + ret i32 %a +} + +; y & (x | ~y) -> y & x +define i32 @test47(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test47( +; 
CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %x, %n + %a = and i32 %y, %o + ret i32 %a +} + +; In the next 4 tests, vary the types and predicates for extra coverage. +; (X & (Y | ~X)) -> (X & Y), where 'not' is an inverted cmp + +define i1 @and_orn_cmp_1(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @and_orn_cmp_1( +; CHECK-NEXT: [[X:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sgt i32 %a, %b + %x_inv = icmp sle i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x_inv + %and = and i1 %x, %or + ret i1 %and +} + +; Commute the 'and': +; ((Y | ~X) & X) -> (X & Y), where 'not' is an inverted cmp + +define <2 x i1> @and_orn_cmp_2(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: @and_orn_cmp_2( +; CHECK-NEXT: [[X:%.*]] = icmp sge <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <2 x i32> [[C:%.*]], <i32 42, i32 47> +; CHECK-NEXT: [[AND:%.*]] = and <2 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[AND]] +; + %x = icmp sge <2 x i32> %a, %b + %x_inv = icmp slt <2 x i32> %a, %b + %y = icmp ugt <2 x i32> %c, <i32 42, i32 47> ; thwart complexity-based ordering + %or = or <2 x i1> %y, %x_inv + %and = and <2 x i1> %or, %x + ret <2 x i1> %and +} + +; Commute the 'or': +; (X & (~X | Y)) -> (X & Y), where 'not' is an inverted cmp + +define i1 @and_orn_cmp_3(i72 %a, i72 %b, i72 %c) { +; CHECK-LABEL: @and_orn_cmp_3( +; CHECK-NEXT: [[X:%.*]] = icmp ugt i72 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i72 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp ugt i72 %a, %b + %x_inv = icmp ule i72 %a, %b + %y = icmp ugt i72 %c, 42 ; thwart complexity-based ordering + %or = or i1 %x_inv, %y + %and = and i1 %x, %or + ret i1 %and +} + +; 
Commute the 'and': +; ((~X | Y) & X) -> (X & Y), where 'not' is an inverted cmp + +define <3 x i1> @or_andn_cmp_4(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_4( +; CHECK-NEXT: [[X:%.*]] = icmp eq <3 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <3 x i32> [[C:%.*]], <i32 42, i32 43, i32 -1> +; CHECK-NEXT: [[AND:%.*]] = and <3 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[AND]] +; + %x = icmp eq <3 x i32> %a, %b + %x_inv = icmp ne <3 x i32> %a, %b + %y = icmp ugt <3 x i32> %c, <i32 42, i32 43, i32 -1> ; thwart complexity-based ordering + %or = or <3 x i1> %x_inv, %y + %and = and <3 x i1> %or, %x + ret <3 x i1> %and +} + +; In the next 4 tests, vary the types and predicates for extra coverage. +; (~X & (Y | X)) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_1(i37 %a, i37 %b, i37 %c) { +; CHECK-LABEL: @andn_or_cmp_1( +; CHECK-NEXT: [[X_INV:%.*]] = icmp sle i37 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i37 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[X_INV]], [[Y]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sgt i37 %a, %b + %x_inv = icmp sle i37 %a, %b + %y = icmp ugt i37 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x + %and = and i1 %x_inv, %or + ret i1 %and +} + +; Commute the 'and': +; ((Y | X) & ~X) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_2(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: @andn_or_cmp_2( +; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i16 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp sge i16 %a, %b + %x_inv = icmp slt i16 %a, %b + %y = icmp ugt i16 %c, 42 ; thwart complexity-based ordering + %or = or i1 %y, %x + %and = and i1 %or, %x_inv + ret i1 %and +} + +; Commute the 'or': +; (~X & (X | Y)) -> (~X & Y), where 'not' is an inverted cmp + +define <4 x i1> @andn_or_cmp_3(<4 x i32> %a, <4 x i32> %b, 
<4 x i32> %c) { +; CHECK-LABEL: @andn_or_cmp_3( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ule <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <4 x i32> [[C:%.*]], <i32 42, i32 0, i32 1, i32 -1> +; CHECK-NEXT: [[AND:%.*]] = and <4 x i1> [[X_INV]], [[Y]] +; CHECK-NEXT: ret <4 x i1> [[AND]] +; + %x = icmp ugt <4 x i32> %a, %b + %x_inv = icmp ule <4 x i32> %a, %b + %y = icmp ugt <4 x i32> %c, <i32 42, i32 0, i32 1, i32 -1> ; thwart complexity-based ordering + %or = or <4 x i1> %x, %y + %and = and <4 x i1> %x_inv, %or + ret <4 x i1> %and +} + +; Commute the 'and': +; ((X | Y) & ~X) -> (~X & Y), where 'not' is an inverted cmp + +define i1 @andn_or_cmp_4(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @andn_or_cmp_4( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %x = icmp eq i32 %a, %b + %x_inv = icmp ne i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %or = or i1 %x, %y + %and = and i1 %or, %x_inv + ret i1 %and +} diff --git a/test/Transforms/InstCombine/and2.ll b/test/Transforms/InstCombine/and2.ll index 001ac58891e46..15772d158f624 100644 --- a/test/Transforms/InstCombine/and2.ll +++ b/test/Transforms/InstCombine/and2.ll @@ -98,8 +98,7 @@ define i64 @test9(i64 %x) { ; combine -x & 1 into x & 1 define <2 x i64> @test9vec(<2 x i64> %x) { ; CHECK-LABEL: @test9vec( -; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i64> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> [[SUB]], <i64 1, i64 1> +; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> %x, <i64 1, i64 1> ; CHECK-NEXT: ret <2 x i64> [[AND]] ; %sub = sub nsw <2 x i64> <i64 0, i64 0>, %x @@ -119,6 +118,88 @@ define i64 @test10(i64 %x) { ret i64 %add } +; (1 << x) & 1 --> zext(x == 0) + +define i8 @and1_shl1_is_cmp_eq_0(i8 %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %x, 0 +; 
CHECK-NEXT: [[AND:%.*]] = zext i1 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[AND]] +; + %sh = shl i8 1, %x + %and = and i8 %sh, 1 + ret i8 %and +} + +; Don't do it if the shift has another use. + +define i8 @and1_shl1_is_cmp_eq_0_multiuse(i8 %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_multiuse( +; CHECK-NEXT: [[SH:%.*]] = shl i8 1, %x +; CHECK-NEXT: [[AND:%.*]] = and i8 [[SH]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SH]], [[AND]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %sh = shl i8 1, %x + %and = and i8 %sh, 1 + %add = add i8 %sh, %and + ret i8 %add +} + +; (1 << x) & 1 --> zext(x == 0) + +define <2 x i8> @and1_shl1_is_cmp_eq_0_vec(<2 x i8> %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[AND]] +; + %sh = shl <2 x i8> <i8 1, i8 1>, %x + %and = and <2 x i8> %sh, <i8 1, i8 1> + ret <2 x i8> %and +} + +; (1 >> x) & 1 --> zext(x == 0) + +define i8 @and1_lshr1_is_cmp_eq_0(i8 %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %x, 0 +; CHECK-NEXT: [[AND:%.*]] = zext i1 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[AND]] +; + %sh = lshr i8 1, %x + %and = and i8 %sh, 1 + ret i8 %and +} + +; Don't do it if the shift has another use. 
+ +define i8 @and1_lshr1_is_cmp_eq_0_multiuse(i8 %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_multiuse( +; CHECK-NEXT: [[SH:%.*]] = lshr i8 1, %x +; CHECK-NEXT: [[AND:%.*]] = and i8 [[SH]], 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SH]], [[AND]] +; CHECK-NEXT: ret i8 [[ADD]] +; + %sh = lshr i8 1, %x + %and = and i8 %sh, 1 + %add = add i8 %sh, %and + ret i8 %add +} + +; (1 >> x) & 1 --> zext(x == 0) + +define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec(<2 x i8> %x) { +; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[AND]] +; + %sh = lshr <2 x i8> <i8 1, i8 1>, %x + %and = and <2 x i8> %sh, <i8 1, i8 1> + ret <2 x i8> %and +} + ; The add in this test is unnecessary because the LSBs of the LHS are 0 and the 'and' only consumes bits from those LSBs. It doesn't matter what happens to the upper bits. define i32 @test11(i32 %a, i32 %b) { ; CHECK-LABEL: @test11( diff --git a/test/Transforms/InstCombine/element-atomic-memintrins.ll b/test/Transforms/InstCombine/element-atomic-memintrins.ll new file mode 100644 index 0000000000000..2e3bfd7b721d6 --- /dev/null +++ b/test/Transforms/InstCombine/element-atomic-memintrins.ll @@ -0,0 +1,98 @@ +;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] intrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that inst combine handles these intrinsics properly once they have been +;; added to that class hierarchy. 
+ +; RUN: opt -instcombine -S < %s | FileCheck %s + +;; ---- memset ----- + +; Ensure 0-length memset isn't removed +define void @test_memset_zero_length(i8* %dest) { + ; CHECK-LABEL: test_memset_zero_length + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1) + ret void +} + +; Ensure that small-sized memsets don't convert to stores +define void @test_memset_to_store(i8* %dest) { + ; CHECK-LABEL: test_memset_to_store + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1) + ret void +} + +declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture writeonly, i8, i32, i32) nounwind argmemonly + + +;; ========================================= +;; ----- memmove ------ + +; memmove from a global constant source does not become memcpy +@gconst = constant [8 x i8] c"0123456\00" +define void @test_memmove_to_memcpy(i8* %dest) { + ; CHECK-LABEL: test_memmove_to_memcpy + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 
getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1) + ret void +} + +define void @test_memmove_zero_length(i8* %dest, i8* %src) { + ; CHECK-LABEL: test_memmove_zero_length + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16) + ret void +} + +; memmove with src==dest is removed +define void @test_memmove_removed(i8* %srcdest, i32 %sz) { + ; CHECK-LABEL: test_memmove_removed + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1) + ; CHECK-NEXT: call void 
@llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16) + ret void +} + +; memmove with a small constant length is converted to a load/store pair +define void @test_memmove_loadstore(i8* %dest, i8* %src) { + ; CHECK-LABEL: test_memmove_loadstore + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 
%dest, i8* align 1 %src, i32 1, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1) + ret void +} + +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly diff --git a/test/Transforms/InstCombine/icmp-logical.ll b/test/Transforms/InstCombine/icmp-logical.ll index faae2016e2075..aa95cc5a13164 100644 --- a/test/Transforms/InstCombine/icmp-logical.ll +++ b/test/Transforms/InstCombine/icmp-logical.ll @@ -1,159 +1,138 @@ ; RUN: opt -instcombine -S -o - %s | FileCheck %s define i1 @masked_and_notallzeroes(i32 %A) { -; CHECK-LABEL: @masked_and_notallzeroes -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp ne i32 [[MASK]], 0 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notallzeroes( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp ne i32 [[MASK1]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, 0 - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allzeroes(i32 %A) { -; CHECK-LABEL: @masked_or_allzeroes -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allzeroes( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 0 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_and_notallones(i32 %A) { -; CHECK-LABEL: 
@masked_and_notallones -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp ne i32 [[MASK]], 7 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notallones( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp ne i32 [[MASK1]], 7 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, 7 - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, 39 - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allones(i32 %A) { -; CHECK-LABEL: @masked_or_allones -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp eq i32 [[MASK]], 7 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allones( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 7 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, 7 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 39 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_and_notA(i32 %A) { -; CHECK-LABEL: @masked_and_notA -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp ne i32 [[MASK]], %A -; CHECK-NOT: and i32 %A, 7 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notA( +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp ne i32 [[MASK2]], %A +; CHECK-NEXT: ret i1 [[TST2]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, %A - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, %A - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_A(i32 %A) { -; CHECK-LABEL: @masked_or_A -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp eq i32 [[MASK]], %A -; CHECK-NOT: and i32 %A, 7 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_A( +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASK2]], %A +; CHECK-NEXT: ret i1 [[TST2]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, %A - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, %A - %res = or i1 %tst1, %tst2 ret i1 
%res } define i1 @masked_or_allzeroes_notoptimised(i32 %A) { -; CHECK-LABEL: @masked_or_allzeroes_notoptimised -; CHECK: [[MASK:%.*]] = and i32 %A, 15 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allzeroes_notoptimised( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 15 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 0 +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASK2]], 0 +; CHECK-NEXT: [[RES:%.*]] = or i1 [[TST1]], [[TST2]] +; CHECK-NEXT: ret i1 [[RES]] +; %mask1 = and i32 %A, 15 %tst1 = icmp eq i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 0 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @nomask_lhs(i32 %in) { -; CHECK-LABEL: @nomask_lhs -; CHECK: [[MASK:%.*]] = and i32 %in, 1 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: icmp -; CHECK: ret i1 +; CHECK-LABEL: @nomask_lhs( +; CHECK-NEXT: [[MASKED:%.*]] = and i32 %in, 1 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASKED]], 0 +; CHECK-NEXT: ret i1 [[TST2]] +; %tst1 = icmp eq i32 %in, 0 - %masked = and i32 %in, 1 %tst2 = icmp eq i32 %masked, 0 - %val = or i1 %tst1, %tst2 ret i1 %val } - define i1 @nomask_rhs(i32 %in) { -; CHECK-LABEL: @nomask_rhs -; CHECK: [[MASK:%.*]] = and i32 %in, 1 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: icmp -; CHECK: ret i1 +; CHECK-LABEL: @nomask_rhs( +; CHECK-NEXT: [[MASKED:%.*]] = and i32 %in, 1 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASKED]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %masked = and i32 %in, 1 %tst1 = icmp eq i32 %masked, 0 - %tst2 = icmp eq i32 %in, 0 - %val = or i1 %tst1, %tst2 ret i1 %val } +; TODO: This test simplifies to a constant, so the functionality and test could be in InstSimplify. 
+ define i1 @fold_mask_cmps_to_false(i32 %x) { -; CHECK-LABEL: @fold_mask_cmps_to_false -; CHECK: ret i1 false +; CHECK-LABEL: @fold_mask_cmps_to_false( +; CHECK-NEXT: ret i1 false +; %1 = and i32 %x, 2147483647 %2 = icmp eq i32 %1, 0 %3 = icmp eq i32 %x, 2147483647 @@ -161,12 +140,46 @@ define i1 @fold_mask_cmps_to_false(i32 %x) { ret i1 %4 } +; TODO: This test simplifies to a constant, so the functionality and test could be in InstSimplify. + define i1 @fold_mask_cmps_to_true(i32 %x) { -; CHECK-LABEL: @fold_mask_cmps_to_true -; CHECK: ret i1 true +; CHECK-LABEL: @fold_mask_cmps_to_true( +; CHECK-NEXT: ret i1 true +; %1 = and i32 %x, 2147483647 %2 = icmp ne i32 %1, 0 %3 = icmp ne i32 %x, 2147483647 %4 = or i1 %3, %2 ret i1 %4 } + +; PR32401 - https://bugs.llvm.org/show_bug.cgi?id=32401 + +define i1 @cmpeq_bitwise(i8 %a, i8 %b, i8 %c, i8 %d) { +; CHECK-LABEL: @cmpeq_bitwise( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 %c, %d +; CHECK-NEXT: [[CMP:%.*]] = and i1 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %xor1 = xor i8 %a, %b + %xor2 = xor i8 %c, %d + %or = or i8 %xor1, %xor2 + %cmp = icmp eq i8 %or, 0 + ret i1 %cmp +} + +define <2 x i1> @cmpne_bitwise(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { +; CHECK-LABEL: @cmpne_bitwise( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> %c, %d +; CHECK-NEXT: [[CMP:%.*]] = or <2 x i1> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %xor1 = xor <2 x i64> %a, %b + %xor2 = xor <2 x i64> %c, %d + %or = or <2 x i64> %xor1, %xor2 + %cmp = icmp ne <2 x i64> %or, zeroinitializer + ret <2 x i1> %cmp +} + diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll index 947971c6c83b0..be64f51b6c4c5 100644 --- a/test/Transforms/InstCombine/or-xor.ll +++ b/test/Transforms/InstCombine/or-xor.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py ; RUN: opt -S -instcombine < %s | FileCheck %s -define i32 @test1(i32 %x, i32 %y) nounwind { +; X | ~(X | Y) --> X | ~Y + +define i32 @test1(i32 %x, i32 %y) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x @@ -13,7 +15,10 @@ define i32 @test1(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test2(i32 %x, i32 %y) nounwind { +; Commute (rename) the inner 'or' operands: +; Y | ~(X | Y) --> ~X | Y + +define i32 @test2(i32 %x, i32 %y) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -25,7 +30,9 @@ define i32 @test2(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test3(i32 %x, i32 %y) nounwind { +; X | ~(X ^ Y) --> X | ~Y + +define i32 @test3(i32 %x, i32 %y) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x @@ -37,7 +44,10 @@ define i32 @test3(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test4(i32 %x, i32 %y) nounwind { +; Commute (rename) the 'xor' operands: +; Y | ~(X ^ Y) --> ~X | Y + +define i32 @test4(i32 %x, i32 %y) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -49,7 +59,7 @@ define i32 @test4(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test5(i32 %x, i32 %y) nounwind { +define i32 @test5(i32 %x, i32 %y) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: ret i32 -1 ; @@ -59,7 +69,7 @@ define i32 @test5(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test6(i32 %x, i32 %y) nounwind { +define i32 @test6(i32 %x, i32 %y) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: ret i32 -1 ; @@ -69,7 +79,7 @@ define i32 @test6(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test7(i32 %x, i32 %y) nounwind { +define i32 @test7(i32 %x, i32 %y) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[Z:%.*]] = or i32 %x, %y ; CHECK-NEXT: ret i32 [[Z]] @@ -79,7 +89,7 @@ define i32 
@test7(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test8(i32 %x, i32 %y) nounwind { +define i32 @test8(i32 %x, i32 %y) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -91,7 +101,7 @@ define i32 @test8(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test9(i32 %x, i32 %y) nounwind { +define i32 @test9(i32 %x, i32 %y) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll index 764fe4503b5e1..fb56449ba4d46 100644 --- a/test/Transforms/InstCombine/or.ll +++ b/test/Transforms/InstCombine/or.ll @@ -397,14 +397,74 @@ define <2 x i132> @orsext_to_sel_vec_swap(<2 x i132> %x, <2 x i1> %y) { ret <2 x i132> %or } -define i32 @test39(i32 %a, i32 %b) { -; CHECK-LABEL: @test39( -; CHECK-NEXT: [[OR:%.*]] = or i32 %b, %a +; (~A & B) | A --> A | B + +define i32 @test39a(i32 %a, float %b) { +; CHECK-LABEL: @test39a( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] ; CHECK-NEXT: ret i32 [[OR]] ; - %xor = xor i32 %a, -1 - %and = and i32 %xor, %b - %or = or i32 %and, %a + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %nota, %b1 + %or = or i32 %and, %a1 + ret i32 %or +} + +; Commute 'and' operands: +; (B & ~A) | A --> A | B + +define i32 @test39b(i32 %a, float %b) { +; CHECK-LABEL: @test39b( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %b1, %nota + %or = 
or i32 %and, %a1 + ret i32 %or +} + +; Commute 'or' operands: +; A | (~A & B) --> A | B + +define i32 @test39c(i32 %a, float %b) { +; CHECK-LABEL: @test39c( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %nota, %b1 + %or = or i32 %a1, %and + ret i32 %or +} + +; Commute 'and' operands: +; A | (B & ~A) --> A | B + +define i32 @test39d(i32 %a, float %b) { +; CHECK-LABEL: @test39d( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %b1, %nota + %or = or i32 %a1, %and ret i32 %or } @@ -456,60 +516,6 @@ define i32 @test40d(i32 %a, i32 %b) { ret i32 %or } -define i32 @test41(i32 %a, i32 %b) { -; CHECK-LABEL: @test41( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %and = and i32 %a, %b - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %or = or i32 %and, %xor - ret i32 %or -} - -; (~A ^ B) | (A & B) -> (~A ^ B) - -define i32 @test42(i32 %a, i32 %b) { -; CHECK-LABEL: @test42( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %and = and i32 %a, %b - %or = or i32 %xor, %and - ret i32 %or -} - -define i32 @test42_commuted_and(i32 %a, i32 %b) { -; CHECK-LABEL: @test42_commuted_and( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; 
CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %and = and i32 %b, %a - %or = or i32 %xor, %and - ret i32 %or -} - -define i32 @test42_commuted_xor(i32 %a, i32 %b) { -; CHECK-LABEL: @test42_commuted_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %b, %nega - %and = and i32 %a, %b - %or = or i32 %xor, %and - ret i32 %or -} - define i32 @test45(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test45( ; CHECK-NEXT: [[TMP1:%.*]] = and i32 %x, %z @@ -648,41 +654,146 @@ final: ret <2 x i32> %value } -define i8 @test51(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test51( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; In the next 4 tests, vary the types and predicates for extra coverage. +; (X | (Y & ~X)) -> (X | Y), where 'not' is an inverted cmp + +define i1 @or_andn_cmp_1(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @or_andn_cmp_1( +; CHECK-NEXT: [[X:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp sgt i32 %a, %b + %x_inv = icmp sle i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x_inv + %or = or i1 %x, %and + ret i1 %or +} + +; Commute the 'or': +; ((Y & ~X) | X) -> (X | Y), where 'not' is an inverted cmp + +define <2 x i1> @or_andn_cmp_2(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_2( +; CHECK-NEXT: [[X:%.*]] = icmp sge <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <2 x i32> [[C:%.*]], <i32 42, i32 47> +; CHECK-NEXT: [[OR:%.*]] = or <2 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[OR]] +; + %x = icmp sge <2 x i32> %a, %b + %x_inv = icmp slt <2 x i32> %a, %b + %y = icmp ugt <2 x i32> %c, <i32 42, i32 47> ; 
thwart complexity-based ordering + %and = and <2 x i1> %y, %x_inv + %or = or <2 x i1> %and, %x + ret <2 x i1> %or +} + +; Commute the 'and': +; (X | (~X & Y)) -> (X | Y), where 'not' is an inverted cmp + +define i1 @or_andn_cmp_3(i72 %a, i72 %b, i72 %c) { +; CHECK-LABEL: @or_andn_cmp_3( +; CHECK-NEXT: [[X:%.*]] = icmp ugt i72 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i72 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp ugt i72 %a, %b + %x_inv = icmp ule i72 %a, %b + %y = icmp ugt i72 %c, 42 ; thwart complexity-based ordering + %and = and i1 %x_inv, %y + %or = or i1 %x, %and + ret i1 %or +} + +; Commute the 'or': +; ((~X & Y) | X) -> (X | Y), where 'not' is an inverted cmp + +define <3 x i1> @or_andn_cmp_4(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_4( +; CHECK-NEXT: [[X:%.*]] = icmp eq <3 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <3 x i32> [[C:%.*]], <i32 42, i32 43, i32 -1> +; CHECK-NEXT: [[OR:%.*]] = or <3 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[OR]] +; + %x = icmp eq <3 x i32> %a, %b + %x_inv = icmp ne <3 x i32> %a, %b + %y = icmp ugt <3 x i32> %c, <i32 42, i32 43, i32 -1> ; thwart complexity-based ordering + %and = and <3 x i1> %x_inv, %y + %or = or <3 x i1> %and, %x + ret <3 x i1> %or +} + +; In the next 4 tests, vary the types and predicates for extra coverage. 
+; (~X | (Y & X)) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_1(i37 %a, i37 %b, i37 %c) { +; CHECK-LABEL: @orn_and_cmp_1( +; CHECK-NEXT: [[X_INV:%.*]] = icmp sle i37 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i37 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X_INV]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp sgt i37 %a, %b + %x_inv = icmp sle i37 %a, %b + %y = icmp ugt i37 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x + %or = or i1 %x_inv, %and + ret i1 %or +} + +; Commute the 'or': +; ((Y & X) | ~X) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_2(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: @orn_and_cmp_2( +; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i16 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %a, -1 - %y = and i8 %w, %z - %x = or i8 %y, %a - ret i8 %x + %x = icmp sge i16 %a, %b + %x_inv = icmp slt i16 %a, %b + %y = icmp ugt i16 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x + %or = or i1 %and, %x_inv + ret i1 %or } -define i8 @test52(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test52( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; Commute the 'and': +; (~X | (X & Y)) -> (~X | Y), where 'not' is an inverted cmp + +define <4 x i1> @orn_and_cmp_3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: @orn_and_cmp_3( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ule <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <4 x i32> [[C:%.*]], <i32 42, i32 0, i32 1, i32 -1> +; CHECK-NEXT: [[OR:%.*]] = or <4 x i1> [[X_INV]], [[Y]] +; CHECK-NEXT: ret <4 x i1> [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %w, -1 - %y = and i8 %z, %a - %x = or i8 %w, %y - ret i8 %x + %x = icmp ugt <4 x i32> %a, %b + %x_inv = icmp ule <4 x i32> %a, %b 
+ %y = icmp ugt <4 x i32> %c, <i32 42, i32 0, i32 1, i32 -1> ; thwart complexity-based ordering + %and = and <4 x i1> %x, %y + %or = or <4 x i1> %x_inv, %and + ret <4 x i1> %or } -define i8 @test53(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test53( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; Commute the 'or': +; ((X & Y) | ~X) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_4(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @orn_and_cmp_4( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %w, -1 - %y = and i8 %z, %a - %x = or i8 %w, %y - ret i8 %x + %x = icmp eq i32 %a, %b + %x_inv = icmp ne i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %and = and i1 %x, %y + %or = or i1 %and, %x_inv + ret i1 %or } diff --git a/test/Transforms/InstCombine/pr33765.ll b/test/Transforms/InstCombine/pr33765.ll new file mode 100644 index 0000000000000..99ed0d13b5cf5 --- /dev/null +++ b/test/Transforms/InstCombine/pr33765.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S %s -instcombine | FileCheck %s + +@glob = external global i16 + +define void @patatino(i8 %beth) { +; CHECK-LABEL: @patatino( +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[BETH:%.*]] to i32 +; CHECK-NEXT: br i1 undef, label [[IF_THEN9:%.*]], label [[IF_THEN9]] +; CHECK: if.then9: +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[TINKY:%.*]] = load i16, i16* @glob, align 2 +; CHECK-NEXT: [[CONV131:%.*]] = zext i16 [[TINKY]] to i32 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[MUL]], [[CONV131]] +; CHECK-NEXT: [[CONV14:%.*]] = trunc i32 [[AND]] to i16 +; CHECK-NEXT: store i16 [[CONV14]], i16* @glob, align 2 +; CHECK-NEXT: ret void +; 
+ %conv = zext i8 %beth to i32 + %mul = mul nuw nsw i32 %conv, %conv + %conv3 = and i32 %mul, 255 + %tobool8 = icmp ne i32 %mul, %conv3 + br i1 %tobool8, label %if.then9, label %if.then9 + +if.then9: + %tinky = load i16, i16* @glob + %conv13 = sext i16 %tinky to i32 + %and = and i32 %mul, %conv13 + %conv14 = trunc i32 %and to i16 + store i16 %conv14, i16* @glob + ret void +} diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll index 6a3cf7edd7dcd..5e84ec54971a0 100644 --- a/test/Transforms/JumpThreading/select.ll +++ b/test/Transforms/JumpThreading/select.ll @@ -280,10 +280,85 @@ cond.false.15.i: ; preds = %cond.false.10.i ret i32 %j.add3 ; CHECK-LABEL: @unfold3 -; CHECK: br i1 %cmp.i, label %.exit.thread2, label %cond.false.i +; CHECK: br i1 %cmp.i, label %.exit.thread2, label %cond.false.i ; CHECK: br i1 %cmp4.i, label %.exit.thread, label %cond.false.6.i ; CHECK: br i1 %cmp8.i, label %.exit.thread2, label %cond.false.10.i ; CHECK: br i1 %cmp13.i, label %.exit.thread, label %.exit ; CHECK: br i1 %phitmp, label %.exit.thread, label %.exit.thread2 ; CHECK: br label %.exit.thread2 } + +define i32 @unfold4(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 1, %entry ], [ 0, 
%cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 0, %cond.false.10.i ] + %lnot.i18 = icmp eq i32 %cond23.i, 1 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 %add3 + ret i32 %j.add3 + +; CHECK-LABEL: @unfold4 +; CHECK: br i1 %cmp.i, label %.exit.thread, label %cond.false.i +; CHECK: br i1 %cmp4.i, label %.exit.thread3, label %cond.false.6.i +; CHECK: br i1 %cmp8.i, label %.exit.thread, label %cond.false.10.i +; CHECK: br i1 %cmp13.i, label %.exit.thread3, label %.exit +; CHECK: br i1 %lnot.i18, label %.exit.thread, label %.exit.thread3 +; CHECK: br label %.exit.thread3 +} + +define i32 @unfold5(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 2, %entry ], [ 3, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 7, %cond.false.10.i ] + %lnot.i18 = icmp sgt i32 %cond23.i, 5 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 %cond23.i + ret i32 %j.add3 + +; CHECK-LABEL: @unfold5 +; CHECK: br i1 %cmp.i, label %.exit, label %cond.false.i +; CHECK: br i1 %cmp4.i, label %.exit, label %cond.false.6.i +; CHECK: br i1 %cmp8.i, label %.exit, label %cond.false.10.i +; CHECK: br i1 %cmp13.i, label %.exit, label %cond.false.15.i +; CHECK: br label %.exit +} diff --git 
a/test/Transforms/LoopInterchange/current-limitations-lcssa.ll b/test/Transforms/LoopInterchange/current-limitations-lcssa.ll new file mode 100644 index 0000000000000..df6c6cfdbcb5d --- /dev/null +++ b/test/Transforms/LoopInterchange/current-limitations-lcssa.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@C = common global [100 x [100 x i32]] zeroinitializer + +;; FIXME: +;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported. +;; for(gi=1;gi<N;gi++) +;; for(gj=1;gj<M;gj++) +;; A[gj][gi] = A[gj - 1][gi] + C[gj][gi]; + +@gi = common global i32 0 +@gj = common global i32 0 + +define void @interchange_07(i32 %N, i32 %M){ +entry: + store i32 1, i32* @gi + %cmp21 = icmp sgt i32 %N, 1 + br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end16 + +for.cond1.preheader.lr.ph: + %cmp218 = icmp sgt i32 %M, 1 + %gi.promoted = load i32, i32* @gi + %0 = add i32 %M, -1 + %1 = sext i32 %gi.promoted to i64 + %2 = sext i32 %N to i64 + %3 = add i32 %gi.promoted, 1 + %4 = icmp slt i32 %3, %N + %smax = select i1 %4, i32 %N, i32 %3 + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv25 = phi i64 [ %1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next26, %for.inc14 ] + br i1 %cmp218, label %for.body3, label %for.inc14 + +for.body3: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ] + %5 = add nsw i64 %indvars.iv, -1 + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25 + %6 = load i32, i32* %arrayidx5 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, 
i64 %indvars.iv25 + %7 = load i32, i32* %arrayidx9 + %add = add nsw i32 %7, %6 + %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv25 + store i32 %add, i32* %arrayidx13 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %0 + br i1 %exitcond, label %for.inc14, label %for.body3 + +for.inc14: + %inc.lcssa23 = phi i32 [ 1, %for.cond1.preheader ], [ %M, %for.body3 ] + %indvars.iv.next26 = add nsw i64 %indvars.iv25, 1 + %cmp = icmp slt i64 %indvars.iv.next26, %2 + br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end16_crit_edge + +for.cond.for.end16_crit_edge: + store i32 %inc.lcssa23, i32* @gj + store i32 %smax, i32* @gi + br label %for.end16 + +for.end16: + ret void +} + +; CHECK-LABEL: @interchange_07 +; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3 +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.body3.preheader ] +; CHECK: %5 = add nsw i64 %indvars.iv, -1 +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25 +; CHECK: %6 = load i32, i32* %arrayidx5 +; CHECK: %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25 diff --git a/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll b/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll new file mode 100644 index 0000000000000..c3b0b9291424b --- /dev/null +++ b/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll @@ -0,0 +1,118 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer +@C = common global [100 x [100 x i32]] zeroinitializer +@D = common global [100 x [100 x [100 x i32]]] zeroinitializer + +;; Test that a flow dependency in outer loop doesn't prevent interchange in +;; loops i and j. +;; +;; for (int k = 0; k < 100; ++k) { +;; T[k] = fn1(); +;; for (int i = 0; i < 1000; ++i) +;; for(int j = 1; j < 1000; ++j) +;; Arr[j][i] = Arr[j][i]+k; +;; fn2(T[k]); +;; } + +@T = internal global [100 x double] zeroinitializer, align 4 +@Arr = internal global [1000 x [1000 x i32]] zeroinitializer, align 4 + +define void @interchange_09(i32 %k) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup4 + ret void + +for.body: ; preds = %for.cond.cleanup4, %entry + %indvars.iv45 = phi i64 [ 0, %entry ], [ %indvars.iv.next46, %for.cond.cleanup4 ] + %call = call double @fn1() + %arrayidx = getelementptr inbounds [100 x double], [100 x double]* @T, i64 0, i64 %indvars.iv45 + store double %call, double* %arrayidx, align 8 + br label %for.cond6.preheader + +for.cond6.preheader: ; preds = %for.cond.cleanup8, %for.body + %indvars.iv42 = phi i64 [ 0, %for.body ], [ %indvars.iv.next43, %for.cond.cleanup8 ] + br label %for.body9 + +for.cond.cleanup4: ; preds = %for.cond.cleanup8 + %tmp = load double, double* %arrayidx, align 8 + call void @fn2(double %tmp) + %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1 + %exitcond47 = icmp ne i64 %indvars.iv.next46, 100 + br i1 %exitcond47, label %for.body, label %for.cond.cleanup + +for.cond.cleanup8: ; preds = %for.body9 + %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 + %exitcond44 = icmp ne i64 %indvars.iv.next43, 1000 + br i1 %exitcond44, label %for.cond6.preheader, label %for.cond.cleanup4 + +for.body9: ; preds = %for.body9, %for.cond6.preheader + %indvars.iv = phi i64 [ 
1, %for.cond6.preheader ], [ %indvars.iv.next, %for.body9 ] + %arrayidx13 = getelementptr inbounds [1000 x [1000 x i32]], [1000 x [1000 x i32]]* @Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv42 + %tmp1 = load i32, i32* %arrayidx13, align 4 + %tmp2 = trunc i64 %indvars.iv45 to i32 + %add = add nsw i32 %tmp1, %tmp2 + store i32 %add, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.body9, label %for.cond.cleanup8 +} + +declare double @fn1() +declare void @fn2(double) + + +;; After interchange %indvars.iv (j) should increment as the middle loop. +;; After interchange %indvars.iv42 (i) should increment with the inner most loop. + +; CHECK-LABEL: @interchange_09 + +; CHECK: for.body: +; CHECK: %indvars.iv45 = phi i64 [ %indvars.iv.next46, %for.cond.cleanup4 ], [ 0, %for.body.preheader ] +; CHECK: %call = call double @fn1() +; CHECK: %arrayidx = getelementptr inbounds [100 x double], [100 x double]* @T, i64 0, i64 %indvars.iv45 +; CHECK: store double %call, double* %arrayidx, align 8 +; CHECK: br label %for.body9.preheader + +; CHECK: for.cond6.preheader.preheader: +; CHECK: br label %for.cond6.preheader + +; CHECK: for.cond6.preheader: +; CHECK: %indvars.iv42 = phi i64 [ %indvars.iv.next43, %for.cond.cleanup8 ], [ 0, %for.cond6.preheader.preheader ] +; CHECK: br label %for.body9.split1 + +; CHECK: for.body9.preheader: +; CHECK: br label %for.body9 + +; CHECK: for.cond.cleanup4: +; CHECK: %tmp = load double, double* %arrayidx, align 8 +; CHECK: call void @fn2(double %tmp) +; CHECK: %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1 +; CHECK: %exitcond47 = icmp ne i64 %indvars.iv.next46, 100 +; CHECK: br i1 %exitcond47, label %for.body, label %for.cond.cleanup + +; CHECK: for.cond.cleanup8: +; CHECK: %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 +; CHECK: %exitcond44 = icmp ne i64 %indvars.iv.next43, 1000 +; CHECK: br i1 %exitcond44, label 
%for.cond6.preheader, label %for.body9.split + +; CHECK: for.body9: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body9.split ], [ 1, %for.body9.preheader ] +; CHECK: br label %for.cond6.preheader.preheader + +; CHECK: for.body9.split1: +; CHECK: %arrayidx13 = getelementptr inbounds [1000 x [1000 x i32]], [1000 x [1000 x i32]]* @Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv42 +; CHECK: store i32 %add, i32* %arrayidx13, align 4 +; CHECK: br label %for.cond.cleanup8 + +; CHECK: for.body9.split: +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %exitcond = icmp ne i64 %indvars.iv.next, 1000 +; CHECK: br i1 %exitcond, label %for.body9, label %for.cond.cleanup4 diff --git a/test/Transforms/LoopInterchange/interchange-not-profitable.ll b/test/Transforms/LoopInterchange/interchange-not-profitable.ll new file mode 100644 index 0000000000000..67a63cab08bd1 --- /dev/null +++ b/test/Transforms/LoopInterchange/interchange-not-profitable.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer + +;; Loops should not be interchanged in this case as it is not profitable. 
+;; for(int i=0;i<100;i++) +;; for(int j=0;j<100;j++) +;; A[i][j] = A[i][j]+k; + +define void @interchange_03(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv + %0 = load i32, i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.inc10, label %for.body3 + +for.inc10: + %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 + %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 + br i1 %exitcond23, label %for.end12, label %for.cond1.preheader + +for.end12: + ret void +} + +; CHECK-LABEL: @interchange_03 +; CHECK: entry: +; CHECK: br label %for.cond1.preheader.preheader +; CHECK: for.cond1.preheader.preheader: ; preds = %entry +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10 +; CHECK: %indvars.iv21 = phi i64 [ %indvars.iv.next22, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: br label %for.body3.preheader +; CHECK: for.body3.preheader: ; preds = %for.cond1.preheader +; CHECK: br label %for.body3 +; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3 +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: %add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %exitcond = icmp eq 
i64 %indvars.iv.next, 100 +; CHECK: br i1 %exitcond, label %for.inc10, label %for.body3 +; CHECK: for.inc10: ; preds = %for.body3 +; CHECK: %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 +; CHECK: %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 +; CHECK: br i1 %exitcond23, label %for.end12, label %for.cond1.preheader +; CHECK: for.end12: ; preds = %for.inc10 +; CHECK: ret void diff --git a/test/Transforms/LoopInterchange/interchange-output-dependencies.ll b/test/Transforms/LoopInterchange/interchange-output-dependencies.ll new file mode 100644 index 0000000000000..98deba96f8c6f --- /dev/null +++ b/test/Transforms/LoopInterchange/interchange-output-dependencies.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer + +;; Test to make sure we can handle output dependencies. 
+;; +;; for (int i = 0; i < 2; ++i) +;; for(int j = 0; j < 3; ++j) { +;; A[j][i] = i; +;; A[j][i+1] = j; +;; } + +@A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16 + +define void @interchange_10() { +entry: + br label %for.cond1.preheader + +for.cond.loopexit: ; preds = %for.body4 + %exitcond28 = icmp ne i64 %indvars.iv.next27, 2 + br i1 %exitcond28, label %for.cond1.preheader, label %for.cond.cleanup + +for.cond1.preheader: ; preds = %for.cond.loopexit, %entry + %indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ] + %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.loopexit + ret void + +for.body4: ; preds = %for.body4, %for.cond1.preheader + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ] + %arrayidx6 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv26 + %tmp = trunc i64 %indvars.iv26 to i32 + store i32 %tmp, i32* %arrayidx6, align 4 + %arrayidx10 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv.next27 + %tmp1 = trunc i64 %indvars.iv to i32 + store i32 %tmp1, i32* %arrayidx10, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 3 + br i1 %exitcond, label %for.body4, label %for.cond.loopexit +} + +; CHECK-LABEL: @interchange_10 +; CHECK: entry: +; CHECK: br label %for.body4.preheader + +; CHECK: for.cond1.preheader.preheader: +; CHECK: br label %for.cond1.preheader + +; CHECK: for.cond.loopexit: +; CHECK: %exitcond28 = icmp ne i64 %indvars.iv.next27, 2 +; CHECK: br i1 %exitcond28, label %for.cond1.preheader, label %for.body4.split + +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv26 = phi i64 [ %indvars.iv.next27, %for.cond.loopexit ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1 +; CHECK: 
br label %for.body4.split1 + +; CHECK: for.body4.preheader: +; CHECK: br label %for.body4 + +; CHECK: for.cond.cleanup: +; CHECK: ret void + +; CHECK: for.body4: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.split ], [ 0, %for.body4.preheader ] +; CHECK: br label %for.cond1.preheader.preheader + +; CHECK: for.body4.split1: +; CHECK: %arrayidx6 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv26 +; CHECK: %tmp = trunc i64 %indvars.iv26 to i32 +; CHECK: store i32 %tmp, i32* %arrayidx6, align 4 +; CHECK: %arrayidx10 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv.next27 +; CHECK: %tmp1 = trunc i64 %indvars.iv to i32 +; CHECK: store i32 %tmp1, i32* %arrayidx10, align 4 +; CHECK: br label %for.cond.loopexit + +; CHECK: for.body4.split: +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %exitcond = icmp ne i64 %indvars.iv.next, 3 +; CHECK: br i1 %exitcond, label %for.body4, label %for.cond.cleanup diff --git a/test/Transforms/LoopInterchange/interchange-simple-count-down.ll b/test/Transforms/LoopInterchange/interchange-simple-count-down.ll new file mode 100644 index 0000000000000..70ba5940257f5 --- /dev/null +++ b/test/Transforms/LoopInterchange/interchange-simple-count-down.ll @@ -0,0 +1,69 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer + +;; for(int i=0;i<100;i++) +;; for(int j=100;j>=0;j--) +;; A[j][i] = A[j][i]+k; + +define void @interchange_02(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 + %0 = load i32, i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp2 = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp2, label %for.body3, label %for.inc10 + +for.inc10: + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond = icmp eq i64 %indvars.iv.next20, 100 + br i1 %exitcond, label %for.end11, label %for.cond1.preheader + +for.end11: + ret void +} + +; CHECK-LABEL: @interchange_02 +; CHECK: entry: +; CHECK: br label %for.body3.preheader +; CHECK: for.cond1.preheader.preheader: +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: br label %for.body3.split1 +; CHECK: for.body3.preheader: +; CHECK: br label %for.body3 +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] +; CHECK: br label %for.cond1.preheader.preheader +; CHECK: for.body3.split1: ; preds = %for.cond1.preheader +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: 
%add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: br label %for.inc10 +; CHECK: for.body3.split: +; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 +; CHECK: br i1 %cmp2, label %for.body3, label %for.end11 +; CHECK: for.inc10: +; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 +; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader +; CHECK: for.end11: +; CHECK: ret void diff --git a/test/Transforms/LoopInterchange/interchange-simple-count-up.ll b/test/Transforms/LoopInterchange/interchange-simple-count-up.ll new file mode 100644 index 0000000000000..4febe0269810d --- /dev/null +++ b/test/Transforms/LoopInterchange/interchange-simple-count-up.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer + +;; for(int i=0;i<N;i++) +;; for(int j=1;j<N;j++) +;; A[j][i] = A[j][i]+k; + +define void @interchange_01(i32 %k, i32 %N) { +entry: + %cmp21 = icmp sgt i32 %N, 0 + br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end12 + +for.cond1.preheader.lr.ph: + %cmp219 = icmp sgt i32 %N, 1 + %0 = add i32 %N, -1 + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv23 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next24, %for.inc10 ] + br i1 %cmp219, label %for.body3, label %for.inc10 + +for.body3: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23 + %1 = load i32, i32* %arrayidx5 + %add = add nsw i32 %1, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %0 + br i1 %exitcond, label %for.inc10, label %for.body3 + +for.inc10: + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + %lftr.wideiv25 = trunc i64 %indvars.iv23 to i32 + %exitcond26 = icmp eq i32 %lftr.wideiv25, %0 + br i1 %exitcond26, label %for.end12, label %for.cond1.preheader + +for.end12: + ret void +} + +; CHECK-LABEL: @interchange_01 +; CHECK: entry: +; CHECK: %cmp21 = icmp sgt i32 %N, 0 +; CHECK: br i1 %cmp21, label %for.body3.preheader, label %for.end12 +; CHECK: for.cond1.preheader.lr.ph: +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv23 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next24, %for.inc10 ] +; CHECK: br i1 %cmp219, label %for.body3.split1, label %for.end12.loopexit +; CHECK: for.body3.preheader: +; CHECK: 
%cmp219 = icmp sgt i32 %N, 1 +; CHECK: %0 = add i32 %N, -1 +; CHECK: br label %for.body3 +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 1, %for.body3.preheader ] +; CHECK: br label %for.cond1.preheader.lr.ph +; CHECK: for.body3.split1: +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23 +; CHECK: %1 = load i32, i32* %arrayidx5 +; CHECK: %add = add nsw i32 %1, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: br label %for.inc10.loopexit +; CHECK: for.body3.split: +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %lftr.wideiv = trunc i64 %indvars.iv to i32 +; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, %0 +; CHECK: br i1 %exitcond, label %for.end12.loopexit, label %for.body3 +; CHECK: for.inc10.loopexit: +; CHECK: br label %for.inc10 +; CHECK: for.inc10: +; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 +; CHECK: %lftr.wideiv25 = trunc i64 %indvars.iv23 to i32 +; CHECK: %exitcond26 = icmp eq i32 %lftr.wideiv25, %0 +; CHECK: br i1 %exitcond26, label %for.body3.split, label %for.cond1.preheader +; CHECK: for.end12.loopexit: +; CHECK: br label %for.end12 +; CHECK: for.end12: +; CHECK: ret void diff --git a/test/Transforms/LoopInterchange/interchange.ll b/test/Transforms/LoopInterchange/interchange.ll deleted file mode 100644 index 77b33e43bedc7..0000000000000 --- a/test/Transforms/LoopInterchange/interchange.ll +++ /dev/null @@ -1,749 +0,0 @@ -; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s -;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. 
- -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@A = common global [100 x [100 x i32]] zeroinitializer -@B = common global [100 x i32] zeroinitializer -@C = common global [100 x [100 x i32]] zeroinitializer -@D = common global [100 x [100 x [100 x i32]]] zeroinitializer - -declare void @foo(...) - -;;--------------------------------------Test case 01------------------------------------ -;; for(int i=0;i<N;i++) -;; for(int j=1;j<N;j++) -;; A[j][i] = A[j][i]+k; - -define void @interchange_01(i32 %k, i32 %N) { -entry: - %cmp21 = icmp sgt i32 %N, 0 - br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end12 - -for.cond1.preheader.lr.ph: - %cmp219 = icmp sgt i32 %N, 1 - %0 = add i32 %N, -1 - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv23 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next24, %for.inc10 ] - br i1 %cmp219, label %for.body3, label %for.inc10 - -for.body3: - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23 - %1 = load i32, i32* %arrayidx5 - %add = add nsw i32 %1, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %0 - br i1 %exitcond, label %for.inc10, label %for.body3 - -for.inc10: - %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 - %lftr.wideiv25 = trunc i64 %indvars.iv23 to i32 - %exitcond26 = icmp eq i32 %lftr.wideiv25, %0 - br i1 %exitcond26, label %for.end12, label %for.cond1.preheader - -for.end12: - ret void -} - -; CHECK-LABEL: @interchange_01 -; CHECK: entry: -; CHECK: %cmp21 = icmp sgt i32 %N, 0 -; CHECK: br i1 %cmp21, label %for.body3.preheader, label %for.end12 -; CHECK: for.cond1.preheader.lr.ph: -; CHECK: br label %for.cond1.preheader -; CHECK: 
for.cond1.preheader: -; CHECK: %indvars.iv23 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next24, %for.inc10 ] -; CHECK: br i1 %cmp219, label %for.body3.split1, label %for.end12.loopexit -; CHECK: for.body3.preheader: -; CHECK: %cmp219 = icmp sgt i32 %N, 1 -; CHECK: %0 = add i32 %N, -1 -; CHECK: br label %for.body3 -; CHECK: for.body3: -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 1, %for.body3.preheader ] -; CHECK: br label %for.cond1.preheader.lr.ph -; CHECK: for.body3.split1: -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23 -; CHECK: %1 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %1, %k -; CHECK: store i32 %add, i32* %arrayidx5 -; CHECK: br label %for.inc10.loopexit -; CHECK: for.body3.split: -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %lftr.wideiv = trunc i64 %indvars.iv to i32 -; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, %0 -; CHECK: br i1 %exitcond, label %for.end12.loopexit, label %for.body3 -; CHECK: for.inc10.loopexit: -; CHECK: br label %for.inc10 -; CHECK: for.inc10: -; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 -; CHECK: %lftr.wideiv25 = trunc i64 %indvars.iv23 to i32 -; CHECK: %exitcond26 = icmp eq i32 %lftr.wideiv25, %0 -; CHECK: br i1 %exitcond26, label %for.body3.split, label %for.cond1.preheader -; CHECK: for.end12.loopexit: -; CHECK: br label %for.end12 -; CHECK: for.end12: -; CHECK: ret void - -;;--------------------------------------Test case 02------------------------------------- - -;; for(int i=0;i<100;i++) -;; for(int j=100;j>=0;j--) -;; A[j][i] = A[j][i]+k; - -define void @interchange_02(i32 %k) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] 
- %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 - %0 = load i32, i32* %arrayidx5 - %add = add nsw i32 %0, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nsw i64 %indvars.iv, -1 - %cmp2 = icmp sgt i64 %indvars.iv, 0 - br i1 %cmp2, label %for.body3, label %for.inc10 - -for.inc10: - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond = icmp eq i64 %indvars.iv.next20, 100 - br i1 %exitcond, label %for.end11, label %for.cond1.preheader - -for.end11: - ret void -} - -; CHECK-LABEL: @interchange_02 -; CHECK: entry: -; CHECK: br label %for.body3.preheader -; CHECK: for.cond1.preheader.preheader: -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: -; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body3.split1 -; CHECK: for.body3.preheader: -; CHECK: br label %for.body3 -; CHECK: for.body3: -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.body3.split1: ; preds = %for.cond1.preheader -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %0, %k -; CHECK: store i32 %add, i32* %arrayidx5 -; CHECK: br label %for.inc10 -; CHECK: for.body3.split: -; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 -; CHECK: br i1 %cmp2, label %for.body3, label %for.end11 -; CHECK: for.inc10: -; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 -; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader -; CHECK: for.end11: -; CHECK: ret void - -;;--------------------------------------Test case 
03------------------------------------- -;; Loops should not be interchanged in this case as it is not profitable. -;; for(int i=0;i<100;i++) -;; for(int j=0;j<100;j++) -;; A[i][j] = A[i][j]+k; - -define void @interchange_03(i32 %k) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ] - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv - %0 = load i32, i32* %arrayidx5 - %add = add nsw i32 %0, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.inc10, label %for.body3 - -for.inc10: - %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 - %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 - br i1 %exitcond23, label %for.end12, label %for.cond1.preheader - -for.end12: - ret void -} - -; CHECK-LABEL: @interchange_03 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.cond1.preheader.preheader: ; preds = %entry -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10 -; CHECK: %indvars.iv21 = phi i64 [ %indvars.iv.next22, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body3.preheader -; CHECK: for.body3.preheader: ; preds = %for.cond1.preheader -; CHECK: br label %for.body3 -; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3 -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %0, %k -; CHECK: store i32 
%add, i32* %arrayidx5 -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 100 -; CHECK: br i1 %exitcond, label %for.inc10, label %for.body3 -; CHECK: for.inc10: ; preds = %for.body3 -; CHECK: %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 -; CHECK: %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 -; CHECK: br i1 %exitcond23, label %for.end12, label %for.cond1.preheader -; CHECK: for.end12: ; preds = %for.inc10 -; CHECK: ret void - - -;;--------------------------------------Test case 04------------------------------------- -;; Loops should not be interchanged in this case as it is not legal due to dependency. -;; for(int j=0;j<99;j++) -;; for(int i=0;i<99;i++) -;; A[j][i+1] = A[j+1][i]+k; - -define void @interchange_04(i32 %k){ -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] - %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv - %0 = load i32, i32* %arrayidx5 - %add6 = add nsw i32 %0, %k - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next - store i32 %add6, i32* %arrayidx11 - %exitcond = icmp eq i64 %indvars.iv.next, 99 - br i1 %exitcond, label %for.inc12, label %for.body3 - -for.inc12: - %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 - br i1 %exitcond25, label %for.end14, label %for.cond1.preheader - -for.end14: - ret void -} - -; CHECK-LABEL: @interchange_04 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.inc12, %entry -; CHECK: %indvars.iv23 = phi i64 [ 0, %entry ], [ 
%indvars.iv.next24, %for.inc12 ] -; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 -; CHECK: br label %for.body3 -; CHECK: for.body3: ; preds = %for.body3, %for.cond1.preheader -; CHECK: %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add6 = add nsw i32 %0, %k -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next -; CHECK: store i32 %add6, i32* %arrayidx11 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 99 -; CHECK: br i1 %exitcond, label %for.inc12, label %for.body3 -; CHECK: for.inc12: ; preds = %for.body3 -; CHECK: %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 -; CHECK: br i1 %exitcond25, label %for.end14, label %for.cond1.preheader -; CHECK: for.end14: ; preds = %for.inc12 -; CHECK: ret void - - - -;;--------------------------------------Test case 05------------------------------------- -;; Loops not tightly nested are not interchanged -;; for(int j=0;j<N;j++) { -;; B[j] = j+k; -;; for(int i=0;i<N;i++) -;; A[j][i] = A[j][i]+B[j]; -;; } - -define void @interchange_05(i32 %k, i32 %N){ -entry: - %cmp30 = icmp sgt i32 %N, 0 - br i1 %cmp30, label %for.body.lr.ph, label %for.end17 - -for.body.lr.ph: - %0 = add i32 %N, -1 - %1 = zext i32 %k to i64 - br label %for.body - -for.body: - %indvars.iv32 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next33, %for.inc15 ] - %2 = add nsw i64 %indvars.iv32, %1 - %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @B, i64 0, i64 %indvars.iv32 - %3 = trunc i64 %2 to i32 - store i32 %3, i32* %arrayidx - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ] - %arrayidx7 = 
getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv32, i64 %indvars.iv - %4 = load i32, i32* %arrayidx7 - %add10 = add nsw i32 %3, %4 - store i32 %add10, i32* %arrayidx7 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %0 - br i1 %exitcond, label %for.inc15, label %for.body3 - -for.inc15: - %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1 - %lftr.wideiv35 = trunc i64 %indvars.iv32 to i32 - %exitcond36 = icmp eq i32 %lftr.wideiv35, %0 - br i1 %exitcond36, label %for.end17, label %for.body - -for.end17: - ret void -} - -; CHECK-LABEL: @interchange_05 -; CHECK: entry: -; CHECK: %cmp30 = icmp sgt i32 %N, 0 -; CHECK: br i1 %cmp30, label %for.body.lr.ph, label %for.end17 -; CHECK: for.body.lr.ph: -; CHECK: %0 = add i32 %N, -1 -; CHECK: %1 = zext i32 %k to i64 -; CHECK: br label %for.body -; CHECK: for.body: -; CHECK: %indvars.iv32 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next33, %for.inc15 ] -; CHECK: %2 = add nsw i64 %indvars.iv32, %1 -; CHECK: %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @B, i64 0, i64 %indvars.iv32 -; CHECK: %3 = trunc i64 %2 to i32 -; CHECK: store i32 %3, i32* %arrayidx -; CHECK: br label %for.body3.preheader -; CHECK: for.body3.preheader: -; CHECK: br label %for.body3 -; CHECK: for.body3: -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] -; CHECK: %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv32, i64 %indvars.iv -; CHECK: %4 = load i32, i32* %arrayidx7 -; CHECK: %add10 = add nsw i32 %3, %4 -; CHECK: store i32 %add10, i32* %arrayidx7 -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %lftr.wideiv = trunc i64 %indvars.iv to i32 -; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, %0 -; CHECK: br i1 %exitcond, label %for.inc15, label %for.body3 -; CHECK: for.inc15: -; CHECK: 
%indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1 -; CHECK: %lftr.wideiv35 = trunc i64 %indvars.iv32 to i32 -; CHECK: %exitcond36 = icmp eq i32 %lftr.wideiv35, %0 -; CHECK: br i1 %exitcond36, label %for.end17.loopexit, label %for.body -; CHECK: for.end17.loopexit: -; CHECK: br label %for.end17 -; CHECK: for.end17: -; CHECK: ret void - - -;;--------------------------------------Test case 06------------------------------------- -;; Loops not tightly nested are not interchanged -;; for(int j=0;j<N;j++) { -;; foo(); -;; for(int i=2;i<N;i++) -;; A[j][i] = A[j][i]+k; -;; } - -define void @interchange_06(i32 %k, i32 %N) { -entry: - %cmp22 = icmp sgt i32 %N, 0 - br i1 %cmp22, label %for.body.lr.ph, label %for.end12 - -for.body.lr.ph: - %0 = add i32 %N, -1 - br label %for.body - -for.body: - %indvars.iv24 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next25, %for.inc10 ] - tail call void (...) @foo() - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 2, %for.body ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv24, i64 %indvars.iv - %1 = load i32, i32* %arrayidx5 - %add = add nsw i32 %1, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %0 - br i1 %exitcond, label %for.inc10, label %for.body3 - -for.inc10: - %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1 - %lftr.wideiv26 = trunc i64 %indvars.iv24 to i32 - %exitcond27 = icmp eq i32 %lftr.wideiv26, %0 - br i1 %exitcond27, label %for.end12, label %for.body - -for.end12: - ret void -} -;; Here we are checking if the inner phi is not split then we have not interchanged. 
-; CHECK-LABEL: @interchange_06 -; CHECK: phi i64 [ %indvars.iv.next, %for.body3 ], [ 2, %for.body3.preheader ] -; CHECK-NEXT: getelementptr -; CHECK-NEXT: %1 = load - -;;--------------------------------------Test case 07------------------------------------- -;; FIXME: -;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported. -;; for(gi=1;gi<N;gi++) -;; for(gj=1;gj<M;gj++) -;; A[gj][gi] = A[gj - 1][gi] + C[gj][gi]; - -@gi = common global i32 0 -@gj = common global i32 0 - -define void @interchange_07(i32 %N, i32 %M){ -entry: - store i32 1, i32* @gi - %cmp21 = icmp sgt i32 %N, 1 - br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end16 - -for.cond1.preheader.lr.ph: - %cmp218 = icmp sgt i32 %M, 1 - %gi.promoted = load i32, i32* @gi - %0 = add i32 %M, -1 - %1 = sext i32 %gi.promoted to i64 - %2 = sext i32 %N to i64 - %3 = add i32 %gi.promoted, 1 - %4 = icmp slt i32 %3, %N - %smax = select i1 %4, i32 %N, i32 %3 - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv25 = phi i64 [ %1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next26, %for.inc14 ] - br i1 %cmp218, label %for.body3, label %for.inc14 - -for.body3: - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ] - %5 = add nsw i64 %indvars.iv, -1 - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25 - %6 = load i32, i32* %arrayidx5 - %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25 - %7 = load i32, i32* %arrayidx9 - %add = add nsw i32 %7, %6 - %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv25 - store i32 %add, i32* %arrayidx13 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %0 - br i1 %exitcond, label 
%for.inc14, label %for.body3 - -for.inc14: - %inc.lcssa23 = phi i32 [ 1, %for.cond1.preheader ], [ %M, %for.body3 ] - %indvars.iv.next26 = add nsw i64 %indvars.iv25, 1 - %cmp = icmp slt i64 %indvars.iv.next26, %2 - br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end16_crit_edge - -for.cond.for.end16_crit_edge: - store i32 %inc.lcssa23, i32* @gj - store i32 %smax, i32* @gi - br label %for.end16 - -for.end16: - ret void -} - -; CHECK-LABEL: @interchange_07 -; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3 -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.body3.preheader ] -; CHECK: %5 = add nsw i64 %indvars.iv, -1 -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25 -; CHECK: %6 = load i32, i32* %arrayidx5 -; CHECK: %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25 - -;;------------------------------------------------Test case 08------------------------------- -;; Test for interchange in loop nest greater than 2. 
-;; for(int i=0;i<100;i++) -;; for(int j=0;j<100;j++) -;; for(int k=0;k<100;k++) -;; D[i][k][j] = D[i][k][j]+t; - -define void @interchange_08(i32 %t){ -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.inc15, %entry - %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ] - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader - %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] - br label %for.body6 - -for.body6: ; preds = %for.body6, %for.cond4.preheader - %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] - %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027 - %0 = load i32, i32* %arrayidx8 - %add = add nsw i32 %0, %t - store i32 %add, i32* %arrayidx8 - %inc = add nuw nsw i32 %k.026, 1 - %exitcond = icmp eq i32 %inc, 100 - br i1 %exitcond, label %for.inc12, label %for.body6 - -for.inc12: ; preds = %for.body6 - %inc13 = add nuw nsw i32 %j.027, 1 - %exitcond29 = icmp eq i32 %inc13, 100 - br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader - -for.inc15: ; preds = %for.inc12 - %inc16 = add nuw nsw i32 %i.028, 1 - %exitcond30 = icmp eq i32 %inc16, 100 - br i1 %exitcond30, label %for.end17, label %for.cond1.preheader - -for.end17: ; preds = %for.inc15 - ret void -} -; CHECK-LABEL: @interchange_08 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.cond1.preheader.preheader: ; preds = %entry -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc15 -; CHECK: %i.028 = phi i32 [ %inc16, %for.inc15 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body6.preheader -; CHECK: for.cond4.preheader.preheader: ; preds = %for.body6 -; CHECK: br label %for.cond4.preheader -; CHECK: for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc12 -; CHECK: %j.027 
= phi i32 [ %inc13, %for.inc12 ], [ 0, %for.cond4.preheader.preheader ] -; CHECK: br label %for.body6.split1 -; CHECK: for.body6.preheader: ; preds = %for.cond1.preheader -; CHECK: br label %for.body6 -; CHECK: for.body6: ; preds = %for.body6.preheader, %for.body6.split -; CHECK: %k.026 = phi i32 [ %inc, %for.body6.split ], [ 0, %for.body6.preheader ] -; CHECK: br label %for.cond4.preheader.preheader -; CHECK: for.body6.split1: ; preds = %for.cond4.preheader -; CHECK: %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027 -; CHECK: %0 = load i32, i32* %arrayidx8 -; CHECK: %add = add nsw i32 %0, %t -; CHECK: store i32 %add, i32* %arrayidx8 -; CHECK: br label %for.inc12 -; CHECK: for.body6.split: ; preds = %for.inc12 -; CHECK: %inc = add nuw nsw i32 %k.026, 1 -; CHECK: %exitcond = icmp eq i32 %inc, 100 -; CHECK: br i1 %exitcond, label %for.inc15, label %for.body6 -; CHECK: for.inc12: ; preds = %for.body6.split1 -; CHECK: %inc13 = add nuw nsw i32 %j.027, 1 -; CHECK: %exitcond29 = icmp eq i32 %inc13, 100 -; CHECK: br i1 %exitcond29, label %for.body6.split, label %for.cond4.preheader -; CHECK: for.inc15: ; preds = %for.body6.split -; CHECK: %inc16 = add nuw nsw i32 %i.028, 1 -; CHECK: %exitcond30 = icmp eq i32 %inc16, 100 -; CHECK: br i1 %exitcond30, label %for.end17, label %for.cond1.preheader -; CHECK: for.end17: ; preds = %for.inc15 -; CHECK: ret void - -;;-----------------------------------Test case 09------------------------------- -;; Test that a flow dependency in outer loop doesn't prevent interchange in -;; loops i and j. 
-;; -;; for (int k = 0; k < 100; ++k) { -;; T[k] = fn1(); -;; for (int i = 0; i < 1000; ++i) -;; for(int j = 1; j < 1000; ++j) -;; Arr[j][i] = Arr[j][i]+k; -;; fn2(T[k]); -;; } - -@T = internal global [100 x double] zeroinitializer, align 4 -@Arr = internal global [1000 x [1000 x i32]] zeroinitializer, align 4 - -define void @interchange_09(i32 %k) { -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.cond.cleanup4 - ret void - -for.body: ; preds = %for.cond.cleanup4, %entry - %indvars.iv45 = phi i64 [ 0, %entry ], [ %indvars.iv.next46, %for.cond.cleanup4 ] - %call = call double @fn1() - %arrayidx = getelementptr inbounds [100 x double], [100 x double]* @T, i64 0, i64 %indvars.iv45 - store double %call, double* %arrayidx, align 8 - br label %for.cond6.preheader - -for.cond6.preheader: ; preds = %for.cond.cleanup8, %for.body - %indvars.iv42 = phi i64 [ 0, %for.body ], [ %indvars.iv.next43, %for.cond.cleanup8 ] - br label %for.body9 - -for.cond.cleanup4: ; preds = %for.cond.cleanup8 - %tmp = load double, double* %arrayidx, align 8 - call void @fn2(double %tmp) - %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1 - %exitcond47 = icmp ne i64 %indvars.iv.next46, 100 - br i1 %exitcond47, label %for.body, label %for.cond.cleanup - -for.cond.cleanup8: ; preds = %for.body9 - %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 - %exitcond44 = icmp ne i64 %indvars.iv.next43, 1000 - br i1 %exitcond44, label %for.cond6.preheader, label %for.cond.cleanup4 - -for.body9: ; preds = %for.body9, %for.cond6.preheader - %indvars.iv = phi i64 [ 1, %for.cond6.preheader ], [ %indvars.iv.next, %for.body9 ] - %arrayidx13 = getelementptr inbounds [1000 x [1000 x i32]], [1000 x [1000 x i32]]* @Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv42 - %tmp1 = load i32, i32* %arrayidx13, align 4 - %tmp2 = trunc i64 %indvars.iv45 to i32 - %add = add nsw i32 %tmp1, %tmp2 - store i32 %add, i32* %arrayidx13, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne 
i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %for.body9, label %for.cond.cleanup8 -} - -declare double @fn1() -declare void @fn2(double) - - - - - -;; After interchange %indvars.iv (j) should increment as the middle loop. -;; After interchange %indvars.iv42 (i) should increment with the inner most loop. - -; CHECK-LABEL: @interchange_09 - -; CHECK: for.body: -; CHECK: %indvars.iv45 = phi i64 [ %indvars.iv.next46, %for.cond.cleanup4 ], [ 0, %for.body.preheader ] -; CHECK: %call = call double @fn1() -; CHECK: %arrayidx = getelementptr inbounds [100 x double], [100 x double]* @T, i64 0, i64 %indvars.iv45 -; CHECK: store double %call, double* %arrayidx, align 8 -; CHECK: br label %for.body9.preheader - -; CHECK: for.cond6.preheader.preheader: -; CHECK: br label %for.cond6.preheader - -; CHECK: for.cond6.preheader: -; CHECK: %indvars.iv42 = phi i64 [ %indvars.iv.next43, %for.cond.cleanup8 ], [ 0, %for.cond6.preheader.preheader ] -; CHECK: br label %for.body9.split1 - -; CHECK: for.body9.preheader: -; CHECK: br label %for.body9 - -; CHECK: for.cond.cleanup4: -; CHECK: %tmp = load double, double* %arrayidx, align 8 -; CHECK: call void @fn2(double %tmp) -; CHECK: %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1 -; CHECK: %exitcond47 = icmp ne i64 %indvars.iv.next46, 100 -; CHECK: br i1 %exitcond47, label %for.body, label %for.cond.cleanup - -; CHECK: for.cond.cleanup8: -; CHECK: %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 -; CHECK: %exitcond44 = icmp ne i64 %indvars.iv.next43, 1000 -; CHECK: br i1 %exitcond44, label %for.cond6.preheader, label %for.body9.split - -; CHECK: for.body9: -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body9.split ], [ 1, %for.body9.preheader ] -; CHECK: br label %for.cond6.preheader.preheader - -; CHECK: for.body9.split1: -; CHECK: %arrayidx13 = getelementptr inbounds [1000 x [1000 x i32]], [1000 x [1000 x i32]]* @Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv42 -; CHECK: store i32 %add, i32* %arrayidx13, align 4 -; 
CHECK: br label %for.cond.cleanup8 - -; CHECK: for.body9.split: -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %exitcond = icmp ne i64 %indvars.iv.next, 1000 -; CHECK: br i1 %exitcond, label %for.body9, label %for.cond.cleanup4 - - -;;-----------------------------------Test case 10------------------------------- -;; Test to make sure we can handle output dependencies. -;; -;; for (int i = 0; i < 2; ++i) -;; for(int j = 0; j < 3; ++j) { -;; A[j][i] = i; -;; A[j][i+1] = j; -;; } - -@A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16 - -define void @interchange_10() { -entry: - br label %for.cond1.preheader - -for.cond.loopexit: ; preds = %for.body4 - %exitcond28 = icmp ne i64 %indvars.iv.next27, 2 - br i1 %exitcond28, label %for.cond1.preheader, label %for.cond.cleanup - -for.cond1.preheader: ; preds = %for.cond.loopexit, %entry - %indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ] - %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1 - br label %for.body4 - -for.cond.cleanup: ; preds = %for.cond.loopexit - ret void - -for.body4: ; preds = %for.body4, %for.cond1.preheader - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ] - %arrayidx6 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv26 - %tmp = trunc i64 %indvars.iv26 to i32 - store i32 %tmp, i32* %arrayidx6, align 4 - %arrayidx10 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv.next27 - %tmp1 = trunc i64 %indvars.iv to i32 - store i32 %tmp1, i32* %arrayidx10, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 3 - br i1 %exitcond, label %for.body4, label %for.cond.loopexit -} - -; CHECK-LABEL: @interchange_10 -; CHECK: entry: -; CHECK: br label %for.body4.preheader - -; CHECK: for.cond1.preheader.preheader: -; CHECK: br label 
%for.cond1.preheader - -; CHECK: for.cond.loopexit: -; CHECK: %exitcond28 = icmp ne i64 %indvars.iv.next27, 2 -; CHECK: br i1 %exitcond28, label %for.cond1.preheader, label %for.body4.split - -; CHECK: for.cond1.preheader: -; CHECK: %indvars.iv26 = phi i64 [ %indvars.iv.next27, %for.cond.loopexit ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1 -; CHECK: br label %for.body4.split1 - -; CHECK: for.body4.preheader: -; CHECK: br label %for.body4 - -; CHECK: for.cond.cleanup: -; CHECK: ret void - -; CHECK: for.body4: -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.split ], [ 0, %for.body4.preheader ] -; CHECK: br label %for.cond1.preheader.preheader - -; CHECK: for.body4.split1: -; CHECK: %arrayidx6 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv26 -; CHECK: %tmp = trunc i64 %indvars.iv26 to i32 -; CHECK: store i32 %tmp, i32* %arrayidx6, align 4 -; CHECK: %arrayidx10 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv.next27 -; CHECK: %tmp1 = trunc i64 %indvars.iv to i32 -; CHECK: store i32 %tmp1, i32* %arrayidx10, align 4 -; CHECK: br label %for.cond.loopexit - -; CHECK: for.body4.split: -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %exitcond = icmp ne i64 %indvars.iv.next, 3 -; CHECK: br i1 %exitcond, label %for.body4, label %for.cond.cleanup diff --git a/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll b/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll new file mode 100644 index 0000000000000..e14598cfdd609 --- /dev/null +++ b/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll @@ -0,0 +1,220 @@ +; Test optimization remarks generated by the LoopInterchange pass. 
+; +; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-output=%t -pass-remarks-missed='loop-interchange' \ +; RUN: -pass-remarks='loop-interchange' -S +; RUN: cat %t | FileCheck %s + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x [100 x i32]] zeroinitializer +@C = common global [100 x i32] zeroinitializer + +;;---------------------------------------Test case 01--------------------------------- +;; Loops interchange is not profitable. +;; for(int i=1;i<N;i++) +;; for(int j=1;j<N;j++) +;; A[i-1][j-1] = A[i - 1][j-1] + B[i][j]; + +define void @test01(i32 %N){ +entry: + %cmp31 = icmp sgt i32 %N, 1 + br i1 %cmp31, label %for.cond1.preheader.lr.ph, label %for.end19 + +for.cond1.preheader.lr.ph: + %0 = add i32 %N, -1 + br label %for.body3.lr.ph + +for.body3.lr.ph: + %indvars.iv34 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next35, %for.inc17 ] + %1 = add nsw i64 %indvars.iv34, -1 + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ] + %2 = add nsw i64 %indvars.iv, -1 + %arrayidx6 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %1, i64 %2 + %3 = load i32, i32* %arrayidx6 + %arrayidx10 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv34, i64 %indvars.iv + %4 = load i32, i32* %arrayidx10 + %add = add nsw i32 %4, %3 + store i32 %add, i32* %arrayidx6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %0 + br i1 %exitcond, label %for.inc17, label %for.body3 + +for.inc17: + %indvars.iv.next35 = add nuw nsw i64 %indvars.iv34, 1 + %lftr.wideiv37 = trunc i64 %indvars.iv34 to i32 + %exitcond38 = icmp eq i32 %lftr.wideiv37, %0 + br i1 %exitcond38, label %for.end19, label %for.body3.lr.ph + +for.end19: + ret void +} + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: 
InterchangeNotProfitable +; CHECK-NEXT: Function: test01 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: 'Interchanging loops is too costly (cost=' +; CHECK-NEXT: - Cost: '2' +; CHECK-NEXT: - String: ', threshold=' +; CHECK-NEXT: - Threshold: '0' +; CHECK-NEXT: - String: ') and it does not improve parallelism.' +; CHECK-NEXT: ... + +;;--------------------------------------Test case 02------------------------------------ +;; [FIXME] This loop though valid is currently not interchanged due to the +;; limitation that we cannot split the inner loop latch due to multiple use of inner induction +;; variable.(used to increment the loop counter and to access A[j+1][i+1] +;; for(int i=0;i<N-1;i++) +;; for(int j=1;j<N-1;j++) +;; A[j+1][i+1] = A[j+1][i+1] + k; + +define void @test02(i32 %k, i32 %N) { + entry: + %sub = add nsw i32 %N, -1 + %cmp26 = icmp sgt i32 %N, 1 + br i1 %cmp26, label %for.cond1.preheader.lr.ph, label %for.end17 + + for.cond1.preheader.lr.ph: + %cmp324 = icmp sgt i32 %sub, 1 + %0 = add i32 %N, -2 + %1 = sext i32 %sub to i64 + br label %for.cond1.preheader + + for.cond.loopexit: + %cmp = icmp slt i64 %indvars.iv.next29, %1 + br i1 %cmp, label %for.cond1.preheader, label %for.end17 + + for.cond1.preheader: + %indvars.iv28 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next29, %for.cond.loopexit ] + %indvars.iv.next29 = add nuw nsw i64 %indvars.iv28, 1 + br i1 %cmp324, label %for.body4, label %for.cond.loopexit + + for.body4: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 1, %for.cond1.preheader ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next, i64 %indvars.iv.next29 + %2 = load i32, i32* %arrayidx7 + %add8 = add nsw i32 %2, %k + store i32 %add8, i32* %arrayidx7 + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %0 + br i1 %exitcond, label %for.cond.loopexit, label %for.body4 + + 
for.end17: + ret void +} + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: UnsupportedInsBetweenInduction +; CHECK-NEXT: Function: test02 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Found unsupported instruction between induction variable increment and branch. +; CHECK-NEXT: ... + +;;-----------------------------------Test case 03------------------------------- +;; Test to make sure we can handle output dependencies. +;; +;; for (int i = 0; i < 2; ++i) +;; for(int j = 0; j < 3; ++j) { +;; A[j][i] = i; +;; A[j][i+1] = j; +;; } + +@A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16 + +define void @test03() { +entry: + br label %for.cond1.preheader + +for.cond.loopexit: ; preds = %for.body4 + %exitcond28 = icmp ne i64 %indvars.iv.next27, 2 + br i1 %exitcond28, label %for.cond1.preheader, label %for.cond.cleanup + +for.cond1.preheader: ; preds = %for.cond.loopexit, %entry + %indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ] + %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.loopexit + ret void + +for.body4: ; preds = %for.body4, %for.cond1.preheader + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ] + %arrayidx6 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv26 + %tmp = trunc i64 %indvars.iv26 to i32 + store i32 %tmp, i32* %arrayidx6, align 4 + %arrayidx10 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv.next27 + %tmp1 = trunc i64 %indvars.iv to i32 + store i32 %tmp1, i32* %arrayidx10, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, 3 + br i1 %exitcond, label %for.body4, label %for.cond.loopexit +} + +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: 
Function: test03 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... + +;;--------------------------------------Test case 04------------------------------------- +;; Loops not tightly nested are not interchanged +;; for(int j=0;j<N;j++) { +;; B[j] = j+k; +;; for(int i=0;i<N;i++) +;; A[j][i] = A[j][i]+B[j]; +;; } + +define void @test04(i32 %k, i32 %N){ +entry: + %cmp30 = icmp sgt i32 %N, 0 + br i1 %cmp30, label %for.body.lr.ph, label %for.end17 + +for.body.lr.ph: + %0 = add i32 %N, -1 + %1 = zext i32 %k to i64 + br label %for.body + +for.body: + %indvars.iv32 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next33, %for.inc15 ] + %2 = add nsw i64 %indvars.iv32, %1 + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @C, i64 0, i64 %indvars.iv32 + %3 = trunc i64 %2 to i32 + store i32 %3, i32* %arrayidx + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ] + %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv32, i64 %indvars.iv + %4 = load i32, i32* %arrayidx7 + %add10 = add nsw i32 %3, %4 + store i32 %add10, i32* %arrayidx7 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %0 + br i1 %exitcond, label %for.inc15, label %for.body3 + +for.inc15: + %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv32 to i32 + %exitcond36 = icmp eq i32 %lftr.wideiv35, %0 + br i1 %exitcond36, label %for.end17, label %for.body + +for.end17: + ret void +} + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: NotTightlyNested +; CHECK-NEXT: Function: test04 +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Cannot interchange loops because they are not tightly nested. +; CHECK-NEXT: ... 
diff --git a/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll b/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll new file mode 100644 index 0000000000000..cf4f83baea82b --- /dev/null +++ b/test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll @@ -0,0 +1,64 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer + +;; Loops should not be interchanged in this case as it is not legal due to dependency. +;; for(int j=0;j<99;j++) +;; for(int i=0;i<99;i++) +;; A[j][i+1] = A[j+1][i]+k; + +define void @interchange_04(i32 %k){ +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] + %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv + %0 = load i32, i32* %arrayidx5 + %add6 = add nsw i32 %0, %k + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next + store i32 %add6, i32* %arrayidx11 + %exitcond = icmp eq i64 %indvars.iv.next, 99 + br i1 %exitcond, label %for.inc12, label %for.body3 + +for.inc12: + %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 + br i1 %exitcond25, label %for.end14, label %for.cond1.preheader + +for.end14: + ret void +} + +; CHECK-LABEL: @interchange_04 +; CHECK: entry: +; CHECK: br label %for.cond1.preheader +; CHECK: 
for.cond1.preheader: ; preds = %for.inc12, %entry +; CHECK: %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] +; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 +; CHECK: br label %for.body3 +; CHECK: for.body3: ; preds = %for.body3, %for.cond1.preheader +; CHECK: %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: %add6 = add nsw i32 %0, %k +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next +; CHECK: store i32 %add6, i32* %arrayidx11 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 99 +; CHECK: br i1 %exitcond, label %for.inc12, label %for.body3 +; CHECK: for.inc12: ; preds = %for.body3 +; CHECK: %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 +; CHECK: br i1 %exitcond25, label %for.end14, label %for.cond1.preheader +; CHECK: for.end14: ; preds = %for.inc12 +; CHECK: ret void diff --git a/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll b/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll new file mode 100644 index 0000000000000..1d4d22883a4f8 --- /dev/null +++ b/test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll @@ -0,0 +1,87 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@D = common global [100 x [100 x [100 x i32]]] zeroinitializer + +;; Test for interchange in loop nest greater than 2. 
+;; for(int i=0;i<100;i++) +;; for(int j=0;j<100;j++) +;; for(int k=0;k<100;k++) +;; D[i][k][j] = D[i][k][j]+t; + +define void @interchange_08(i32 %t){ +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc15, %entry + %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader + %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ] + br label %for.body6 + +for.body6: ; preds = %for.body6, %for.cond4.preheader + %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] + %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027 + %0 = load i32, i32* %arrayidx8 + %add = add nsw i32 %0, %t + store i32 %add, i32* %arrayidx8 + %inc = add nuw nsw i32 %k.026, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.inc12, label %for.body6 + +for.inc12: ; preds = %for.body6 + %inc13 = add nuw nsw i32 %j.027, 1 + %exitcond29 = icmp eq i32 %inc13, 100 + br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader + +for.inc15: ; preds = %for.inc12 + %inc16 = add nuw nsw i32 %i.028, 1 + %exitcond30 = icmp eq i32 %inc16, 100 + br i1 %exitcond30, label %for.end17, label %for.cond1.preheader + +for.end17: ; preds = %for.inc15 + ret void +} +; CHECK-LABEL: @interchange_08 +; CHECK: entry: +; CHECK: br label %for.cond1.preheader.preheader +; CHECK: for.cond1.preheader.preheader: ; preds = %entry +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc15 +; CHECK: %i.028 = phi i32 [ %inc16, %for.inc15 ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: br label %for.body6.preheader +; CHECK: for.cond4.preheader.preheader: ; preds = %for.body6 +; CHECK: br label %for.cond4.preheader +; CHECK: for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc12 +; CHECK: %j.027 
= phi i32 [ %inc13, %for.inc12 ], [ 0, %for.cond4.preheader.preheader ] +; CHECK: br label %for.body6.split1 +; CHECK: for.body6.preheader: ; preds = %for.cond1.preheader +; CHECK: br label %for.body6 +; CHECK: for.body6: ; preds = %for.body6.preheader, %for.body6.split +; CHECK: %k.026 = phi i32 [ %inc, %for.body6.split ], [ 0, %for.body6.preheader ] +; CHECK: br label %for.cond4.preheader.preheader +; CHECK: for.body6.split1: ; preds = %for.cond4.preheader +; CHECK: %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027 +; CHECK: %0 = load i32, i32* %arrayidx8 +; CHECK: %add = add nsw i32 %0, %t +; CHECK: store i32 %add, i32* %arrayidx8 +; CHECK: br label %for.inc12 +; CHECK: for.body6.split: ; preds = %for.inc12 +; CHECK: %inc = add nuw nsw i32 %k.026, 1 +; CHECK: %exitcond = icmp eq i32 %inc, 100 +; CHECK: br i1 %exitcond, label %for.inc15, label %for.body6 +; CHECK: for.inc12: ; preds = %for.body6.split1 +; CHECK: %inc13 = add nuw nsw i32 %j.027, 1 +; CHECK: %exitcond29 = icmp eq i32 %inc13, 100 +; CHECK: br i1 %exitcond29, label %for.body6.split, label %for.cond4.preheader +; CHECK: for.inc15: ; preds = %for.body6.split +; CHECK: %inc16 = add nuw nsw i32 %i.028, 1 +; CHECK: %exitcond30 = icmp eq i32 %inc16, 100 +; CHECK: br i1 %exitcond30, label %for.end17, label %for.cond1.preheader +; CHECK: for.end17: ; preds = %for.inc15 +; CHECK: ret void diff --git a/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll b/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll new file mode 100644 index 0000000000000..0cf91b09e65db --- /dev/null +++ b/test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll @@ -0,0 +1,143 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer +@C = common global [100 x [100 x i32]] zeroinitializer +@D = common global [100 x [100 x [100 x i32]]] zeroinitializer + +;; Loops not tightly nested are not interchanged +;; for(int j=0;j<N;j++) { +;; B[j] = j+k; +;; for(int i=0;i<N;i++) +;; A[j][i] = A[j][i]+B[j]; +;; } + +define void @interchange_05(i32 %k, i32 %N){ +entry: + %cmp30 = icmp sgt i32 %N, 0 + br i1 %cmp30, label %for.body.lr.ph, label %for.end17 + +for.body.lr.ph: + %0 = add i32 %N, -1 + %1 = zext i32 %k to i64 + br label %for.body + +for.body: + %indvars.iv32 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next33, %for.inc15 ] + %2 = add nsw i64 %indvars.iv32, %1 + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @B, i64 0, i64 %indvars.iv32 + %3 = trunc i64 %2 to i32 + store i32 %3, i32* %arrayidx + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ] + %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv32, i64 %indvars.iv + %4 = load i32, i32* %arrayidx7 + %add10 = add nsw i32 %3, %4 + store i32 %add10, i32* %arrayidx7 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %0 + br i1 %exitcond, label %for.inc15, label %for.body3 + +for.inc15: + %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv32 to i32 + %exitcond36 = icmp eq i32 %lftr.wideiv35, %0 + br i1 %exitcond36, label %for.end17, label %for.body + +for.end17: + ret void +} + +; CHECK-LABEL: @interchange_05 +; CHECK: entry: +; CHECK: %cmp30 = icmp sgt i32 %N, 0 +; CHECK: br i1 %cmp30, label %for.body.lr.ph, label %for.end17 +; CHECK: for.body.lr.ph: +; CHECK: %0 = add i32 %N, -1 +; 
CHECK: %1 = zext i32 %k to i64 +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: %indvars.iv32 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next33, %for.inc15 ] +; CHECK: %2 = add nsw i64 %indvars.iv32, %1 +; CHECK: %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @B, i64 0, i64 %indvars.iv32 +; CHECK: %3 = trunc i64 %2 to i32 +; CHECK: store i32 %3, i32* %arrayidx +; CHECK: br label %for.body3.preheader +; CHECK: for.body3.preheader: +; CHECK: br label %for.body3 +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] +; CHECK: %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv32, i64 %indvars.iv +; CHECK: %4 = load i32, i32* %arrayidx7 +; CHECK: %add10 = add nsw i32 %3, %4 +; CHECK: store i32 %add10, i32* %arrayidx7 +; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK: %lftr.wideiv = trunc i64 %indvars.iv to i32 +; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, %0 +; CHECK: br i1 %exitcond, label %for.inc15, label %for.body3 +; CHECK: for.inc15: +; CHECK: %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1 +; CHECK: %lftr.wideiv35 = trunc i64 %indvars.iv32 to i32 +; CHECK: %exitcond36 = icmp eq i32 %lftr.wideiv35, %0 +; CHECK: br i1 %exitcond36, label %for.end17.loopexit, label %for.body +; CHECK: for.end17.loopexit: +; CHECK: br label %for.end17 +; CHECK: for.end17: +; CHECK: ret void + + +declare void @foo(...) + +;; Loops not tightly nested are not interchanged +;; for(int j=0;j<N;j++) { +;; foo(); +;; for(int i=2;i<N;i++) +;; A[j][i] = A[j][i]+k; +;; } + +define void @interchange_06(i32 %k, i32 %N) { +entry: + %cmp22 = icmp sgt i32 %N, 0 + br i1 %cmp22, label %for.body.lr.ph, label %for.end12 + +for.body.lr.ph: + %0 = add i32 %N, -1 + br label %for.body + +for.body: + %indvars.iv24 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next25, %for.inc10 ] + tail call void (...) 
@foo() + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 2, %for.body ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv24, i64 %indvars.iv + %1 = load i32, i32* %arrayidx5 + %add = add nsw i32 %1, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %0 + br i1 %exitcond, label %for.inc10, label %for.body3 + +for.inc10: + %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1 + %lftr.wideiv26 = trunc i64 %indvars.iv24 to i32 + %exitcond27 = icmp eq i32 %lftr.wideiv26, %0 + br i1 %exitcond27, label %for.end12, label %for.body + +for.end12: + ret void +} +;; Here we are checking if the inner phi is not split then we have not interchanged. +; CHECK-LABEL: @interchange_06 +; CHECK: phi i64 [ %indvars.iv.next, %for.body3 ], [ 2, %for.body3.preheader ] +; CHECK-NEXT: getelementptr +; CHECK-NEXT: %1 = load diff --git a/test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll b/test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll new file mode 100644 index 0000000000000..6014775028ee5 --- /dev/null +++ b/test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll @@ -0,0 +1,126 @@ +; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false -unroll-runtime-multi-exit=true -unroll-count=4 -verify-dom-info -S | FileCheck %s + +; REQUIRES: asserts +; The tests below are for verifying dom tree after runtime unrolling +; with multiple exit/exiting blocks. + +; We explicitly set the unroll count so that expensiveTripCount computation is allowed. + +; mergedexit block has edges from loop exit blocks. 
+define i64 @test1() { +; CHECK-LABEL: test1( +; CHECK-LABEL: headerexit: +; CHECK-NEXT: %addphi = phi i64 [ %add.iv, %header ], [ %add.iv.1, %header.1 ], [ %add.iv.2, %header.2 ], [ %add.iv.3, %header.3 ] +; CHECK-NEXT: br label %mergedexit +; CHECK-LABEL: latchexit: +; CHECK-NEXT: %shftphi = phi i64 [ %shft, %latch ], [ %shft.1, %latch.1 ], [ %shft.2, %latch.2 ], [ %shft.3, %latch.3 ] +; CHECK-NEXT: br label %mergedexit +; CHECK-LABEL: mergedexit: +; CHECK-NEXT: %retval = phi i64 [ %addphi, %headerexit ], [ %shftphi, %latchexit ] +; CHECK-NEXT: ret i64 %retval +entry: + br label %preheader + +preheader: ; preds = %bb + %trip = zext i32 undef to i64 + br label %header + +header: ; preds = %latch, %preheader + %iv = phi i64 [ 2, %preheader ], [ %add.iv, %latch ] + %add.iv = add nuw nsw i64 %iv, 2 + %cmp1 = icmp ult i64 %add.iv, %trip + br i1 %cmp1, label %latch, label %headerexit + +latch: ; preds = %header + %shft = ashr i64 %add.iv, 1 + %cmp2 = icmp ult i64 %shft, %trip + br i1 %cmp2, label %header, label %latchexit + +headerexit: ; preds = %header + %addphi = phi i64 [ %add.iv, %header ] + br label %mergedexit + +latchexit: ; preds = %latch + %shftphi = phi i64 [ %shft, %latch ] + br label %mergedexit + +mergedexit: ; preds = %latchexit, %headerexit + %retval = phi i64 [ %addphi, %headerexit ], [ %shftphi, %latchexit ] + ret i64 %retval +} + +; mergedexit has edges from loop exit blocks and a block outside the loop. 
+define void @test2(i1 %cond, i32 %n) { +; CHECK-LABEL: header.1: +; CHECK-NEXT: %add.iv.1 = add nuw nsw i64 %add.iv, 2 +; CHECK: br i1 %cmp1.1, label %latch.1, label %headerexit +; CHECK-LABEL: latch.3: +; CHECK: %cmp2.3 = icmp ult i64 %shft.3, %trip +; CHECK-NEXT: br i1 %cmp2.3, label %header, label %latchexit, !llvm.loop +entry: + br i1 %cond, label %preheader, label %mergedexit + +preheader: ; preds = %entry + %trip = zext i32 %n to i64 + br label %header + +header: ; preds = %latch, %preheader + %iv = phi i64 [ 2, %preheader ], [ %add.iv, %latch ] + %add.iv = add nuw nsw i64 %iv, 2 + %cmp1 = icmp ult i64 %add.iv, %trip + br i1 %cmp1, label %latch, label %headerexit + +latch: ; preds = %header + %shft = ashr i64 %add.iv, 1 + %cmp2 = icmp ult i64 %shft, %trip + br i1 %cmp2, label %header, label %latchexit + +headerexit: ; preds = %header + br label %mergedexit + +latchexit: ; preds = %latch + br label %mergedexit + +mergedexit: ; preds = %latchexit, %headerexit, %entry + ret void +} + + +; exitsucc is from loop exit block only. 
+define i64 @test3(i32 %n) { +; CHECK-LABEL: test3( +; CHECK-LABEL: headerexit: +; CHECK-NEXT: br label %exitsucc +; CHECK-LABEL: latchexit: +; CHECK-NEXT: %shftphi = phi i64 [ %shft, %latch ], [ %shft.1, %latch.1 ], [ %shft.2, %latch.2 ], [ %shft.3, %latch.3 ] +; CHECK-NEXT: ret i64 %shftphi +; CHECK-LABEL: exitsucc: +; CHECK-NEXT: ret i64 96 +entry: + br label %preheader + +preheader: ; preds = %bb + %trip = zext i32 %n to i64 + br label %header + +header: ; preds = %latch, %preheader + %iv = phi i64 [ 2, %preheader ], [ %add.iv, %latch ] + %add.iv = add nuw nsw i64 %iv, 2 + %cmp1 = icmp ult i64 %add.iv, %trip + br i1 %cmp1, label %latch, label %headerexit + +latch: ; preds = %header + %shft = ashr i64 %add.iv, 1 + %cmp2 = icmp ult i64 %shft, %trip + br i1 %cmp2, label %header, label %latchexit + +headerexit: ; preds = %header + br label %exitsucc + +latchexit: ; preds = %latch + %shftphi = phi i64 [ %shft, %latch ] + ret i64 %shftphi + +exitsucc: ; preds = %headerexit + ret i64 96 +} diff --git a/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index b5e914500fb4a..31c564779fb24 100644 --- a/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -86,10 +86,10 @@ for.end: ; preds = %for.end.loopexit, % ; AUTO_VEC-NEXT: entry: ; AUTO_VEC-NEXT: [[TMP0:%.*]] = icmp sgt i64 %n, 1 ; AUTO_VEC-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 %n, i64 1 -; AUTO_VEC: br i1 {{.*}}, label %for.body, label %min.iters.checked -; AUTO_VEC: min.iters.checked: +; AUTO_VEC: br i1 {{.*}}, label %for.body, label %vector.ph +; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 -; AUTO_VEC: br i1 {{.*}}, label %for.body, label %vector.body +; AUTO_VEC: br label %vector.body ; AUTO_VEC: middle.block: ; AUTO_VEC: [[TMP11:%.*]] = add nsw i64 [[N_VEC]], -1 ; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP11]] to 
double diff --git a/test/Transforms/LoopVectorize/debugloc.ll b/test/Transforms/LoopVectorize/debugloc.ll index 49d88323523c1..f2d68fb4e62bd 100644 --- a/test/Transforms/LoopVectorize/debugloc.ll +++ b/test/Transforms/LoopVectorize/debugloc.ll @@ -5,7 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Make sure we are preserving debug info in the vectorized code. ; CHECK: for.body.lr.ph -; CHECK: cmp.zero = icmp eq i64 {{.*}}, 0, !dbg !{{[0-9]+}} +; CHECK: min.iters.check = icmp ult i64 {{.*}}, 2, !dbg !{{[0-9]+}} ; CHECK: vector.body ; CHECK: index {{.*}}, !dbg ![[LOC:[0-9]+]] ; CHECK: getelementptr inbounds i32, i32* %a, {{.*}}, !dbg ![[LOC]] diff --git a/test/Transforms/LoopVectorize/first-order-recurrence.ll b/test/Transforms/LoopVectorize/first-order-recurrence.ll index 0ff94c1450acf..508938958d59a 100644 --- a/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -22,7 +22,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -79,7 +79,7 @@ for.exit: ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, 
%for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -144,7 +144,7 @@ scalar.body: ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -288,7 +288,7 @@ for.cond.cleanup3: ; UNROLL-NO-IC-LABEL: @PR30183( ; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 +; UNROLL-NO-IC: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 ; UNROLL-NO-IC-NEXT: br label %vector.body ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] diff --git a/test/Transforms/LoopVectorize/float-induction.ll b/test/Transforms/LoopVectorize/float-induction.ll index 8eec6e262c1a1..a7cc4530ceb39 100644 --- a/test/Transforms/LoopVectorize/float-induction.ll +++ b/test/Transforms/LoopVectorize/float-induction.ll @@ -15,7 +15,7 @@ ; VEC4_INTERL1-LABEL: @fp_iv_loop1( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector 
<4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer @@ -37,7 +37,7 @@ ; VEC4_INTERL2-LABEL: @fp_iv_loop1( ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL2: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer @@ -63,7 +63,7 @@ ; VEC1_INTERL2-LABEL: @fp_iv_loop1( ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: br label %vector.body +; VEC1_INTERL2: br label %vector.body ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 @@ -115,7 +115,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC4_INTERL1-LABEL: @fp_iv_loop2( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00> ; VEC4_INTERL1-NEXT: br label %vector.body @@ -172,7 +172,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC4_INTERL1: for.body.lr.ph: ; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4 ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, 
i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer @@ -250,7 +250,7 @@ for.end: ; VEC4_INTERL1-LABEL: @fp_iv_loop4( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: br label %vector.body ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] @@ -289,7 +289,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar( ; VEC2_INTERL1_PRED_STORE: vector.body: -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* diff --git a/test/Transforms/LoopVectorize/if-conversion-nest.ll b/test/Transforms/LoopVectorize/if-conversion-nest.ll index 7f381ae6ad7b5..0d6e4b1e61b44 100644 --- a/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -13,24 +13,21 @@ define i32 @foo(i32* nocapture %A, i32* 
nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[MIN_ITERS_CHECKED:%.*]] -; CHECK: min.iters.checked: -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[CMP_ZERO:%.*]] = icmp eq i64 [[N_VEC]], 0 -; CHECK-NEXT: br i1 [[CMP_ZERO]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] ; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 
[ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -55,10 +52,10 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[MIN_ITERS_CHECKED]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/test/Transforms/LoopVectorize/induction-step.ll b/test/Transforms/LoopVectorize/induction-step.ll index 33e8ed067160d..b37537efcc513 100644 --- a/test/Transforms/LoopVectorize/induction-step.ll +++ b/test/Transforms/LoopVectorize/induction-step.ll @@ -15,7 +15,7 @@ ; CHECK: for.body.lr.ph: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @int_inc, align 4 ; CHECK: vector.ph: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 +; CHECK: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer @@ -86,7 +86,7 @@ for.end: ; preds = %for.end.loopexit, % ; 
CHECK-LABEL: @induction_with_loop_inv( ; CHECK: vector.ph: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 +; CHECK: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 7e9e6b1cdc8e3..d77806da59bed 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -501,13 +501,13 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; condition and branch directly to the scalar loop. ; CHECK-LABEL: max_i32_backedgetaken -; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 true, label %scalar.ph, label %vector.ph ; CHECK: middle.block: ; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0 ; CHECK: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ] -; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ] +; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ] define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 1e8b982363d80..89c0ac1091676 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; ; CHECK-LABEL: @interleaved_with_cond_store_0( ; -; CHECK: 
min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf @@ -58,7 +58,7 @@ for.end: ; ; CHECK-LABEL: @interleaved_with_cond_store_1( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf @@ -117,7 +117,7 @@ for.end: ; ; CHECK-LABEL: @interleaved_with_cond_store_2( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf diff --git a/test/Transforms/LoopVectorize/interleaved-accesses.ll b/test/Transforms/LoopVectorize/interleaved-accesses.ll index d84dc42bdf543..530c2f66552af 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -338,7 +338,7 @@ for.body: ; preds = %for.body, %entry ; } ; CHECK-LABEL: @even_load_dynamic_tc( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -579,7 +579,7 @@ for.body: ; preds = %for.body, %entry ; } ; CHECK-LABEL: @PR27626_0( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -627,7 +627,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_1( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 
%[[IsZero]], i64 4, i64 %n.mod.vf @@ -680,7 +680,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_2( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -728,7 +728,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_3( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf diff --git a/test/Transforms/LoopVectorize/iv_outside_user.ll b/test/Transforms/LoopVectorize/iv_outside_user.ll index 8a44af96e7f4b..265188886996b 100644 --- a/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -135,7 +135,7 @@ for.end: } ; CHECK-LABEL: @PR30742 -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2 ; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]] ; CHECK: middle.block diff --git a/test/Transforms/LoopVectorize/miniters.ll b/test/Transforms/LoopVectorize/miniters.ll index 1cb67f9150ac2..f5f4eb5eaa01c 100644 --- a/test/Transforms/LoopVectorize/miniters.ll +++ b/test/Transforms/LoopVectorize/miniters.ll @@ -10,10 +10,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Generate min.iters.check to skip the vector loop and jump to scalar.ph directly when loop iteration number is less than VF * UF. 
; CHECK-LABEL: foo( ; CHECK: %min.iters.check = icmp ult i64 %N, 4 -; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.ph ; UNROLL-LABEL: foo( ; UNROLL: %min.iters.check = icmp ult i64 %N, 8 -; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %vector.ph define void @foo(i64 %N) { entry: diff --git a/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll new file mode 100644 index 0000000000000..40af8f3adf029 --- /dev/null +++ b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll @@ -0,0 +1,240 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check that the vectorizer identifies the %p.09 phi, +; as an induction variable, despite the potential overflow +; due to the truncation from 32bit to 8bit. +; SCEV will detect the pattern "sext(trunc(%p.09)) + %step" +; and generate the required runtime checks under which +; we can assume no overflow. We check here that we generate +; exactly two runtime checks: +; 1) an overflow check: +; {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: <nssw> +; 2) an equality check verifying that the step of the induction +; is equal to sext(trunc(step)): +; Equal predicate: %step == (sext i8 (trunc i32 %step to i8) to i32) +; +; See also pr30654. 
+; +; int a[N]; +; void doit1(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; + +; CHECK-LABEL: @doit1 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow +; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]] +; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +@a = common local_unnamed_addr global [250 x i32] zeroinitializer, align 16 + +; Function Attrs: norecurse nounwind uwtable +define void @doit1(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 ; guard: the loop is skipped entirely when n is not positive + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %sext = shl i32 %p.09, 24 ; shl by 24 followed by ashr by 24 sign-extends the low 8 bits of %p.09 (the char p of the C sketch) + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step ; next %p.09: the sign-extended value advanced by the loop-invariant argument %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Same as above, but for checking the SCEV "zext(trunc(%p.09)) + %step".
+; Here we expect the following two predicates to be added for runtime checking: +; 1) {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: <nusw> +; 2) Equal predicate: %step == (zext i8 (trunc i32 %step to i8) to i32) +; +; int a[N]; +; void doit2(int n, int step) { +; int i; +; unsigned char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; + +; CHECK-LABEL: @doit2 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow +; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]] +; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +; Function Attrs: norecurse nounwind uwtable +define void @doit2(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 ; guard: the loop is skipped entirely when n is not positive + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %conv = and i32 %p.09, 255 ; the mask keeps only the low 8 bits, i.e. zero-extends the unsigned char p of the C sketch + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step ; next %p.09: the zero-extended value advanced by the loop-invariant argument %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Here we check that the same phi scev analysis would fail +; to create the runtime checks because the step is not invariant. +; As a result vectorization will fail.
+; +; int a[N]; +; void doit3(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; step += 2; +; } +; } +; + +; CHECK-LABEL: @doit3 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: + +; Function Attrs: norecurse nounwind uwtable +define void @doit3(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp9 = icmp sgt i32 %n, 0 ; guard: the loop is skipped entirely when n is not positive + br i1 %cmp9, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.012 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %step.addr.010 = phi i32 [ %add3, %for.body ], [ %step, %for.body.preheader ] ; unlike doit1, the step is itself a loop-carried value + %sext = shl i32 %p.012, 24 ; shl by 24 followed by ashr by 24 sign-extends the low 8 bits of %p.012 + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step.addr.010 + %add3 = add nsw i32 %step.addr.010, 2 ; the step grows by 2 on every iteration, so it is not loop-invariant + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + +; Lastly, we also check the case where we can tell at compile time that +; the step of the induction is equal to sext(trunc(step)), in which case +; we don't have to check this equality at runtime (we only need the +; runtime overflow check).
Therefore only the following overflow predicate +; will be added for runtime checking: +; {0,+,%cstep}<%for.body> Added Flags: <nssw> +; +; int a[N]; +; void doit4(int n, char cstep) { +; int i; +; char p = 0; +; int istep = cstep; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + istep; +; } +; } + +; CHECK-LABEL: @doit4 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %{{.*}} = or i1 {{.*}}, %mul.overflow +; CHECK-NOT: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK-NOT: %{{.*}} = or i1 %{{.*}}, %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +; Function Attrs: norecurse nounwind uwtable +define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr { +entry: + %conv = sext i8 %cstep to i32 ; the step is an i8 argument sign-extended once, before the loop (istep in the C sketch) + %cmp10 = icmp sgt i32 %n, 0 ; guard: the loop is skipped entirely when n is not positive + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.011 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %sext = shl i32 %p.011, 24 ; shl by 24 followed by ashr by 24 sign-extends the low 8 bits of %p.011 + %conv2 = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv2, i32* %arrayidx, align 4 + %add = add nsw i32 %conv2, %conv ; next %p.011: the sign-extended value advanced by the sign-extended i8 step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll index ac1145aab67b0..b37d94c0c328c 100644 --- 
a/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ b/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -4,7 +4,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @add_ints( ;CHECK: br -;CHECK: br ;CHECK: getelementptr ;CHECK-DAG: getelementptr ;CHECK-DAG: icmp ugt diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll index 958b3c135c976..fb05486127156 100644 --- a/test/Transforms/LoopVectorize/runtime-check.ll +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: define i32 @foo ;CHECK: for.body.preheader: -;CHECK: br i1 %cmp.zero, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] +;CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] ;CHECK: vector.memcheck: ;CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph, !dbg [[BODY_LOC]] ;CHECK: load <4 x float> |