diff options
Diffstat (limited to 'test/Transforms/InstCombine/vec_demanded_elts.ll')
-rw-r--r-- | test/Transforms/InstCombine/vec_demanded_elts.ll | 359 |
1 files changed, 11 insertions, 348 deletions
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll index 4245c7a3c134..0b9663300c39 100644 --- a/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -138,22 +138,6 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) -; <rdar://problem/6945110> -define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind { -entry: - %tmp = load <4 x i16>, <4 x i16>* %src - %tmp1 = load <8 x i16>, <8 x i16>* %foo -; CHECK: %tmp2 = shufflevector - %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> -; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle: -; CHECK-NOT: shufflevector - %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7> -; CHECK-NEXT: pmovzxwd - %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3) - ret <4 x i32> %0 -} -declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone - define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind { entry: ; CHECK-LABEL: define <4 x float> @dead_shuffle_elt( @@ -210,130 +194,6 @@ define <4 x float> @test_select(float %f, float %g) { ret <4 x float> %ret } -; We should optimize these two redundant insertqi into one -; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) { -; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32) -; CHECK-NOT: insertqi - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32) - ret <2 x i64> %2 -} - -; The result of this insert is the second arg, since the top 64 bits of -; the result are undefined, and we copy the bottom 64 bits from the -; second arg -; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) { -; CHECK: ret <2 x i64> %i - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0) - ret <2 x i64> %1 -} - -; Test the several types of ranges and ordering that exist for two insertqi -; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) { -; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) -; CHECK: ret <2 x i64> %[[RES]] - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16) - ret <2 x i64> %2 -} - -; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) -; CHECK: ret <2 x i64> %[[RES]] - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) - ret <2 x i64> %2 -} - -; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) { -; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) -; CHECK: ret <2 x i64> %[[RES]] - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16) - ret <2 x i64> %2 -} - -; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) -; CHECK: ret <2 x i64> %[[RES]] - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) - ret <2 x i64> %2 -} - -; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) { -; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) -; CHECK: ret <2 x i64> %[[RES]] - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) - ret <2 x i64> %2 -} - -; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) -; CHECK: ret <2 x i64> %[[RES]] - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) - ret <2 x i64> %2 -} - -; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) { -; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) -; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) - ret <2 x i64> %2 -} - -; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) -; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) - %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) - ret <2 x i64> %2 -} - -; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) { -; CHECK: ret <2 x i64> %i - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0) - ret <2 x i64> %1 -} - -; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) { -; CHECK: ret <2 x i64> undef - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16) - ret <2 x i64> %1 -} - -; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) { -; CHECK: ret <2 x i64> undef - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32) - ret <2 x i64> %1 -} - -; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) -define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) { -; CHECK: ret <2 x i64> undef - %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16) - ret <2 x i64> %1 -} - -; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi -declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind - declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) define <4 x float> @test_vpermilvar_ps(<4 x float> %v) { ; CHECK-LABEL: @test_vpermilvar_ps( @@ -394,212 +254,15 @@ define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) { ret <4 x double> %a } -define <2 x i64> @test_sse2_1() nounwind readnone uwtable { - %S = bitcast i32 1 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4) - %6 = bitcast <8 x i16> %5 to <4 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7) - %9 = bitcast <4 x i32> %8 to <2 x i64> - %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3) - %11 = bitcast <2 x i64> %10 to <8 x i16> - %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S) - %13 = bitcast <8 x i16> %12 to <4 x i32> - %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S) - %15 = bitcast <4 x i32> %14 to <2 x i64> - %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S) - ret <2 x i64> %16 -; CHECK: test_sse2_1 -; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624> -} - -define <4 x i64> @test_avx2_1() nounwind readnone uwtable { - %S = bitcast i32 1 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4) - %6 = bitcast <16 x i16> %5 to <8 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7) - %9 = bitcast <8 x i32> %8 to <4 x i64> - %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3) - %11 = bitcast <4 x i64> %10 to <16 x i16> - %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S) - %13 = bitcast <16 x i16> %12 to <8 x i32> - %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S) - %15 = bitcast <8 x i32> %14 to <4 x i64> - %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S) - ret <4 x i64> %16 -; CHECK: test_avx2_1 -; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256> -} - -define <2 x i64> @test_sse2_0() nounwind readnone uwtable { - %S = bitcast i32 128 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4) - %6 = bitcast <8 x i16> %5 to <4 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7) - %9 = bitcast <4 x i32> %8 to <2 x i64> - %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3) - %11 = bitcast <2 x i64> %10 to <8 x i16> - %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S) - %13 = bitcast <8 x i16> %12 to <4 x i32> - %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S) - %15 = bitcast <4 x i32> %14 to <2 x i64> - %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S) - ret <2 x i64> %16 -; CHECK: test_sse2_0 -; CHECK: ret <2 x i64> zeroinitializer -} - -define <4 x i64> @test_avx2_0() nounwind readnone uwtable { - %S = bitcast i32 128 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4) - %6 = bitcast <16 x i16> %5 to <8 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7) - %9 = bitcast <8 x i32> %8 to <4 x i64> - %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3) - %11 = bitcast <4 x i64> %10 to <16 x i16> - %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S) - %13 = bitcast <16 x i16> %12 to <8 x i32> - %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S) - %15 = bitcast <8 x i32> %14 to <4 x i64> - %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S) - ret <4 x i64> %16 -; CHECK: test_avx2_0 -; CHECK: ret <4 x i64> zeroinitializer -} -define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable { - %S = bitcast i32 1 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4) - %6 = bitcast <8 x i16> %5 to <4 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7) - %9 = bitcast <4 x i32> %8 to <2 x i64> - %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3) - %11 = bitcast <2 x i64> %10 to <8 x i16> - %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S) - %13 = bitcast <8 x i16> %12 to <4 x i32> - %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S) - %15 = bitcast <4 x i32> %14 to <2 x i64> - %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S) - ret <2 x i64> %16 -; CHECK: test_sse2_psrl_1 -; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020> -} - -define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable { - %S = bitcast i32 1 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4) - %6 = bitcast <16 x i16> %5 to <8 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7) - %9 = bitcast <8 x i32> %8 to <4 x i64> - %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3) - %11 = bitcast <4 x i64> %10 to <16 x i16> - %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S) - %13 = bitcast <16 x i16> %12 to <8 x i32> - %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S) - %15 = bitcast <8 x i32> %14 to <4 x i64> - %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S) - ret <4 x i64> %16 -; CHECK: test_avx2_psrl_1 -; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128> -} - -define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable { - %S = bitcast i32 128 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4) - %6 = bitcast <8 x i16> %5 to <4 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7) - %9 = bitcast <4 x i32> %8 to <2 x i64> - %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3) - %11 = bitcast <2 x i64> %10 to <8 x i16> - %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S) - %13 = bitcast <8 x i16> %12 to <4 x i32> - %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S) - %15 = bitcast <4 x i32> %14 to <2 x i64> - %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S) - ret <2 x i64> %16 -; CHECK: test_sse2_psrl_0 -; CHECK: ret <2 x i64> zeroinitializer -} - -define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable { - %S = bitcast i32 128 to i32 - %1 = zext i32 %S to i64 - %2 = insertelement <2 x i64> undef, i64 %1, i32 0 - %3 = insertelement <2 x i64> %2, i64 0, i32 1 - %4 = bitcast <2 x i64> %3 to <8 x i16> - %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4) - %6 = bitcast <16 x i16> %5 to <8 x i32> - %7 = bitcast <2 x i64> %3 to <4 x i32> - %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7) - %9 = bitcast <8 x i32> %8 to <4 x i64> - %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3) - %11 = bitcast <4 x i64> %10 to <16 x i16> - %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S) - %13 = bitcast <16 x i16> %12 to <8 x i32> - %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S) - %15 = bitcast <8 x i32> %14 to <4 x i64> - %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S) - ret <4 x i64> %16 -; CHECK: test_avx2_psrl_0 -; CHECK: ret <4 x i64> zeroinitializer +define <2 x i64> @PR24922(<2 x i64> %v) { +; CHECK-LABEL: @PR24922 +; CHECK: select <2 x i1> +; +; Check that instcombine doesn't wrongly fold the select statement into a +; ret <2 x i64> %v +; +; FIXME: We should be able to simplify the ConstantExpr in the select mask. +entry: + %result = select <2 x i1> <i1 icmp eq (i64 extractelement (<2 x i64> bitcast (<4 x i32> <i32 15, i32 15, i32 15, i32 15> to <2 x i64>), i64 0), i64 0), i1 true>, <2 x i64> %v, <2 x i64> zeroinitializer + ret <2 x i64> %result } - -declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1 -declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1 -declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1 -declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1 -declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1 -declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1 -declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1 -declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1 -declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1 -declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1 -declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1 -declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1 -declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1 -declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1 -declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1 -declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1 -declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1 -declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1 -declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1 -declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1 -declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1 -declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1 -declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1 -declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1 - -attributes #1 = { nounwind readnone } |