author    Andrew Turner <andrew@FreeBSD.org>  2025-01-10 10:34:52 +0000
committer Andrew Turner <andrew@FreeBSD.org>  2025-01-10 10:39:34 +0000
commit    9d1de25930735261c16ed874a933b4c1f1d9041e (patch)
tree      b0cac1c933cc1ecb885c7e757b89ffbf13f1f012
parent    edc5c0de794f521eb620d2b6cbaee2434442a8f3 (diff)
-rw-r--r-- MAINTAINERS 9
-rw-r--r-- Makefile 16
-rw-r--r-- README 33
-rw-r--r-- config.mk.dist 99
-rw-r--r-- math/Dir.mk 253
-rw-r--r-- math/README.contributors 5
-rw-r--r-- math/aarch64/advsimd/acos.c (renamed from pl/math/v_acos_2u.c) 30
-rw-r--r-- math/aarch64/advsimd/acosf.c (renamed from pl/math/v_acosf_1u4.c) 32
-rw-r--r-- math/aarch64/advsimd/acosh.c (renamed from pl/math/v_acosh_3u5.c) 27
-rw-r--r-- math/aarch64/advsimd/acoshf.c (renamed from pl/math/v_acoshf_3u1.c) 62
-rw-r--r-- math/aarch64/advsimd/asin.c (renamed from pl/math/v_asin_3u.c) 75
-rw-r--r-- math/aarch64/advsimd/asinf.c (renamed from pl/math/v_asinf_2u5.c) 30
-rw-r--r-- math/aarch64/advsimd/asinh.c 242
-rw-r--r-- math/aarch64/advsimd/asinhf.c 89
-rw-r--r-- math/aarch64/advsimd/atan.c (renamed from pl/math/v_atan_2u5.c) 85
-rw-r--r-- math/aarch64/advsimd/atan2.c 171
-rw-r--r-- math/aarch64/advsimd/atan2f.c (renamed from pl/math/v_atan2f_3u.c) 84
-rw-r--r-- math/aarch64/advsimd/atanf.c (renamed from pl/math/v_atanf_3u.c) 26
-rw-r--r-- math/aarch64/advsimd/atanh.c (renamed from pl/math/v_atanh_3u5.c) 45
-rw-r--r-- math/aarch64/advsimd/atanhf.c (renamed from pl/math/v_atanhf_3u1.c) 49
-rw-r--r-- math/aarch64/advsimd/cbrt.c (renamed from pl/math/v_cbrt_2u.c) 43
-rw-r--r-- math/aarch64/advsimd/cbrtf.c (renamed from pl/math/v_cbrtf_1u7.c) 19
-rw-r--r-- math/aarch64/advsimd/cexpi.c (renamed from pl/math/v_cexpi_3u5.c) 14
-rw-r--r-- math/aarch64/advsimd/cexpif.c (renamed from pl/math/v_cexpif_1u8.c) 14
-rw-r--r-- math/aarch64/advsimd/cos.c (renamed from math/aarch64/v_cos.c) 21
-rw-r--r-- math/aarch64/advsimd/cosf.c (renamed from math/aarch64/v_cosf.c) 23
-rw-r--r-- math/aarch64/advsimd/cosh.c (renamed from pl/math/v_cosh_2u.c) 25
-rw-r--r-- math/aarch64/advsimd/coshf.c (renamed from pl/math/v_coshf_2u4.c) 52
-rw-r--r-- math/aarch64/advsimd/cospi.c (renamed from pl/math/v_cospi_3u1.c) 25
-rw-r--r-- math/aarch64/advsimd/cospif.c (renamed from pl/math/v_cospif_3u2.c) 29
-rw-r--r-- math/aarch64/advsimd/erf.c (renamed from pl/math/v_erf_2u5.c) 48
-rw-r--r-- math/aarch64/advsimd/erfc.c (renamed from pl/math/v_erfc_1u8.c) 65
-rw-r--r-- math/aarch64/advsimd/erfcf.c (renamed from pl/math/v_erfcf_1u7.c) 54
-rw-r--r-- math/aarch64/advsimd/erff.c (renamed from pl/math/v_erff_2u.c) 34
-rw-r--r-- math/aarch64/advsimd/exp.c (renamed from math/aarch64/v_exp.c) 11
-rw-r--r-- math/aarch64/advsimd/exp10.c (renamed from pl/math/v_exp10_2u.c) 23
-rw-r--r-- math/aarch64/advsimd/exp10f.c (renamed from pl/math/v_exp10f_2u4.c) 87
-rw-r--r-- math/aarch64/advsimd/exp2.c (renamed from pl/math/v_exp2_2u.c) 28
-rw-r--r-- math/aarch64/advsimd/exp2f.c (renamed from math/aarch64/v_exp2f.c) 69
-rw-r--r-- math/aarch64/advsimd/exp2f_1u.c 73
-rw-r--r-- math/aarch64/advsimd/expf.c (renamed from math/aarch64/v_expf.c) 72
-rw-r--r-- math/aarch64/advsimd/expf_1u.c 79
-rw-r--r-- math/aarch64/advsimd/expm1.c 77
-rw-r--r-- math/aarch64/advsimd/expm1f.c 82
-rw-r--r-- math/aarch64/advsimd/finite_pow.h (renamed from pl/math/finite_pow.h) 22
-rw-r--r-- math/aarch64/advsimd/hypot.c (renamed from pl/math/v_hypot_1u5.c) 30
-rw-r--r-- math/aarch64/advsimd/hypotf.c (renamed from pl/math/v_hypotf_1u5.c) 36
-rw-r--r-- math/aarch64/advsimd/log.c 118
-rw-r--r-- math/aarch64/advsimd/log10.c 132
-rw-r--r-- math/aarch64/advsimd/log10f.c 106
-rw-r--r-- math/aarch64/advsimd/log1p.c 61
-rw-r--r-- math/aarch64/advsimd/log1pf.c 92
-rw-r--r-- math/aarch64/advsimd/log2.c 123
-rw-r--r-- math/aarch64/advsimd/log2f.c 102
-rw-r--r-- math/aarch64/advsimd/logf.c 88
-rw-r--r-- math/aarch64/advsimd/modf.c 33
-rw-r--r-- math/aarch64/advsimd/modff.c 34
-rw-r--r-- math/aarch64/advsimd/pow.c (renamed from pl/math/v_pow_1u5.c) 195
-rw-r--r-- math/aarch64/advsimd/powf.c 209
-rw-r--r-- math/aarch64/advsimd/sin.c (renamed from math/aarch64/v_sin.c) 26
-rw-r--r-- math/aarch64/advsimd/sincos.c (renamed from pl/math/v_sincos_3u5.c) 30
-rw-r--r-- math/aarch64/advsimd/sincosf.c (renamed from pl/math/v_sincosf_1u8.c) 30
-rw-r--r-- math/aarch64/advsimd/sincospi.c 44
-rw-r--r-- math/aarch64/advsimd/sincospif.c 43
-rw-r--r-- math/aarch64/advsimd/sinf.c (renamed from math/aarch64/v_sinf.c) 36
-rw-r--r-- math/aarch64/advsimd/sinh.c 80
-rw-r--r-- math/aarch64/advsimd/sinhf.c (renamed from pl/math/v_sinhf_2u3.c) 46
-rw-r--r-- math/aarch64/advsimd/sinpi.c (renamed from pl/math/v_sinpi_3u1.c) 25
-rw-r--r-- math/aarch64/advsimd/sinpif.c (renamed from pl/math/v_sinpif_3u.c) 29
-rw-r--r-- math/aarch64/advsimd/tan.c (renamed from pl/math/v_tan_3u5.c) 28
-rw-r--r-- math/aarch64/advsimd/tanf.c (renamed from pl/math/v_tanf_3u5.c) 35
-rw-r--r-- math/aarch64/advsimd/tanh.c 67
-rw-r--r-- math/aarch64/advsimd/tanhf.c (renamed from pl/math/v_tanhf_2u6.c) 44
-rw-r--r-- math/aarch64/advsimd/tanpi.c 88
-rw-r--r-- math/aarch64/advsimd/tanpif.c 70
-rw-r--r-- math/aarch64/advsimd/v_expf_inline.h 58
-rw-r--r-- math/aarch64/advsimd/v_expm1_inline.h 86
-rw-r--r-- math/aarch64/advsimd/v_expm1f_inline.h 62
-rw-r--r-- math/aarch64/advsimd/v_log1p_inline.h 119
-rw-r--r-- math/aarch64/advsimd/v_log1pf_inline.h 94
-rw-r--r-- math/aarch64/advsimd/v_log_inline.h (renamed from pl/math/v_log_inline.h) 6
-rw-r--r-- math/aarch64/advsimd/v_math.h (renamed from pl/math/v_math.h) 91
-rw-r--r-- math/aarch64/advsimd/v_poly_f32.h (renamed from pl/math/poly_advsimd_f32.h) 6
-rw-r--r-- math/aarch64/advsimd/v_poly_f64.h (renamed from pl/math/poly_advsimd_f64.h) 6
-rw-r--r-- math/aarch64/advsimd/v_sincos_common.h (renamed from pl/math/v_sincos_common.h) 4
-rw-r--r-- math/aarch64/advsimd/v_sincosf_common.h (renamed from pl/math/v_sincosf_common.h) 2
-rw-r--r-- math/aarch64/advsimd/v_sincospi_common.h 64
-rw-r--r-- math/aarch64/advsimd/v_sincospif_common.h 57
-rw-r--r-- math/aarch64/cospi_3u5.c (renamed from pl/math/cospi_3u1.c) 31
-rw-r--r-- math/aarch64/cospif_2u6.c (renamed from pl/math/cospif_2u6.c) 31
-rw-r--r-- math/aarch64/experimental/README.contributors (renamed from pl/README.contributors) 7
-rw-r--r-- math/aarch64/experimental/acos_2u.c (renamed from pl/math/acos_2u.c) 44
-rw-r--r-- math/aarch64/experimental/acosf_1u4.c (renamed from pl/math/acosf_1u4.c) 40
-rw-r--r-- math/aarch64/experimental/acosh_3u.c (renamed from pl/math/acosh_3u.c) 35
-rw-r--r-- math/aarch64/experimental/acoshf_2u8.c (renamed from pl/math/acoshf_2u8.c) 32
-rw-r--r-- math/aarch64/experimental/advsimd/erfinv_25u.c (renamed from pl/math/v_erfinv_25u.c) 35
-rw-r--r-- math/aarch64/experimental/advsimd/erfinvf_5u.c (renamed from pl/math/v_erfinvf_5u.c) 49
-rw-r--r-- math/aarch64/experimental/advsimd/v_logf_inline.h (renamed from pl/math/v_logf_inline.h) 2
-rw-r--r-- math/aarch64/experimental/asin_3u.c (renamed from pl/math/asin_3u.c) 40
-rw-r--r-- math/aarch64/experimental/asin_data.c (renamed from pl/math/asin_data.c) 2
-rw-r--r-- math/aarch64/experimental/asinf_2u5.c (renamed from pl/math/asinf_2u5.c) 36
-rw-r--r-- math/aarch64/experimental/asinf_data.c (renamed from pl/math/asinf_data.c) 2
-rw-r--r-- math/aarch64/experimental/asinh_2u5.c (renamed from pl/math/asinh_2u5.c) 33
-rw-r--r-- math/aarch64/experimental/asinh_data.c (renamed from pl/math/asinh_data.c) 17
-rw-r--r-- math/aarch64/experimental/asinhf_3u5.c (renamed from pl/math/asinhf_3u5.c) 25
-rw-r--r-- math/aarch64/experimental/asinhf_data.c 15
-rw-r--r-- math/aarch64/experimental/atan2_2u5.c (renamed from pl/math/atan2_2u5.c) 24
-rw-r--r-- math/aarch64/experimental/atan2f_3u.c (renamed from pl/math/atan2f_3u.c) 24
-rw-r--r-- math/aarch64/experimental/atan_2u5.c (renamed from pl/math/atan_2u5.c) 22
-rw-r--r-- math/aarch64/experimental/atan_common.h (renamed from pl/math/atan_common.h) 2
-rw-r--r-- math/aarch64/experimental/atan_data.c 23
-rw-r--r-- math/aarch64/experimental/atanf_2u9.c (renamed from pl/math/atanf_2u9.c) 18
-rw-r--r-- math/aarch64/experimental/atanf_common.h (renamed from pl/math/atanf_common.h) 2
-rw-r--r-- math/aarch64/experimental/atanf_data.c 17
-rw-r--r-- math/aarch64/experimental/atanh_3u.c (renamed from pl/math/atanh_3u.c) 18
-rw-r--r-- math/aarch64/experimental/atanhf_3u1.c (renamed from pl/math/atanhf_3u1.c) 16
-rw-r--r-- math/aarch64/experimental/cbrt_2u.c (renamed from pl/math/cbrt_2u.c) 16
-rw-r--r-- math/aarch64/experimental/cbrt_data.c (renamed from pl/math/cbrt_data.c) 2
-rw-r--r-- math/aarch64/experimental/cbrtf_1u5.c (renamed from pl/math/cbrtf_1u5.c) 16
-rw-r--r-- math/aarch64/experimental/cbrtf_data.c (renamed from pl/math/cbrtf_data.c) 2
-rw-r--r-- math/aarch64/experimental/cosh_2u.c (renamed from pl/math/cosh_2u.c) 34
-rw-r--r-- math/aarch64/experimental/coshf_1u9.c (renamed from pl/math/coshf_1u9.c) 29
-rw-r--r-- math/aarch64/experimental/erf_2u5.c (renamed from pl/math/erf_2u5.c) 21
-rw-r--r-- math/aarch64/experimental/erfc_1u8.c (renamed from pl/math/erfc_1u8.c) 26
-rw-r--r-- math/aarch64/experimental/erfcf_1u7.c (renamed from pl/math/erfcf_1u7.c) 24
-rw-r--r-- math/aarch64/experimental/erff_2u.c (renamed from pl/math/erff_2u.c) 21
-rw-r--r-- math/aarch64/experimental/erfinv_24u5.c (renamed from pl/math/erfinv_24u5.c) 20
-rw-r--r-- math/aarch64/experimental/erfinvf_4u7.c (renamed from pl/math/erfinvf_4u7.c) 16
-rw-r--r-- math/aarch64/experimental/erfinvl.c (renamed from pl/math/erfinvl.c) 2
-rw-r--r-- math/aarch64/experimental/exp_inline.h (renamed from pl/math/exp.c) 22
-rw-r--r-- math/aarch64/experimental/expf_data.c (renamed from pl/math/expf_data.c) 4
-rw-r--r-- math/aarch64/experimental/expm1_2u5.c (renamed from pl/math/expm1_2u5.c) 20
-rw-r--r-- math/aarch64/experimental/expm1_data.c 21
-rw-r--r-- math/aarch64/experimental/expm1f_1u6.c (renamed from pl/math/expm1f_1u6.c) 24
-rw-r--r-- math/aarch64/experimental/expm1f_data.c (renamed from pl/math/expm1f_data.c) 6
-rw-r--r-- math/aarch64/experimental/log10_2u.c (renamed from pl/math/log10_2u.c) 33
-rw-r--r-- math/aarch64/experimental/log10_data.c (renamed from pl/math/log10_data.c) 4
-rw-r--r-- math/aarch64/experimental/log1p_2u.c (renamed from pl/math/log1p_2u.c) 20
-rw-r--r-- math/aarch64/experimental/log1p_data.c 20
-rw-r--r-- math/aarch64/experimental/log1pf_2u1.c (renamed from pl/math/log1pf_2u1.c) 18
-rw-r--r-- math/aarch64/experimental/log1pf_data.c (renamed from pl/math/log1pf_data.c) 8
-rw-r--r-- math/aarch64/experimental/sinh_3u.c (renamed from pl/math/sinh_3u.c) 27
-rw-r--r-- math/aarch64/experimental/sinhf_2u3.c (renamed from pl/math/sinhf_2u3.c) 32
-rw-r--r-- math/aarch64/experimental/sve/erfinv_25u.c 156
-rw-r--r-- math/aarch64/experimental/sve/erfinvf_5u.c 156
-rw-r--r-- math/aarch64/experimental/sve/powi.c (renamed from pl/math/sv_powi.c) 3
-rw-r--r-- math/aarch64/experimental/sve/powif.c (renamed from pl/math/sv_powif.c) 3
-rw-r--r-- math/aarch64/experimental/sve/sv_logf_inline.h 51
-rw-r--r-- math/aarch64/experimental/tanf_3u3.c (renamed from pl/math/tanf_3u3.c) 42
-rw-r--r-- math/aarch64/experimental/tanf_data.c (renamed from pl/math/tanf_data.c) 2
-rw-r--r-- math/aarch64/experimental/tanh_3u.c (renamed from pl/math/tanh_3u.c) 22
-rw-r--r-- math/aarch64/experimental/tanhf_2u6.c (renamed from pl/math/tanhf_2u6.c) 25
-rw-r--r-- math/aarch64/sincospi_4u.c 158
-rw-r--r-- math/aarch64/sincospif_3u2.c 145
-rw-r--r-- math/aarch64/sinpi_3u5.c (renamed from pl/math/sinpi_3u.c) 39
-rw-r--r-- math/aarch64/sinpif_2u5.c (renamed from pl/math/sinpif_2u5.c) 35
-rw-r--r-- math/aarch64/sve/acos.c (renamed from pl/math/sv_acos_2u.c) 24
-rw-r--r-- math/aarch64/sve/acosf.c (renamed from pl/math/sv_acosf_1u4.c) 24
-rw-r--r-- math/aarch64/sve/acosh.c 51
-rw-r--r-- math/aarch64/sve/acoshf.c 51
-rw-r--r-- math/aarch64/sve/asin.c (renamed from pl/math/sv_asin_3u.c) 28
-rw-r--r-- math/aarch64/sve/asinf.c (renamed from pl/math/sv_asinf_2u5.c) 24
-rw-r--r-- math/aarch64/sve/asinh.c 197
-rw-r--r-- math/aarch64/sve/asinhf.c (renamed from pl/math/sv_asinhf_2u5.c) 38
-rw-r--r-- math/aarch64/sve/atan.c (renamed from pl/math/sv_atan_2u5.c) 22
-rw-r--r-- math/aarch64/sve/atan2.c (renamed from pl/math/sv_atan2_2u5.c) 54
-rw-r--r-- math/aarch64/sve/atan2f.c (renamed from pl/math/sv_atan2f_3u.c) 55
-rw-r--r-- math/aarch64/sve/atanf.c (renamed from pl/math/sv_atanf_2u9.c) 22
-rw-r--r-- math/aarch64/sve/atanh.c (renamed from pl/math/sv_atanh_3u3.c) 24
-rw-r--r-- math/aarch64/sve/atanhf.c (renamed from pl/math/sv_atanhf_2u8.c) 33
-rw-r--r-- math/aarch64/sve/cbrt.c (renamed from pl/math/sv_cbrt_2u.c) 35
-rw-r--r-- math/aarch64/sve/cbrtf.c (renamed from pl/math/sv_cbrtf_1u7.c) 16
-rw-r--r-- math/aarch64/sve/cexpi.c (renamed from pl/math/sv_cexpi_3u5.c) 17
-rw-r--r-- math/aarch64/sve/cexpif.c (renamed from pl/math/sv_cexpif_1u8.c) 17
-rw-r--r-- math/aarch64/sve/cos.c (renamed from pl/math/sv_cos_2u5.c) 16
-rw-r--r-- math/aarch64/sve/cosf.c (renamed from pl/math/sv_cosf_2u1.c) 16
-rw-r--r-- math/aarch64/sve/cosh.c (renamed from pl/math/sv_cosh_2u.c) 34
-rw-r--r-- math/aarch64/sve/coshf.c 62
-rw-r--r-- math/aarch64/sve/cospi.c (renamed from pl/math/sv_cospi_3u2.c) 25
-rw-r--r-- math/aarch64/sve/cospif.c (renamed from pl/math/sv_cospif_2u6.c) 25
-rw-r--r-- math/aarch64/sve/erf.c (renamed from pl/math/sv_erf_2u5.c) 28
-rw-r--r-- math/aarch64/sve/erfc.c (renamed from pl/math/sv_erfc_1u8.c) 24
-rw-r--r-- math/aarch64/sve/erfcf.c (renamed from pl/math/sv_erfcf_1u7.c) 36
-rw-r--r-- math/aarch64/sve/erff.c (renamed from pl/math/sv_erff_2u.c) 33
-rw-r--r-- math/aarch64/sve/exp.c (renamed from pl/math/sv_exp_1u5.c) 56
-rw-r--r-- math/aarch64/sve/exp10.c (renamed from pl/math/sv_exp10_1u5.c) 43
-rw-r--r-- math/aarch64/sve/exp10f.c 101
-rw-r--r-- math/aarch64/sve/exp2.c (renamed from pl/math/sv_exp2_2u.c) 44
-rw-r--r-- math/aarch64/sve/exp2f.c 83
-rw-r--r-- math/aarch64/sve/expf.c 50
-rw-r--r-- math/aarch64/sve/expm1.c (renamed from pl/math/sv_expm1_2u5.c) 20
-rw-r--r-- math/aarch64/sve/expm1f.c (renamed from pl/math/sv_expm1f_1u6.c) 46
-rw-r--r-- math/aarch64/sve/hypot.c (renamed from pl/math/sv_hypot_1u5.c) 20
-rw-r--r-- math/aarch64/sve/hypotf.c (renamed from pl/math/sv_hypotf_1u5.c) 20
-rw-r--r-- math/aarch64/sve/log.c 97
-rw-r--r-- math/aarch64/sve/log10.c 101
-rw-r--r-- math/aarch64/sve/log10f.c (renamed from pl/math/sv_log10f_3u5.c) 65
-rw-r--r-- math/aarch64/sve/log1p.c (renamed from pl/math/sv_log1p_2u5.c) 24
-rw-r--r-- math/aarch64/sve/log1pf.c 43
-rw-r--r-- math/aarch64/sve/log2.c 96
-rw-r--r-- math/aarch64/sve/log2f.c (renamed from pl/math/sv_log2f_2u5.c) 62
-rw-r--r-- math/aarch64/sve/logf.c (renamed from pl/math/sv_logf_3u4.c) 64
-rw-r--r-- math/aarch64/sve/modf.c 36
-rw-r--r-- math/aarch64/sve/modff.c 36
-rw-r--r-- math/aarch64/sve/pow.c (renamed from pl/math/sv_pow_1u5.c) 295
-rw-r--r-- math/aarch64/sve/powf.c (renamed from pl/math/sv_powf_2u6.c) 157
-rw-r--r-- math/aarch64/sve/sin.c (renamed from pl/math/sv_sin_3u5.c) 16
-rw-r--r-- math/aarch64/sve/sincos.c (renamed from pl/math/sv_sincos_3u5.c) 36
-rw-r--r-- math/aarch64/sve/sincosf.c (renamed from pl/math/sv_sincosf_1u8.c) 36
-rw-r--r-- math/aarch64/sve/sincospi.c 47
-rw-r--r-- math/aarch64/sve/sincospif.c 46
-rw-r--r-- math/aarch64/sve/sinf.c (renamed from pl/math/sv_sinf_1u9.c) 16
-rw-r--r-- math/aarch64/sve/sinh.c (renamed from pl/math/sv_sinh_3u.c) 20
-rw-r--r-- math/aarch64/sve/sinhf.c (renamed from pl/math/sv_sinhf_2u3.c) 21
-rw-r--r-- math/aarch64/sve/sinpi.c (renamed from pl/math/sv_sinpi_3u1.c) 33
-rw-r--r-- math/aarch64/sve/sinpif.c (renamed from pl/math/sv_sinpif_2u5.c) 33
-rw-r--r-- math/aarch64/sve/sv_expf_inline.h 66
-rw-r--r-- math/aarch64/sve/sv_expm1f_inline.h (renamed from pl/math/sv_expm1f_inline.h) 36
-rw-r--r-- math/aarch64/sve/sv_log1p_inline.h (renamed from pl/math/sv_log1p_inline.h) 14
-rw-r--r-- math/aarch64/sve/sv_log1pf_inline.h 83
-rw-r--r-- math/aarch64/sve/sv_log_inline.h 83
-rw-r--r-- math/aarch64/sve/sv_math.h (renamed from pl/math/sv_math.h) 32
-rw-r--r-- math/aarch64/sve/sv_poly_f32.h (renamed from pl/math/poly_sve_f32.h) 8
-rw-r--r-- math/aarch64/sve/sv_poly_f64.h (renamed from pl/math/poly_sve_f64.h) 8
-rw-r--r-- math/aarch64/sve/sv_poly_generic.h (renamed from pl/math/poly_sve_generic.h) 32
-rw-r--r-- math/aarch64/sve/sv_sincos_common.h (renamed from pl/math/sv_sincos_common.h) 4
-rw-r--r-- math/aarch64/sve/sv_sincosf_common.h (renamed from pl/math/sv_sincosf_common.h) 2
-rw-r--r-- math/aarch64/sve/sv_sincospi_common.h 76
-rw-r--r-- math/aarch64/sve/sv_sincospif_common.h 82
-rw-r--r-- math/aarch64/sve/tan.c 131
-rw-r--r-- math/aarch64/sve/tanf.c (renamed from pl/math/sv_tanf_3u5.c) 46
-rw-r--r-- math/aarch64/sve/tanh.c (renamed from pl/math/sv_tanh_3u.c) 20
-rw-r--r-- math/aarch64/sve/tanhf.c 68
-rw-r--r-- math/aarch64/sve/tanpi.c 89
-rw-r--r-- math/aarch64/sve/tanpif.c 68
-rw-r--r-- math/aarch64/tanpi_2u5.c 158
-rw-r--r-- math/aarch64/tanpif_3u1.c 145
-rw-r--r-- math/aarch64/v_erf_data.c (renamed from pl/math/erf_data.c) 10
-rw-r--r-- math/aarch64/v_erfc_data.c (renamed from pl/math/erfc_data.c) 10
-rw-r--r-- math/aarch64/v_erfcf_data.c (renamed from pl/math/erfcf_data.c) 10
-rw-r--r-- math/aarch64/v_erff_data.c (renamed from pl/math/erff_data.c) 10
-rw-r--r-- math/aarch64/v_exp2f_1u.c 72
-rw-r--r-- math/aarch64/v_exp_data.c 99
-rw-r--r-- math/aarch64/v_exp_tail_data.c (renamed from pl/math/v_exp_tail_data.c) 4
-rw-r--r-- math/aarch64/v_expf_1u.c 77
-rw-r--r-- math/aarch64/v_log.c 100
-rw-r--r-- math/aarch64/v_log10_data.c (renamed from pl/math/v_log10_data.c) 2
-rw-r--r-- math/aarch64/v_log2_data.c (renamed from pl/math/v_log2_data.c) 2
-rw-r--r-- math/aarch64/v_log_data.c 25
-rw-r--r-- math/aarch64/v_logf.c 74
-rw-r--r-- math/aarch64/v_math.h 135
-rw-r--r-- math/aarch64/v_pow.c 22
-rw-r--r-- math/aarch64/v_pow_exp_data.c (renamed from pl/math/v_pow_exp_data.c) 2
-rw-r--r-- math/aarch64/v_pow_log_data.c (renamed from pl/math/v_pow_log_data.c) 2
-rw-r--r-- math/aarch64/v_powf.c 148
-rw-r--r-- math/aarch64/v_powf_data.c (renamed from pl/math/v_powf_data.c) 2
-rw-r--r-- math/cosf.c 10
-rw-r--r-- math/erf.c 12
-rw-r--r-- math/erff.c 12
-rw-r--r-- math/exp.c 25
-rw-r--r-- math/exp10.c 22
-rw-r--r-- math/exp2.c 11
-rw-r--r-- math/exp2f.c 10
-rw-r--r-- math/expf.c 10
-rw-r--r-- math/include/mathlib.h 294
-rw-r--r-- math/include/test_defs.h 21
-rw-r--r-- math/include/test_sig.h 47
-rw-r--r-- math/log.c 11
-rw-r--r-- math/log10f.c (renamed from pl/math/log10f.c) 24
-rw-r--r-- math/log2.c 11
-rw-r--r-- math/log2f.c 11
-rw-r--r-- math/logf.c 11
-rw-r--r-- math/logf_data.c 3
-rw-r--r-- math/math_config.h 261
-rw-r--r-- math/poly_generic.h (renamed from pl/math/poly_generic.h) 2
-rw-r--r-- math/poly_scalar_f32.h (renamed from pl/math/poly_scalar_f32.h) 6
-rw-r--r-- math/poly_scalar_f64.h (renamed from pl/math/poly_scalar_f64.h) 6
-rw-r--r-- math/pow.c 22
-rw-r--r-- math/powf.c 12
-rw-r--r-- math/sincosf.c 12
-rw-r--r-- math/sincosf.h 5
-rw-r--r-- math/sinf.c 10
-rw-r--r-- math/test/mathbench.c 229
-rw-r--r-- math/test/mathbench_funcs.h 141
-rw-r--r-- math/test/mathbench_wrappers.h 302
-rw-r--r-- math/test/mathtest.c 12
-rw-r--r-- math/test/rtest/dotest.c 45
-rwxr-xr-x math/test/runulp.sh 311
-rw-r--r-- math/test/test_defs.h 31
-rw-r--r-- math/test/testcases/directed/acos.tst (renamed from pl/math/test/testcases/directed/acos.tst) 2
-rw-r--r-- math/test/testcases/directed/acosf.tst (renamed from pl/math/test/testcases/directed/acosf.tst) 2
-rw-r--r-- math/test/testcases/directed/acosh.tst (renamed from pl/math/test/testcases/directed/acosh.tst) 2
-rw-r--r-- math/test/testcases/directed/acoshf.tst (renamed from pl/math/test/testcases/directed/acoshf.tst) 2
-rw-r--r-- math/test/testcases/directed/asin.tst (renamed from pl/math/test/testcases/directed/asin.tst) 2
-rw-r--r-- math/test/testcases/directed/asinf.tst (renamed from pl/math/test/testcases/directed/asinf.tst) 2
-rw-r--r-- math/test/testcases/directed/asinh.tst (renamed from pl/math/test/testcases/directed/asinh.tst) 2
-rw-r--r-- math/test/testcases/directed/asinhf.tst (renamed from pl/math/test/testcases/directed/asinhf.tst) 2
-rw-r--r-- math/test/testcases/directed/atan.tst (renamed from pl/math/test/testcases/directed/atan.tst) 2
-rw-r--r-- math/test/testcases/directed/atan2.tst (renamed from pl/math/test/testcases/directed/atan2.tst) 2
-rw-r--r-- math/test/testcases/directed/atan2f.tst (renamed from pl/math/test/testcases/directed/atan2f.tst) 2
-rw-r--r-- math/test/testcases/directed/atanf.tst (renamed from pl/math/test/testcases/directed/atanf.tst) 2
-rw-r--r-- math/test/testcases/directed/atanh.tst (renamed from pl/math/test/testcases/directed/atanh.tst) 2
-rw-r--r-- math/test/testcases/directed/atanhf.tst (renamed from pl/math/test/testcases/directed/atanhf.tst) 2
-rw-r--r-- math/test/testcases/directed/cbrtf.tst (renamed from pl/math/test/testcases/directed/cbrtf.tst) 2
-rw-r--r-- math/test/testcases/directed/cosh.tst (renamed from pl/math/test/testcases/directed/cosh.tst) 2
-rw-r--r-- math/test/testcases/directed/coshf.tst (renamed from pl/math/test/testcases/directed/coshf.tst) 2
-rw-r--r-- math/test/testcases/directed/erfc.tst (renamed from pl/math/test/testcases/directed/erfc.tst) 2
-rw-r--r-- math/test/testcases/directed/erfcf.tst (renamed from pl/math/test/testcases/directed/erfcf.tst) 2
-rw-r--r-- math/test/testcases/directed/expm1.tst (renamed from pl/math/test/testcases/directed/expm1.tst) 2
-rw-r--r-- math/test/testcases/directed/expm1f.tst (renamed from pl/math/test/testcases/directed/expm1f.tst) 2
-rw-r--r-- math/test/testcases/directed/log10.tst (renamed from pl/math/test/testcases/directed/log10.tst) 2
-rw-r--r-- math/test/testcases/directed/log10f.tst (renamed from pl/math/test/testcases/directed/log10f.tst) 2
-rw-r--r-- math/test/testcases/directed/log1p.tst (renamed from pl/math/test/testcases/directed/log1p.tst) 2
-rw-r--r-- math/test/testcases/directed/log1pf.tst (renamed from pl/math/test/testcases/directed/log1pf.tst) 2
-rw-r--r-- math/test/testcases/directed/sinh.tst (renamed from pl/math/test/testcases/directed/sinh.tst) 2
-rw-r--r-- math/test/testcases/directed/sinhf.tst (renamed from pl/math/test/testcases/directed/sinhf.tst) 2
-rw-r--r-- math/test/testcases/directed/tanf.tst (renamed from pl/math/test/testcases/directed/tanf.tst) 2
-rw-r--r-- math/test/testcases/directed/tanh.tst (renamed from pl/math/test/testcases/directed/tanh.tst) 2
-rw-r--r-- math/test/testcases/directed/tanhf.tst (renamed from pl/math/test/testcases/directed/tanhf.tst) 2
-rw-r--r-- math/test/trigpi_references.h 106
-rw-r--r-- math/test/ulp.c 328
-rw-r--r-- math/test/ulp.h 41
-rw-r--r-- math/test/ulp_funcs.h 119
-rw-r--r-- math/test/ulp_wrappers.h 418
-rw-r--r-- math/tgamma128.c 2
-rw-r--r-- math/tools/asin.sollya (renamed from pl/math/tools/asin.sollya) 2
-rw-r--r-- math/tools/asinf.sollya (renamed from pl/math/tools/asinf.sollya) 2
-rw-r--r-- math/tools/asinh.sollya (renamed from pl/math/tools/asinh.sollya) 2
-rw-r--r-- math/tools/asinhf.sollya (renamed from pl/math/tools/asinhf.sollya) 2
-rw-r--r-- math/tools/atan.sollya (renamed from pl/math/tools/atan.sollya) 2
-rw-r--r-- math/tools/atanf.sollya (renamed from pl/math/tools/atanf.sollya) 2
-rw-r--r-- math/tools/cbrt.sollya (renamed from pl/math/tools/cbrt.sollya) 2
-rw-r--r-- math/tools/cbrtf.sollya (renamed from pl/math/tools/cbrtf.sollya) 2
-rw-r--r-- math/tools/erf.sollya (renamed from pl/math/tools/erf.sollya) 2
-rw-r--r-- math/tools/erfc.sollya (renamed from pl/math/tools/erfc.sollya) 2
-rw-r--r-- math/tools/erfcf.sollya (renamed from pl/math/tools/erfcf.sollya) 2
-rw-r--r-- math/tools/erff.sollya (renamed from pl/math/tools/erff.sollya) 2
-rw-r--r-- math/tools/exp10.sollya (renamed from pl/math/tools/exp10.sollya) 2
-rw-r--r-- math/tools/expm1.sollya (renamed from pl/math/tools/expm1.sollya) 2
-rw-r--r-- math/tools/expm1f.sollya (renamed from pl/math/tools/expm1f.sollya) 2
-rw-r--r-- math/tools/log10.sollya (renamed from pl/math/tools/log10.sollya) 2
-rw-r--r-- math/tools/log10f.sollya (renamed from pl/math/tools/log10f.sollya) 2
-rw-r--r-- math/tools/log1p.sollya (renamed from pl/math/tools/log1p.sollya) 2
-rw-r--r-- math/tools/log1pf.sollya (renamed from pl/math/tools/log1pf.sollya) 2
-rw-r--r-- math/tools/sincos.sollya (renamed from pl/math/tools/sincos.sollya) 4
-rw-r--r-- math/tools/sincosf.sollya (renamed from pl/math/tools/sincosf.sollya) 2
-rw-r--r-- math/tools/sinpi.sollya (renamed from pl/math/tools/sinpi.sollya) 2
-rw-r--r-- math/tools/tan.sollya (renamed from pl/math/tools/tan.sollya) 2
-rw-r--r-- math/tools/tanf.sollya (renamed from pl/math/tools/tanf.sollya) 2
-rw-r--r-- math/tools/tanpi.sollya 48
-rw-r--r-- math/tools/v_erf.sollya (renamed from pl/math/tools/v_erf.sollya) 2
-rw-r--r-- math/tools/v_erfc.sollya (renamed from pl/math/tools/v_erfc.sollya) 2
-rw-r--r-- math/tools/v_log10.sollya (renamed from pl/math/tools/v_log10.sollya) 2
-rw-r--r-- math/tools/v_log10f.sollya (renamed from pl/math/tools/v_log10f.sollya) 2
-rw-r--r-- math/tools/v_log2f.sollya (renamed from pl/math/tools/v_log2f.sollya) 2
-rw-r--r-- networking/Dir.mk 6
-rw-r--r-- pl/Dir.mk 21
-rw-r--r-- pl/math/Dir.mk 216
-rw-r--r-- pl/math/asinhf_data.c 15
-rw-r--r-- pl/math/atan_data.c 20
-rw-r--r-- pl/math/atanf_data.c 15
-rw-r--r-- pl/math/exp_data.c 1120
-rw-r--r-- pl/math/expf.c 76
-rw-r--r-- pl/math/expm1_data.c 21
-rw-r--r-- pl/math/include/mathlib.h 206
-rw-r--r-- pl/math/include/pl_test.h 24
-rw-r--r-- pl/math/log.c 161
-rw-r--r-- pl/math/log1p_data.c 19
-rw-r--r-- pl/math/log_data.c 511
-rw-r--r-- pl/math/logf.c 75
-rw-r--r-- pl/math/logf_data.c 36
-rw-r--r-- pl/math/math_config.h 624
-rw-r--r-- pl/math/math_err.c 78
-rw-r--r-- pl/math/math_errf.c 78
-rw-r--r-- pl/math/pl_sig.h 59
-rw-r--r-- pl/math/sv_acosh_3u5.c 50
-rw-r--r-- pl/math/sv_acoshf_2u8.c 47
-rw-r--r-- pl/math/sv_asinh_3u0.c 129
-rw-r--r-- pl/math/sv_coshf_2u.c 56
-rw-r--r-- pl/math/sv_erf_data.c 1558
-rw-r--r-- pl/math/sv_erff_data.c 1046
-rw-r--r-- pl/math/sv_exp10f_1u5.c 87
-rw-r--r-- pl/math/sv_exp2f_1u6.c 80
-rw-r--r-- pl/math/sv_expf_2u.c 86
-rw-r--r-- pl/math/sv_expf_inline.h 66
-rw-r--r-- pl/math/sv_log10_2u5.c 75
-rw-r--r-- pl/math/sv_log1pf_1u3.c 97
-rw-r--r-- pl/math/sv_log1pf_inline.h 65
-rw-r--r-- pl/math/sv_log2_3u.c 73
-rw-r--r-- pl/math/sv_log_2u5.c 76
-rw-r--r-- pl/math/sv_tan_3u5.c 99
-rw-r--r-- pl/math/sv_tanhf_2u6.c 59
-rw-r--r-- pl/math/test/mathbench_funcs.h 87
-rw-r--r-- pl/math/test/mathbench_wrappers.h 206
-rw-r--r-- pl/math/test/pl_test.h 39
-rwxr-xr-x pl/math/test/runulp.sh 78
-rw-r--r-- pl/math/test/testcases/directed/erff.tst 17
-rw-r--r-- pl/math/test/testcases/directed/log2.tst 21
-rw-r--r-- pl/math/test/testcases/directed/log2f.tst 27
-rw-r--r-- pl/math/test/testcases/random/double.tst 6
-rw-r--r-- pl/math/test/testcases/random/float.tst 8
-rw-r--r-- pl/math/test/ulp_funcs.h 70
-rw-r--r-- pl/math/test/ulp_wrappers.h 140
-rw-r--r-- pl/math/trigpi_references.c 57
-rw-r--r-- pl/math/v_asinh_3u5.c 175
-rw-r--r-- pl/math/v_asinhf_2u7.c 80
-rw-r--r-- pl/math/v_atan2_3u.c 121
-rw-r--r-- pl/math/v_exp_data.c 55
-rw-r--r-- pl/math/v_exp_tail.h 21
-rw-r--r-- pl/math/v_exp_tail_inline.h 102
-rw-r--r-- pl/math/v_expf_inline.h 60
-rw-r--r-- pl/math/v_expm1_2u5.c 118
-rw-r--r-- pl/math/v_expm1f_1u6.c 117
-rw-r--r-- pl/math/v_expm1f_inline.h 63
-rw-r--r-- pl/math/v_log10_2u5.c 120
-rw-r--r-- pl/math/v_log10f_3u5.c 82
-rw-r--r-- pl/math/v_log1p_2u5.c 128
-rw-r--r-- pl/math/v_log1p_inline.h 91
-rw-r--r-- pl/math/v_log1pf_2u1.c 126
-rw-r--r-- pl/math/v_log1pf_inline.h 67
-rw-r--r-- pl/math/v_log2_3u.c 109
-rw-r--r-- pl/math/v_log2f_2u5.c 77
-rw-r--r-- pl/math/v_log_data.c 161
-rw-r--r-- pl/math/v_sinh_3u.c 118
-rw-r--r-- pl/math/v_tanh_3u.c 106
-rw-r--r-- string/Dir.mk 9
-rw-r--r-- string/aarch64/__mtag_tag_region.S 3
-rw-r--r-- string/aarch64/__mtag_tag_zero_region.S 3
-rw-r--r-- string/aarch64/asmdefs.h 37
-rw-r--r-- string/aarch64/experimental/memchr-sve.S (renamed from string/aarch64/memchr-sve.S) 8
-rw-r--r-- string/aarch64/experimental/memcmp-sve.S (renamed from string/aarch64/memcmp-sve.S) 9
-rw-r--r-- string/aarch64/experimental/stpcpy-sve.S (renamed from string/aarch64/stpcpy-sve.S) 0
-rw-r--r-- string/aarch64/experimental/strchr-sve.S (renamed from string/aarch64/strchr-sve.S) 7
-rw-r--r-- string/aarch64/experimental/strchrnul-sve.S (renamed from string/aarch64/strchrnul-sve.S) 0
-rw-r--r-- string/aarch64/experimental/strcmp-sve.S (renamed from string/aarch64/strcmp-sve.S) 8
-rw-r--r-- string/aarch64/experimental/strcpy-sve.S (renamed from string/aarch64/strcpy-sve.S) 8
-rw-r--r-- string/aarch64/experimental/strlen-sve.S (renamed from string/aarch64/strlen-sve.S) 7
-rw-r--r-- string/aarch64/experimental/strncmp-sve.S (renamed from string/aarch64/strncmp-sve.S) 9
-rw-r--r-- string/aarch64/experimental/strnlen-sve.S (renamed from string/aarch64/strnlen-sve.S) 8
-rw-r--r-- string/aarch64/experimental/strrchr-sve.S (renamed from string/aarch64/strrchr-sve.S) 7
-rw-r--r-- string/aarch64/memchr-mte.S 2
-rw-r--r-- string/aarch64/memchr.S 2
-rw-r--r-- string/aarch64/memcmp.S 4
-rw-r--r-- string/aarch64/memcpy-advsimd.S 3
-rw-r--r-- string/aarch64/memcpy-mops.S 4
-rw-r--r-- string/aarch64/memcpy-sve.S 8
-rw-r--r-- string/aarch64/memcpy.S 3
-rw-r--r-- string/aarch64/memmove-mops.S 4
-rw-r--r-- string/aarch64/memrchr.S 1
-rw-r--r-- string/aarch64/memset-mops.S 3
-rw-r--r-- string/aarch64/memset-sve.S 114
-rw-r--r-- string/aarch64/memset.S 104
-rw-r--r-- string/aarch64/strchr-mte.S 1
-rw-r--r-- string/aarch64/strchr.S 1
-rw-r--r-- string/aarch64/strchrnul-mte.S 1
-rw-r--r-- string/aarch64/strchrnul.S 1
-rw-r--r-- string/aarch64/strcmp.S 2
-rw-r--r-- string/aarch64/strcpy.S 2
-rw-r--r-- string/aarch64/strlen-mte.S 38
-rw-r--r-- string/aarch64/strlen.S 1
-rw-r--r-- string/aarch64/strncmp.S 3
-rw-r--r-- string/aarch64/strnlen.S 2
-rw-r--r-- string/aarch64/strrchr-mte.S 1
-rw-r--r-- string/aarch64/strrchr.S 1
-rw-r--r-- string/bench/memcpy.c 239
-rw-r--r-- string/bench/memset.c 141
-rw-r--r-- string/bench/strlen.c 206
-rw-r--r-- string/include/benchlib.h 31
-rw-r--r-- string/include/stringlib.h 3
-rw-r--r-- string/test/memcpy.c 2
-rw-r--r-- string/test/memmove.c 2
-rw-r--r-- string/test/memset.c 3
472 files changed, 11852 insertions, 14525 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 6c5823a8dbce..06cceb8f2501 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1,12 +1,9 @@
/
- Szabolcs Nagy <szabolcs.nagy@arm.com>
+ Tamar Christina <tamar.christina@arm.com>
math/
- Szabolcs Nagy <szabolcs.nagy@arm.com>
-networking/
- Szabolcs Nagy <szabolcs.nagy@arm.com>
-pl/
Pierre Blanchard <pierre.blanchard@arm.com>
Joe Ramsay <joe.ramsay@arm.com>
+networking/
+ Ola Liljedahl <ola.liljedahl@arm.com>
string/
- Szabolcs Nagy <szabolcs.nagy@arm.com>
Wilco Dijkstra <wilco.dijkstra@arm.com>
diff --git a/Makefile b/Makefile
index c487896728c2..e7503dbd2f60 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
# Makefile - requires GNU make
#
-# Copyright (c) 2018-2022, Arm Limited.
+# Copyright (c) 2018-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
srcdir = .
@@ -11,7 +11,6 @@ includedir = $(prefix)/include
# Configure these in config.mk, do not make changes in this file.
SUBS = math string networking
-PLSUBS = math
HOST_CC = cc
HOST_CFLAGS = -std=c99 -O2
HOST_LDFLAGS =
@@ -21,12 +20,22 @@ CPPFLAGS =
CFLAGS = -std=c99 -O2
CFLAGS_SHARED = -fPIC
CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS)
-CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL
LDFLAGS =
LDLIBS =
AR = $(CROSS_COMPILE)ar
RANLIB = $(CROSS_COMPILE)ranlib
INSTALL = install
+# Detect OS.
+# Assume Unix environment: Linux, Darwin, or Msys.
+OS := $(shell uname -s)
+OS := $(patsubst MSYS%,Msys,$(OS))
+# The following math dependencies can be adjusted in the config file
+# if necessary, e.g. for Msys.
+libm-libs = -lm
+libc-libs = -lc
+mpfr-libs = -lmpfr
+gmp-libs = -lgmp
+mpc-libs = -lmpc
all:
@@ -53,7 +62,6 @@ $(DIRS):
mkdir -p $@
$(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED)
-$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED)
build/%.o: $(srcdir)/%.S
$(CC) $(CFLAGS_ALL) -c -o $@ $<
diff --git a/README b/README
index 651ebdc84bc8..4bbed76d75c8 100644
--- a/README
+++ b/README
@@ -12,12 +12,25 @@ contribution requirements are documented in README.contributors of
the appropriate subdirectory.
Regular quarterly releases are tagged as vYY.MM, the latest
-release is v24.01.
+release is v25.01.
Source code layout:
build/ - build directory (created by make).
-math/ - math subproject sources.
+math/ - math subproject sources for generic scalar
+ subroutines and sources shared with
+ subdirectories of math/.
+ All math routines should meet the quality
+ requirements stated in math/README.contributors;
+ routines that fail to do so are located in an
+ experimental/ directory.
+math/aarch64/ - math subproject AArch64-specific sources
+ and sources shared with subdirectories.
+math/aarch64/advsimd - AdvSIMD-specific math sources.
+math/aarch64/experimental - Experimental math sources that do not
+ meet the quality requirements stated in
+ math/README.contributors.
+math/aarch64/sve - SVE-specific math sources.
math/include/ - math library public headers.
math/test/ - math test and benchmark related sources.
math/tools/ - tools used for designing the algorithms.
@@ -25,9 +38,16 @@ networking/ - networking subproject sources.
networking/include/ - networking library public headers.
networking/test/ - networking test and benchmark related sources.
string/ - string routines subproject sources.
+ All string routines should meet the quality
+ requirements stated in string/README.contributors;
+ routines that fail to do so are located in an
+ experimental/ directory.
+string/<arch> - <arch>-specific string routine sources for
+ <arch> = aarch64 or arm.
+string/aarch64/experimental - Experimental string routines which
+ may not be fully optimized yet.
string/include/ - string library public headers.
string/test/ - string test and benchmark related sources.
-pl/... - separately maintained performance library code.
The steps to build the target libraries and run the tests:
@@ -50,6 +70,13 @@ Or building and testing the math subproject only:
make all-math
make check-math
+Note on compiler compatibility requirements:
+
+SVE routines are always built by default - this means that on AArch64
+GCC >= 10 or LLVM >= 5 is required for SVE ACLE support. There is no
+explicit check for a compatible compiler, so the SVE routines will
+fail to build if CC is too old.
+
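For reference, the dependency described above boils down to the SVE ACLE
types and intrinsics from <arm_sve.h>. The following is a minimal sketch,
not part of this patch, that only compiles on an SVE-capable toolchain
(assuming -march=armv8-a+sve):

/* Standalone SVE ACLE probe: a predicated horizontal sum.  */
#include <arm_sve.h>

double
sve_sum (const double *x, int n)
{
  svfloat64_t acc = svdup_n_f64 (0.0);
  for (int i = 0; i < n; i += svcntd ())
    {
      svbool_t pg = svwhilelt_b64_s32 (i, n);
      acc = svadd_f64_m (pg, acc, svld1_f64 (pg, x + i));
    }
  return svaddv_f64 (svptrue_b64 (), acc);
}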
The test system requires libmpfr and libmpc.
For example on debian linux they can be installed as:
diff --git a/config.mk.dist b/config.mk.dist
index 03fb54db52fa..ae4574e7cdba 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,14 +1,11 @@
# Example config.mk
#
-# Copyright (c) 2018-2023, Arm Limited.
+# Copyright (c) 2018-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
# Subprojects to build
SUBS = math string networking
-# Subsubprojects to build if subproject pl is built
-PLSUBS = math
-
# Target architecture: aarch64, arm or x86_64
ARCH = aarch64
@@ -30,6 +27,27 @@ HOST_CFLAGS += -Wall -Wno-unused-function
HOST_CFLAGS += -g
CFLAGS += -g
+ifeq ($(OS),Msys)
+ # llvm is the only available/valid native compiler
+ CC = clang
+ AR = llvm-ar
+ RANLIB = llvm-ranlib
+ HOST_CC = clang
+ SYSROOT = /c/wenv/msys2/msys64/clangarm64
+ # Common windows flags
+ COMMON_WIN_CFLAGS = -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE
+ COMMON_WIN_CFLAGS += -Wno-deprecated-declarations -Wno-unused-variable
+ # For mathtest
+ HOST_CFLAGS += -I$(SYSROOT)/include
+ HOST_CFLAGS += $(COMMON_WIN_CFLAGS) -Wno-ignored-attributes
+ # Clear the default flag -fPIC, as not supported on Windows
+ CFLAGS_SHARED =
+ # For ulp.h with MPFR
+ CFLAGS += -I$(SYSROOT)/include
+ # For clang on Windows
+ CFLAGS += $(COMMON_WIN_CFLAGS)
+endif
+
# Optimize the shared libraries on aarch64 assuming they fit in 1M.
#CFLAGS_SHARED = -fPIC -mcmodel=tiny
@@ -45,12 +63,33 @@ math-cflags =
math-ldlibs =
math-ulpflags =
math-testflags =
-string-cflags =
+string-cflags = -falign-functions=64
networking-cflags =
-# Use if mpfr is available on the target for ulp error checking.
-#math-ldlibs += -lmpfr -lgmp
-#math-cflags += -DUSE_MPFR
+ifeq ($(OS),Msys)
+ # Libraries can be installed with pacman
+ libm-libs = -lmsvcrt -lvcruntime -lucrt
+ libc-libs =
+ # The linker will look for .lib, but some systems only have .dll.a,
+ # so we have to give the absolute path to the libraries.
+ # This is system dependent and might need adjusting.
+ mpfr-libs = $(SYSROOT)/lib/libmpfr.dll.a
+ gmp-libs = $(SYSROOT)/lib/libgmp.dll.a
+ mpc-libs = $(SYSROOT)/lib/libmpc.dll.a
+endif
+
+# Use if mpfr is available on the target for ulp error checking. If
+# enabling this, it is advised to disable fenv checks by uncommenting
+# the two lines at the bottom of this block.
+USE_MPFR=0
+math-cflags += -DUSE_MPFR=$(USE_MPFR)
+ifeq ($(USE_MPFR), 1)
+ math-ldlibs += $(mpfr-libs) $(gmp-libs)
+ math-ulpflags += -m
+endif
+# Disable fenv checks
+#math-ulpflags = -q -f
+#math-testflags = -nostatus
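When USE_MPFR is enabled, the ulp tool checks each routine against a
high-precision MPFR reference. A minimal sketch of that idea, independent
of the real harness in math/test/ulp.c (the function name here is
illustrative):

/* High-precision reference value for sin computed with MPFR.  */
#include <mpfr.h>

static double
ref_sin (double x)
{
  mpfr_t t;
  mpfr_init2 (t, 1024); /* working precision, in bits */
  mpfr_set_d (t, x, MPFR_RNDN);
  mpfr_sin (t, t, MPFR_RNDN);
  double y = mpfr_get_d (t, MPFR_RNDN);
  mpfr_clear (t);
  return y;
}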
# Use with gcc.
math-cflags += -frounding-math -fexcess-precision=standard -fno-stack-protector
@@ -59,30 +98,36 @@ math-cflags += -ffp-contract=fast -fno-math-errno
# Use with clang.
#math-cflags += -ffp-contract=fast
-# Disable/enable SVE vector math code and tests.
-# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
-# routines only so that SVE code does not leak into scalar
-# routines. It is also necessary to add it for tools (e.g. ulp,
-# mathbench)
-WANT_SVE_MATH = 0
-ifeq ($(WANT_SVE_MATH), 1)
- math-sve-cflags = -march=armv8-a+sve
-endif
-math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
-
# If defined to 1, set errno in math functions according to ISO C. Many math
# libraries do not set errno, so this is 0 by default. It may need to be
# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0.
WANT_ERRNO = 0
math-cflags += -DWANT_ERRNO=$(WANT_ERRNO)
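As a sketch of what WANT_ERRNO gates, in the style of the scalar error
helpers used elsewhere in math/ (the names below are stand-ins, not the
project's actual helpers):

/* Overflow path with the ISO C errno side effect made optional.  */
#include <errno.h>

static double
with_errno (double y, int e)
{
#if WANT_ERRNO
  errno = e;
#endif
  return y;
}

static double
oflow (int sign)
{
  /* 0x1p769 * 0x1p769 overflows double and raises FE_OVERFLOW.  */
  double y = (sign ? -0x1p769 : 0x1p769) * 0x1p769;
  return with_errno (y, ERANGE);
}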
+# Disable/enable SVE vector math tests/tools.
+ifeq ($(ARCH),aarch64)
+ WANT_SVE_TESTS = 1
+else
+ WANT_SVE_TESTS = 0
+endif
+math-cflags += -DWANT_SVE_TESTS=$(WANT_SVE_TESTS)
+
# If set to 1, set fenv in vector math routines.
WANT_SIMD_EXCEPT = 0
math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
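WANT_SIMD_EXCEPT selects the special-case pattern visible throughout the
vector diffs below: special lanes are detected up front, masked on the
fast path, and recomputed with the scalar routine. A simplified sketch of
the lane fix-up step (the repository's real helper is v_call_f32 in
v_math.h; this standalone version only illustrates the idea):

/* Recompute only the lanes flagged in p with the scalar routine.  */
#include <arm_neon.h>

static float32x4_t
call_scalar_f32 (float (*f) (float), float32x4_t x, float32x4_t y,
                 uint32x4_t p)
{
  float xs[4], ys[4];
  uint32_t ps[4];
  vst1q_f32 (xs, x);
  vst1q_f32 (ys, y);
  vst1q_u32 (ps, p);
  for (int i = 0; i < 4; i++)
    if (ps[i])
      ys[i] = f (xs[i]);
  return vld1q_f32 (ys);
}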
-# Disable fenv checks
-#math-ulpflags = -q -f
-#math-testflags = -nostatus
+# If set to 1, enable tests for exp10.
+WANT_EXP10_TESTS = 1
+math-cflags += -DWANT_EXP10_TESTS=$(WANT_EXP10_TESTS)
+
+# If set to 1, enable tests for sinpi and cospi. These functions are
+# only supported on aarch64.
+ifeq ($(ARCH),aarch64)
+ WANT_TRIGPI_TESTS = 1
+else
+ WANT_TRIGPI_TESTS = 0
+endif
+math-cflags += -DWANT_TRIGPI_TESTS=$(WANT_TRIGPI_TESTS)
# Remove GNU Property Notes from asm files.
#string-cflags += -DWANT_GNU_PROPERTY=0
@@ -92,3 +137,13 @@ math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
# Avoid auto-vectorization of scalar code and unroll loops
networking-cflags += -O2 -fno-tree-vectorize -funroll-loops
+
+# Provide *_finite symbols and some of the glibc hidden symbols
+# so libmathlib can be used with binaries compiled against glibc
+# to interpose math functions with both static and dynamic linking.
+USE_GLIBC_ABI = 1
+math-cflags += -DUSE_GLIBC_ABI=$(USE_GLIBC_ABI)
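USE_GLIBC_ABI adds extra symbol names rather than extra code. A hedged
sketch of the mechanism, with a hypothetical routine standing in for a
real one (the *_finite name follows glibc; the alias attribute is the
usual GNU way to export it):

/* Export a glibc-style __exp_finite alias for a function defined in
   the same translation unit (GNU toolchains, ELF targets).  */
double
my_exp (double x) /* placeholder body, not the real implementation */
{
  return x;
}

#if USE_GLIBC_ABI
extern __typeof (my_exp) __exp_finite __attribute__ ((alias ("my_exp")));
#endif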
+
+# Enable experimental math routines - non-C23 vector math and low-accuracy scalar routines.
+WANT_EXPERIMENTAL_MATH = 0
+math-cflags += -DWANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH)
diff --git a/math/Dir.mk b/math/Dir.mk
index 5e9494a7bd3c..6277241ac4de 100644
--- a/math/Dir.mk
+++ b/math/Dir.mk
@@ -1,23 +1,61 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019-2023, Arm Limited.
+# Copyright (c) 2019-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-S := $(srcdir)/math
-B := build/math
-
-math-lib-srcs := $(wildcard $(S)/*.[cS])
-math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+.SECONDEXPANSION:
+
+ifneq ($(OS),Linux)
+ ifeq ($(WANT_SIMD_EXCEPT),1)
+ $(error WANT_SIMD_EXCEPT is not supported outside Linux)
+ endif
+ ifneq ($(USE_MPFR),1)
+ $(warning WARNING: Double-precision ULP tests will not be usable without MPFR)
+ endif
+ ifeq ($(USE_GLIBC_ABI),1)
+ $(error Can only generate special GLIBC symbols on Linux - please disable USE_GLIBC_ABI)
+ endif
+endif
+
+ifneq ($(ARCH),aarch64)
+ ifeq ($(WANT_TRIGPI_TESTS),1)
+ $(error trigpi functions only supported on aarch64)
+ endif
+ ifeq ($(WANT_EXPERIMENTAL_MATH),1)
+ $(error Experimental math only supported on aarch64)
+ endif
+endif
+
+math-src-dir := $(srcdir)/math
+math-build-dir := build/math
+
+math-lib-srcs := $(wildcard $(math-src-dir)/*.[cS])
+math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*.[cS])
+ifeq ($(OS),Linux)
+# Vector symbols only supported on Linux
+math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*/*.[cS])
+endif
+
+ifeq ($(WANT_EXPERIMENTAL_MATH), 1)
+ifeq ($(OS),Linux)
+# Vector symbols only supported on Linux
+math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*/*.[cS])
+else
+math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*.[cS])
+endif
+else
+# Scalar experimental symbols will have been added by wildcard, so remove them
+math-lib-srcs := $(filter-out $(math-src-dir)/aarch64/experimental/%, $(math-lib-srcs))
+endif
math-test-srcs := \
- $(S)/test/mathtest.c \
- $(S)/test/mathbench.c \
- $(S)/test/ulp.c \
+ $(math-src-dir)/test/mathtest.c \
+ $(math-src-dir)/test/mathbench.c \
+ $(math-src-dir)/test/ulp.c \
-math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
+math-test-host-srcs := $(wildcard $(math-src-dir)/test/rtest/*.[cS])
-math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
-math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
+math-includes := $(patsubst $(math-src-dir)/%,build/%,$(wildcard $(math-src-dir)/include/*.h))
math-libs := \
build/lib/libmathlib.so \
@@ -33,9 +71,9 @@ math-tools := \
math-host-tools := \
build/bin/rtest \
-math-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
-math-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-srcs)))
-math-host-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
+math-lib-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-lib-srcs)))
+math-test-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-srcs)))
+math-host-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-host-srcs)))
math-target-objs := $(math-lib-objs) $(math-test-objs)
math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)
@@ -44,18 +82,69 @@ math-files := \
$(math-libs) \
$(math-tools) \
$(math-host-tools) \
- $(math-includes) \
- $(math-test-includes) \
+ $(math-includes)
-all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
+all-math: $(math-libs) $(math-tools) $(math-includes)
-$(math-objs): $(math-includes) $(math-test-includes)
+$(math-objs): $(math-includes)
$(math-objs): CFLAGS_ALL += $(math-cflags)
-$(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
+$(math-build-dir)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
$(math-host-objs): CC = $(HOST_CC)
$(math-host-objs): CFLAGS_ALL = $(HOST_CFLAGS)
-$(B)/test/ulp.o: $(S)/test/ulp.h
+# Add include path for experimental routines so they can share helpers with non-experimental routines.
+$(math-build-dir)/aarch64/experimental/advsimd/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/advsimd
+$(math-build-dir)/aarch64/experimental/sve/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/sve
+
+$(math-objs): CFLAGS_ALL += -I$(math-src-dir)
+
+ulp-funcs-dir = build/test/ulp-funcs/
+ulp-wrappers-dir = build/test/ulp-wrappers/
+mathbench-funcs-dir = build/test/mathbench-funcs/
+test-sig-dirs = $(ulp-funcs-dir) $(ulp-wrappers-dir) $(mathbench-funcs-dir)
+build/include/test $(test-sig-dirs) $(addsuffix /$(ARCH),$(test-sig-dirs)) $(addsuffix /aarch64/experimental,$(test-sig-dirs)) \
+$(addsuffix /aarch64/experimental/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/experimental/sve,$(test-sig-dirs)) \
+$(addsuffix /aarch64/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/sve,$(test-sig-dirs)):
+ mkdir -p $@
+
+ulp-funcs = $(patsubst $(math-src-dir)/%,$(ulp-funcs-dir)/%,$(basename $(math-lib-srcs)))
+ulp-wrappers = $(patsubst $(math-src-dir)/%,$(ulp-wrappers-dir)/%,$(basename $(math-lib-srcs)))
+mathbench-funcs = $(patsubst $(math-src-dir)/%,$(mathbench-funcs-dir)/%,$(basename $(math-lib-srcs)))
+
+ifeq ($(WANT_SVE_TESTS), 0)
+ # Filter out anything with sve in the path
+ ulp-funcs := $(foreach a,$(ulp-funcs),$(if $(findstring sve,$a),,$a))
+ ulp-wrappers := $(foreach a,$(ulp-wrappers),$(if $(findstring sve,$a),,$a))
+ mathbench-funcs := $(foreach a,$(mathbench-funcs),$(if $(findstring sve,$a),,$a))
+endif
+
+define emit_sig
+$1/aarch64/experimental/sve/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/sve
+$1/aarch64/experimental/advsimd/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/advsimd
+$1/%.i: $(math-src-dir)/%.c | $$$$(@D)
+ $(CC) $$< $(math-cflags) -I$(math-src-dir)/include -I$(math-src-dir) $$(EXTRA_INC) -D$2 -E -o $$@
+$1/%: $1/%.i
+ { grep TEST_SIG $$< || true; } | cut -f 2- -d ' ' > $$@
+endef
+
+$(eval $(call emit_sig,$(ulp-funcs-dir),EMIT_ULP_FUNCS))
+$(eval $(call emit_sig,$(ulp-wrappers-dir),EMIT_ULP_WRAPPERS))
+$(eval $(call emit_sig,$(mathbench-funcs-dir),EMIT_MATHBENCH_FUNCS))
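These rules rely on each routine carrying one-line annotations such as
TEST_SIG and TEST_ULP (visible in the per-file diffs below). How the
macros expand is not shown in this patch, so the following is only a
hypothetical sketch of the mechanism, with ULP_ENTRY as a made-up name:

/* Under the EMIT_* define passed by the recipe, an annotation re-emits
   a one-line marker starting with TEST_SIG; the grep/cut step above
   then keeps only the payload after the marker word.  In a normal
   build the annotation expands to nothing.  The real definitions are
   in test_sig.h / test_defs.h and may differ.  */
#ifdef EMIT_ULP_FUNCS
# define TEST_ULP(f, l) TEST_SIG ULP_ENTRY (f, l)
#else
# define TEST_ULP(f, l)
#endif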
+
+ulp-funcs-gen = build/include/test/ulp_funcs_gen.h
+ulp-wrappers-gen = build/include/test/ulp_wrappers_gen.h
+mathbench-funcs-gen = build/include/test/mathbench_funcs_gen.h
+math-tools-autogen-headers = $(ulp-funcs-gen) $(ulp-wrappers-gen) $(mathbench-funcs-gen)
+
+$(ulp-funcs-gen): $(ulp-funcs) | $$(@D)
+$(ulp-wrappers-gen): $(ulp-wrappers) | $$(@D)
+$(mathbench-funcs-gen): $(mathbench-funcs) | $$(@D)
+
+$(math-tools-autogen-headers): | $$(@D)
+ cat $^ | sort -u > $@
+
+$(math-build-dir)/test/mathbench.o: $(mathbench-funcs-gen)
+$(math-build-dir)/test/ulp.o: $(math-src-dir)/test/ulp.h $(ulp-funcs-gen) $(ulp-wrappers-gen)
build/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
@@ -65,38 +154,40 @@ build/lib/libmathlib.a: $(math-lib-objs)
$(AR) rc $@ $^
$(RANLIB) $@
-$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
-$(math-tools): LDLIBS += $(math-ldlibs) -lm
-# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
-$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
+$(math-host-tools): HOST_LDLIBS += $(libm-libs) $(mpfr-libs) $(mpc-libs)
+$(math-tools): LDLIBS += $(math-ldlibs) $(libm-libs)
+
+ifneq ($(OS),Darwin)
+ $(math-tools): LDFLAGS += -static
+endif
build/bin/rtest: $(math-host-objs)
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
-build/bin/mathtest: $(B)/test/mathtest.o build/lib/libmathlib.a
- $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+build/bin/mathtest: $(math-build-dir)/test/mathtest.o build/lib/libmathlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs)
-build/bin/mathbench: $(B)/test/mathbench.o build/lib/libmathlib.a
- $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+build/bin/mathbench: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs)
# This is not ideal, but allows custom symbols in mathbench to get resolved.
-build/bin/mathbench_libc: $(B)/test/mathbench.o build/lib/libmathlib.a
- $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/lib/libmathlib.a -lm
-
-build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a
- $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
+build/bin/mathbench_libc: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $< $(libm-libs) $(libc-libs) build/lib/libmathlib.a $(libm-libs)
-build/include/%.h: $(S)/include/%.h
- cp $< $@
+build/bin/ulp: $(math-build-dir)/test/ulp.o build/lib/libmathlib.a
+ $(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(LDLIBS)
-build/include/test/%.h: $(S)/test/%.h
+build/include/%.h: $(math-src-dir)/include/%.h
cp $< $@
-build/bin/%.sh: $(S)/test/%.sh
+build/bin/%.sh: $(math-src-dir)/test/%.sh
cp $< $@
-math-tests := $(wildcard $(S)/test/testcases/directed/*.tst)
-math-rtests := $(wildcard $(S)/test/testcases/random/*.tst)
+math-tests := $(wildcard $(math-src-dir)/test/testcases/directed/*.tst)
+ifneq ($(WANT_EXP10_TESTS),1)
+math-tests := $(filter-out %exp10.tst, $(math-tests))
+endif
+math-rtests := $(wildcard $(math-src-dir)/test/testcases/random/*.tst)
check-math-test: $(math-tools)
cat $(math-tests) | $(EMULATOR) build/bin/mathtest $(math-testflags)
@@ -104,8 +195,88 @@ check-math-test: $(math-tools)
check-math-rtest: $(math-host-tools) $(math-tools)
cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)
+ulp-input-dir = $(math-build-dir)/test/inputs
+$(ulp-input-dir) $(ulp-input-dir)/$(ARCH) $(ulp-input-dir)/aarch64/sve $(ulp-input-dir)/aarch64/advsimd \
+$(ulp-input-dir)/aarch64/experimental $(ulp-input-dir)/aarch64/experimental/advsimd $(ulp-input-dir)/aarch64/experimental/sve:
+ mkdir -p $@
+
+math-lib-lims = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp,$(math-lib-srcs))
+math-lib-lims-nn = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp_nn,$(math-lib-srcs))
+math-lib-fenvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.fenv,$(math-lib-srcs))
+math-lib-itvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.itv,$(math-lib-srcs))
+math-lib-cvals = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.cval,$(math-lib-srcs))
+
+ulp-inputs = $(math-lib-lims) $(math-lib-lims-nn) $(math-lib-fenvs) $(math-lib-itvs) $(math-lib-cvals)
+$(ulp-inputs): CFLAGS = -I$(math-src-dir)/test -I$(math-src-dir)/include -I$(math-src-dir) $(math-cflags)\
+ -I$(math-src-dir)/aarch64/advsimd -I$(math-src-dir)/aarch64/sve
+
+$(ulp-input-dir)/%.ulp.i: $(math-src-dir)/%.c | $$(@D)
+ $(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.ulp: $(ulp-input-dir)/%.ulp.i
+ { grep "TEST_ULP " $< || true; } > $@
+
+$(ulp-input-dir)/%.ulp_nn.i: $(math-src-dir)/%.c | $$(@D)
+ $(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.ulp_nn: $(ulp-input-dir)/%.ulp_nn.i
+ { grep "TEST_ULP_NONNEAREST " $< || true; } > $@
+
+$(ulp-input-dir)/%.fenv.i: $(math-src-dir)/%.c | $$(@D)
+ $(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.fenv: $(ulp-input-dir)/%.fenv.i
+ { grep "TEST_DISABLE_FENV " $< || true; } > $@
+
+$(ulp-input-dir)/%.itv.i: $(math-src-dir)/%.c | $$(@D)
+ $(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.itv: $(ulp-input-dir)/%.itv.i
+ { grep "TEST_INTERVAL " $< || true; } | sed "s/ TEST_INTERVAL/\nTEST_INTERVAL/g" > $@
+
+$(ulp-input-dir)/%.cval.i: $(math-src-dir)/%.c | $$(@D)
+ $(CC) $(CFLAGS) $< -E -o $@
+
+$(ulp-input-dir)/%.cval: $(ulp-input-dir)/%.cval.i
+ { grep "TEST_CONTROL_VALUE " $< || true; } > $@
+
+ulp-lims = $(ulp-input-dir)/limits
+$(ulp-lims): $(math-lib-lims)
+
+ulp-lims-nn = $(ulp-input-dir)/limits_nn
+$(ulp-lims-nn): $(math-lib-lims-nn)
+
+fenv-exps := $(ulp-input-dir)/fenv
+$(fenv-exps): $(math-lib-fenvs)
+
+generic-itvs = $(ulp-input-dir)/itvs
+$(generic-itvs): $(filter-out $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs))
+
+arch-itvs = $(ulp-input-dir)/$(ARCH)/itvs
+$(arch-itvs): $(filter $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs))
+
+ulp-cvals := $(ulp-input-dir)/cvals
+$(ulp-cvals): $(math-lib-cvals)
+
+# Remove the first word, which will be the TEST_* directive
+$(ulp-lims) $(ulp-lims-nn) $(fenv-exps) $(arch-itvs) $(generic-itvs) $(ulp-cvals): | $$(@D)
+ sed "s/TEST_[^ ]* //g" $^ | sort -u > $@
+
+check-math-ulp: $(ulp-lims) $(ulp-lims-nn)
+check-math-ulp: $(fenv-exps) $(ulp-cvals)
+check-math-ulp: $(generic-itvs) $(arch-itvs)
check-math-ulp: $(math-tools)
- ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
+ ULPFLAGS="$(math-ulpflags)" \
+ LIMITS=../../$(ulp-lims) \
+ ARCH_ITVS=../../$(arch-itvs) \
+ GEN_ITVS=../../$(generic-itvs) \
+ DISABLE_FENV=../../$(fenv-exps) \
+ CVALS=../../$(ulp-cvals) \
+ FUNC=$(func) \
+ WANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH) \
+ WANT_SVE_TESTS=$(WANT_SVE_TESTS) \
+ USE_MPFR=$(USE_MPFR) \
+ build/bin/runulp.sh $(EMULATOR)
check-math: check-math-test check-math-rtest check-math-ulp
diff --git a/math/README.contributors b/math/README.contributors
index 33e7ba376e41..58a04fa4759d 100644
--- a/math/README.contributors
+++ b/math/README.contributors
@@ -1,8 +1,9 @@
STYLE REQUIREMENTS
==================
-1. Most code in this sub-directory is expected to be upstreamed into glibc so
- the GNU Coding Standard and glibc specific conventions should be followed
+1. With the exception of math/aarch64/experimental/, most code in this
+ sub-directory is expected to be upstreamed into glibc so the GNU
+ Coding Standard and glibc specific conventions should be followed
to ease upstreaming.
2. ABI and symbols: the code should be written so it is suitable for inclusion
diff --git a/pl/math/v_acos_2u.c b/math/aarch64/advsimd/acos.c
index 581f8506c0d6..7873a07e6f56 100644
--- a/pl/math/v_acos_2u.c
+++ b/math/aarch64/advsimd/acos.c
@@ -1,14 +1,14 @@
/*
* Double-precision vector acos(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -30,8 +30,8 @@ static const struct data
};
#define AllMask v_u64 (0xffffffffffffffff)
-#define Oneu (0x3ff0000000000000)
-#define Small (0x3e50000000000000) /* 2^-53. */
+#define Oneu 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-53. */
#if WANT_SIMD_EXCEPT
static float64x2_t VPCS_ATTR NOINLINE
@@ -111,12 +111,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
return vfmaq_f64 (add, mul, y);
}
-PL_SIG (V, D, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_D1 (acos), 1.02)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
+TEST_SIG (V, D, 1, acos, -1.0, 1.0)
+TEST_ULP (V_NAME_D1 (acos), 1.02)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
+TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
diff --git a/pl/math/v_acosf_1u4.c b/math/aarch64/advsimd/acosf.c
index bb17b1df18f3..e200f792c764 100644
--- a/pl/math/v_acosf_1u4.c
+++ b/math/aarch64/advsimd/acosf.c
@@ -1,14 +1,14 @@
/*
* Single-precision vector acos(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -57,8 +57,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The largest observed error in this region is 1.32 ulps,
_ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
- want 0x1.feb32ep-1. */
-float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
+ want 0x1.feb32ep-1. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -102,12 +102,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
return vfmaq_f32 (add, mul, y);
}
-PL_SIG (V, F, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_F1 (acos), 0.82)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
+HALF_WIDTH_ALIAS_F1 (acos)
+
+TEST_SIG (V, F, 1, acos, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (acos), 0.82)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
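HALF_WIDTH_ALIAS_F1 is new in this revision and its definition is outside
this hunk, so the following is only a plausible sketch of such an alias.
The _ZGVnN4v_acosf name appears in the error report above; _ZGVnN2v_acosf
is the corresponding half-width name under the AArch64 vector function
ABI (an assumption here):

/* Hypothetical expansion of HALF_WIDTH_ALIAS_F1 (acos): serve the
   64-bit-vector entry point by widening into the 128-bit kernel.
   The real macro lives in v_math.h and may differ.  */
#include <arm_neon.h>

float32x4_t _ZGVnN4v_acosf (float32x4_t);

float32x2_t
_ZGVnN2v_acosf (float32x2_t x)
{
  return vget_low_f32 (_ZGVnN4v_acosf (vcombine_f32 (x, x)));
}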
diff --git a/pl/math/v_acosh_3u5.c b/math/aarch64/advsimd/acosh.c
index 42fa2616d562..55d8ed5a421e 100644
--- a/pl/math/v_acosh_3u5.c
+++ b/math/aarch64/advsimd/acosh.c
@@ -1,12 +1,12 @@
/*
- * Single-precision vector acosh(x) function.
- * Copyright (c) 2023, Arm Limited.
+ * Double-precision vector acosh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define WANT_V_LOG1P_K0_SHORTCUT 1
#include "v_log1p_inline.h"
@@ -45,9 +45,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
#endif
- float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
- float64x2_t y;
- y = vaddq_f64 (x, v_f64 (1));
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
+ float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
y = vmulq_f64 (y, xm1);
y = vsqrtq_f64 (y);
y = vaddq_f64 (xm1, y);
@@ -57,10 +56,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
return log1p_inline (y, &d->log1p_consts);
}
-PL_SIG (V, D, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (acosh), 2.53)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
-PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
-PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
-PL_TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
+TEST_SIG (V, D, 1, acosh, 1.0, 10.0)
+TEST_ULP (V_NAME_D1 (acosh), 2.53)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
+TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
+TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
+TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
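
The xm1-based computation in this hunk follows from a standard
identity, restated here for context:

    \[ \operatorname{acosh}(x) = \log\bigl(x + \sqrt{x^2 - 1}\bigr)
         = \operatorname{log1p}\bigl((x - 1) + \sqrt{(x - 1)(x + 1)}\bigr), \]

so with xm1 = x - 1 the routine evaluates
log1p_inline (xm1 + sqrt (xm1 * (x + 1))), which keeps precision near
x = 1 where the direct logarithm would cancel.
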
diff --git a/pl/math/v_acoshf_3u1.c b/math/aarch64/advsimd/acoshf.c
index a2ff0f02635b..029d457cfa8a 100644
--- a/pl/math/v_acoshf_3u1.c
+++ b/math/aarch64/advsimd/acoshf.c
@@ -1,49 +1,46 @@
/*
* Single-precision vector acosh(x) function.
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#include "v_log1pf_inline.h"
+#define SquareLim 0x1p64
+
const static struct data
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
- uint16x4_t thresh;
-} data = {
- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
- .one = V4 (0x3f800000),
- .thresh = V4 (0x2000) /* asuint(0x1p64) - asuint(1). */
-};
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
-#define SignMask 0x80000000
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
- const struct v_log1pf_data d)
+ const struct v_log1pf_data *d)
{
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
/* Vector approximation for single-precision acosh, based on log1p. Maximum
error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
- is 2.78 ULP:
- __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
- want 0x1.ef9ea2p-3.
+ is 3.00 ULP:
+ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+ want 0x1.ef0a7cp-4.
With exceptions disabled, we can compute u with a shorter dependency chain,
- which gives maximum error of 3.07 ULP:
- __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
- want 0x1.fbc7f4p-4. */
+ which gives maximum error of 3.22 ULP:
+ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+ want 0x1.fdcdd2p-5. */
-VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
#if WANT_SIMD_EXCEPT
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@@ -54,25 +51,28 @@ VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
#else
- float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
- float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+ float32x4_t u
+ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
#endif
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
if (unlikely (v_any_u16h (special)))
- return special_case (x, y, special, d->log1pf_consts);
- return log1pf_inline (y, d->log1pf_consts);
+ return special_case (x, y, special, &d->log1pf_consts);
+ return log1pf_inline (y, &d->log1pf_consts);
}
-PL_SIG (V, F, 1, acosh, 1.0, 10.0)
+HALF_WIDTH_ALIAS_F1 (acosh)
+
+TEST_SIG (V, F, 1, acosh, 1.0, 10.0)
#if WANT_SIMD_EXCEPT
-PL_TEST_ULP (V_NAME_F1 (acosh), 2.29)
+TEST_ULP (V_NAME_F1 (acosh), 2.50)
#else
-PL_TEST_ULP (V_NAME_F1 (acosh), 2.58)
+TEST_ULP (V_NAME_F1 (acosh), 2.78)
#endif
-PL_TEST_EXPECT_FENV (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
-PL_TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
-PL_TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
-PL_TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
+TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
+TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
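
The Thresh comparison relies on vsubhn_u32, which narrows each 32-bit
difference to its top 16 bits: a single narrow compare then flags both
x < 1.0 and x >= SquareLim, because inputs below 1.0 wrap around to
large unsigned values. A standalone sketch of the idea (the function
name is illustrative):

    #include <arm_neon.h>

    /* Flag lanes outside [1.0, 0x1p64) using only the top halves of
       the 32-bit subtraction ix - asuint(1.0f).  */
    static uint16x4_t
    acoshf_special_lanes (float32x4_t x)
    {
      uint32x4_t ix = vreinterpretq_u32_f32 (x);
      uint32x4_t one = vdupq_n_u32 (0x3f800000); /* asuint(1.0f).  */
      uint16x4_t top = vsubhn_u32 (ix, one);     /* top16(ix - one).  */
      /* 0x2000 = top16(asuint(0x1p64) - asuint(1.0f)).  */
      return vcge_u16 (top, vdup_n_u16 (0x2000));
    }
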
diff --git a/pl/math/v_asin_3u.c b/math/aarch64/advsimd/asin.c
index 756443c6b320..c751d9264a12 100644
--- a/pl/math/v_asin_3u.c
+++ b/math/aarch64/advsimd/asin.c
@@ -1,36 +1,35 @@
/*
* Double-precision vector asin(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
- float64x2_t poly[12];
+ float64x2_t c0, c2, c4, c6, c8, c10;
float64x2_t pi_over_2;
uint64x2_t abs_mask;
+ double c1, c3, c5, c7, c9, c11;
} data = {
/* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
- .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
- V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
- V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
- V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
- V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
- V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
- .pi_over_2 = V2 (0x1.921fb54442d18p+0),
- .abs_mask = V2 (0x7fffffffffffffff),
+ .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4,
+ .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6,
+ .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6,
+ .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7,
+ .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6,
+ .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6,
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
};
#define AllMask v_u64 (0xffffffffffffffff)
-#define One (0x3ff0000000000000)
-#define Small (0x3e50000000000000) /* 2^-12. */
+#define One 0x3ff0000000000000
+#define Small 0x3e50000000000000 /* 2^-12. */
#if WANT_SIMD_EXCEPT
static float64x2_t VPCS_ATTR NOINLINE
@@ -58,12 +57,11 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
The largest observed error in this region is 2.69 ulps,
- _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
- want 0x1.110d7e85fdd53p-1. */
+ _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
+ want 0x1.1111dd54ddf99p-1. */
float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-
float64x2_t ax = vabsq_f64 (x);
#if WANT_SIMD_EXCEPT
@@ -76,7 +74,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
return special_case (x, x, AllMask);
#endif
- uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5));
+ uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));
/* Evaluate polynomial Q(x) = y + y * z * P(z) with
z = x ^ 2 and y = |x| , if |x| < 0.5
@@ -89,7 +87,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
float64x2_t z4 = vmulq_f64 (z2, z2);
float64x2_t z8 = vmulq_f64 (z4, z4);
float64x2_t z16 = vmulq_f64 (z8, z8);
- float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+ /* order-11 estrin. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
+
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
+
+ float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
+ float64x2_t p = vfmaq_f64 (p07, z16, p811);
/* Finalize polynomial: z + z * z2 * P(z2). */
p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
@@ -102,12 +119,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
return vbslq_f64 (d->abs_mask, y, x);
}
-PL_SIG (V, D, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_D1 (asin), 2.19)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
+TEST_SIG (V, D, 1, asin, -1.0, 1.0)
+TEST_ULP (V_NAME_D1 (asin), 2.20)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
+TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
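
The recurring pattern in these rewrites stores even-indexed
coefficients pre-broadcast as vectors and odd-indexed ones as adjacent
scalar doubles, so one vld1q_f64 feeds two lane-indexed FMAs and the
number of constant loads is halved. A reduced sketch of the layout on
a degree-3 polynomial (coefficient values are placeholders):

    #include <arm_neon.h>

    static const struct
    {
      float64x2_t c0, c2; /* Even coefficients, pre-broadcast.  */
      double c1, c3;      /* Odd coefficients, loaded pairwise.  */
    } poly = {
      .c0 = { 1.0, 1.0 }, .c2 = { 0.25, 0.25 }, .c1 = 0.5, .c3 = 0.125,
    };

    /* Evaluate c0 + c1 x + c2 x^2 + c3 x^3 per lane.  */
    static float64x2_t
    eval_poly3 (float64x2_t x)
    {
      float64x2_t x2 = vmulq_f64 (x, x);
      float64x2_t c13 = vld1q_f64 (&poly.c1); /* { c1, c3 }.  */
      float64x2_t p01 = vfmaq_laneq_f64 (poly.c0, x, c13, 0);
      float64x2_t p23 = vfmaq_laneq_f64 (poly.c2, x, c13, 1);
      return vfmaq_f64 (p01, x2, p23);
    }
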
diff --git a/pl/math/v_asinf_2u5.c b/math/aarch64/advsimd/asinf.c
index eb978cd956ab..970feb37e1d5 100644
--- a/pl/math/v_asinf_2u5.c
+++ b/math/aarch64/advsimd/asinf.c
@@ -1,14 +1,14 @@
/*
* Single-precision vector asin(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The largest observed error in this region is 2.41 ulps,
_ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
-float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -93,12 +93,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
return vbslq_f32 (v_u32 (AbsMask), y, x);
}
-PL_SIG (V, F, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_F1 (asin), 1.91)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
+HALF_WIDTH_ALIAS_F1 (asin)
+
+TEST_SIG (V, F, 1, asin, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (asin), 1.91)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
diff --git a/math/aarch64/advsimd/asinh.c b/math/aarch64/advsimd/asinh.c
new file mode 100644
index 000000000000..550302826bd9
--- /dev/null
+++ b/math/aarch64/advsimd/asinh.c
@@ -0,0 +1,242 @@
+/*
+ * Double-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "test_defs.h"
+#include "test_sig.h"
+#include "v_math.h"
+
+const static struct data
+{
+ uint64x2_t huge_bound, abs_mask, off, mask;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound;
+#endif
+ float64x2_t lc0, lc2;
+ double lc1, lc3, ln2, lc4;
+
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
+ double c1, c3, c5, c7, c9, c11, c13, c15;
+
+} data = {
+
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V2 (0x1p-26),
+#endif
+  /* Coefficients of polynomial P such that asinh(x) is approximated by
+     asinh(x) ~= x + x^3 * (C0 + C1 * x^2 + C2 * x^4 + C3 * x^6 + ...).
+     Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
+
+ .c0 = V2 (-0x1.55555555554a7p-3),
+ .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5),
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6),
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7),
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7),
+ .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8),
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9),
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11),
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14),
+ .c17 = V2 (0x1.93d4ba83d34dap-18),
+
+ .lc0 = V2 (-0x1.ffffffffffff7p-2),
+ .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2),
+ .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+
+ .off = V2 (0x3fe6900900000000),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ .mask = V2 (0xfffULL << 52),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
+ uint64x2_t special)
+{
+ /* Copy sign. */
+ y = vbslq_f64 (abs_mask, y, x);
+ return v_call_f64 (asinh, x, y, special);
+}
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static inline float64x2_t
+log_inline (float64x2_t xm, const struct data *d)
+{
+
+ uint64x2_t u = vreinterpretq_u64_f64 (xm);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
+ y = vfmaq_f64 (p, r2, y);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+/* Double-precision implementation of vector asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+   shared with the scalar routine. The greatest observed error is 2.79 ULP, in
+ |x| >= 1:
+ _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
+ want 0x1.ffffd003219ddp-1. */
+VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t ax = vabsq_f64 (x);
+
+ uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+ uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
+ uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
+ special = vorrq_u64 (special, tiny);
+#else
+ uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
+#endif
+
+ /* Option 1: |x| >= 1.
+     Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)).
+ If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
+ overflow, by setting special lanes to 1. These will be fixed later. */
+ float64x2_t option_1 = v_f64 (0);
+ if (likely (v_any_u64 (gt1)))
+ {
+#if WANT_SIMD_EXCEPT
+ float64x2_t xm = v_zerofy_f64 (ax, special);
+#else
+ float64x2_t xm = ax;
+#endif
+ option_1 = log_inline (
+ vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
+ overflow, and tiny lanes, which will underflow, by setting them to 0. They
+ will be fixed later, either by selecting x or falling back to the scalar
+ special-case. The largest observed error in this region is 1.47 ULPs:
+ _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+ want 0x1.c1d6bf874019cp-1. */
+ float64x2_t option_2 = v_f64 (0);
+
+ if (likely (v_any_u64 (vceqzq_u64 (gt1))))
+ {
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
+#endif
+ float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
+ float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
+
+ float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
+ p = vfmaq_f64 (p1213, z2, p);
+ p = vfmaq_f64 (p1011, z2, p);
+ p = vfmaq_f64 (p89, z2, p);
+
+ p = vfmaq_f64 (p67, z2, p);
+ p = vfmaq_f64 (p45, z2, p);
+
+ p = vfmaq_f64 (p23, z2, p);
+
+ p = vfmaq_f64 (p01, z2, p);
+ option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
+#if WANT_SIMD_EXCEPT
+ option_2 = vbslq_f64 (tiny, x, option_2);
+#endif
+ }
+
+ /* Choose the right option for each lane. */
+ float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
+ if (unlikely (v_any_u64 (special)))
+ {
+ return special_case (x, y, d->abs_mask, special);
+ }
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
+
+TEST_SIG (V, D, 1, asinh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (asinh), 2.29)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000)
+/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
+ Ensures the v_sel is choosing the right option in all cases. */
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5)
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2)
+TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600)
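
The lookup helper above gathers one { invc, logc } row per lane and
de-interleaves the pair with vuzp1q_f64/vuzp2q_f64. A reduced sketch
of that gather pattern (table contents are placeholders):

    #include <arm_neon.h>
    #include <stdint.h>

    struct row { double invc, logc; };
    static const struct row table[2]
        = { { 1.0, 0.0 }, { 0.5, 0x1.62e42fefa39efp-1 } };

    struct entry { float64x2_t invc, logc; };

    /* Load one row per lane, then unzip: vuzp1q keeps the even
       (invc) elements, vuzp2q the odd (logc) elements.  */
    static struct entry
    gather (uint64_t i0, uint64_t i1)
    {
      float64x2_t r0 = vld1q_f64 (&table[i0].invc); /* { invc0, logc0 }.  */
      float64x2_t r1 = vld1q_f64 (&table[i1].invc); /* { invc1, logc1 }.  */
      return (struct entry) { .invc = vuzp1q_f64 (r0, r1),
                              .logc = vuzp2q_f64 (r0, r1) };
    }
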
diff --git a/math/aarch64/advsimd/asinhf.c b/math/aarch64/advsimd/asinhf.c
new file mode 100644
index 000000000000..6a96f6ee9f4b
--- /dev/null
+++ b/math/aarch64/advsimd/asinhf.c
@@ -0,0 +1,89 @@
+/*
+ * Single-precision vector asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ float32x4_t one;
+ uint32x4_t big_bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .one = V4 (1),
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+ uint32x4_t special, const struct data *d)
+{
+ return v_call_f32 (
+ asinhf, x,
+ vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+ special);
+}
+
+/* Single-precision implementation of vector asinh(x), using vector log1p.
+ Worst-case error is 2.59 ULP:
+ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+ want 0x1.d449c4p-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
+ float32x4_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ /* Sidestep tiny and large values to avoid inadvertently triggering
+ under/overflow. */
+ special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
+ if (unlikely (v_any_u32 (special)))
+ {
+ ax = v_zerofy_f32 (ax, special);
+ x = v_zerofy_f32 (x, special);
+ }
+#endif
+
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
+ float32x4_t d
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (special_arg, sign, y, special, dat);
+ return vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
+}
+
+HALF_WIDTH_ALIAS_F1 (asinh)
+
+TEST_SIG (V, F, 1, asinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (asinh), 2.10)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000)
+TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000)
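
The log1p form used above comes from rewriting the defining identity
to avoid cancellation for small x (a restatement, not part of the
patch):

    \[ \operatorname{asinh}(x) = \log\bigl(x + \sqrt{x^2 + 1}\bigr)
       = \operatorname{log1p}\Bigl(x + \frac{x^2}{1 + \sqrt{x^2 + 1}}\Bigr),
       \quad x \ge 0, \]

using \( \sqrt{x^2 + 1} - 1 = x^2 / (1 + \sqrt{x^2 + 1}) \); the sign
is restored afterwards by XOR-ing back the saved sign bits.
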
diff --git a/pl/math/v_atan_2u5.c b/math/aarch64/advsimd/atan.c
index ba68cc3cc720..26d264321068 100644
--- a/pl/math/v_atan_2u5.c
+++ b/math/aarch64/advsimd/atan.c
@@ -1,32 +1,32 @@
/*
* Double-precision vector atan(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
float64x2_t pi_over_2;
- float64x2_t poly[20];
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-1022, 1.0]. */
- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
};
@@ -42,6 +42,11 @@ static const struct data
float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
/* Small cases, infs and nans are supported by our approximation technique,
but do not set fenv flags correctly. Only trigger special case if we need
@@ -80,9 +85,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
float64x2_t x2 = vmulq_f64 (z2, z2);
float64x2_t x4 = vmulq_f64 (x2, x2);
float64x2_t x8 = vmulq_f64 (x4, x4);
- float64x2_t y
- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
- v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t y = vfmaq_f64 (p07, p819, x8);
/* Finalize. y = shift + z + z^3 * P(z^2). */
y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
@@ -93,12 +124,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
return y;
}
-PL_SIG (V, D, 1, atan, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (atan), 1.78)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
-PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
+TEST_SIG (V, D, 1, atan, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (atan), 1.78)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
+TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
+TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
+TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
diff --git a/math/aarch64/advsimd/atan2.c b/math/aarch64/advsimd/atan2.c
new file mode 100644
index 000000000000..18c4b70b92f6
--- /dev/null
+++ b/math/aarch64/advsimd/atan2.c
@@ -0,0 +1,171 @@
+/*
+ * Double-precision vector atan2(y, x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
+ float64x2_t pi_over_2;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+ uint64x2_t zeroinfnan, minustwo;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .c0 = V2 (-0x1.5555555555555p-2),
+ .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3),
+ .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4),
+ .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4),
+ .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5),
+ .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5),
+ .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5),
+ .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6),
+ .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9),
+ .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13),
+ .c19 = 0x1.358851160a528p-16,
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
+ .minustwo = V2 (0xc000000000000000),
+};
+
+#define SignMask v_u64 (0x8000000000000000)
+
+/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
+ uint64x2_t sign_xy, uint64x2_t cmp)
+{
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+ return v_call2_f64 (atan2, y, x, ret, cmp);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline uint64x2_t
+zeroinfnan (uint64x2_t i, const struct data *d)
+{
+ /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
+ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
+}
+
+/* Fast implementation of vector atan2.
+ Maximum observed error is 2.8 ulps:
+ _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
+ got 0x1.92d628ab678ccp-1
+ want 0x1.92d628ab678cfp-1. */
+float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t iy = vreinterpretq_u64_f64 (y);
+
+ uint64x2_t special_cases
+ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
+
+ uint64x2_t sign_x = vandq_u64 (ix, SignMask);
+ uint64x2_t sign_y = vandq_u64 (iy, SignMask);
+ uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t pred_xlt0 = vcltzq_f64 (x);
+ uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
+
+ /* Set up z for call to atan. */
+ float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
+ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
+ float64x2_t z = vdivq_f64 (n, q);
+
+ /* Work out the correct shift. */
+ float64x2_t shift
+ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
+ shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
+ shift = vmulq_f64 (shift, d->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t ret = vfmaq_f64 (p07, p819, x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
+ ret = vaddq_f64 (ret, shift);
+
+ if (unlikely (v_any_u64 (special_cases)))
+ return special_case (y, x, ret, sign_xy, special_cases);
+
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
+
+ return ret;
+}
+
+/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
+TEST_SIG (V, D, 2, atan2)
+// TODO tighten this once __v_atan2 is fixed
+TEST_ULP (V_NAME_D2 (atan2), 2.9)
+TEST_DISABLE_FENV (V_NAME_D2 (atan2))
+TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000)
+TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000)
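
The zeroinfnan helper classifies 0, +-inf and NaN in one unsigned
compare: doubling the bit pattern discards the sign, and the unsigned
wrap from subtracting 1 maps both zeros to the largest value. A scalar
sketch of the same trick (helper name is illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Return nonzero iff x is +-0, +-inf or NaN.  2*ix drops the sign
       bit (mod 2^64); the -1 wraps ix == 0 and ix == 0x8000...0 to
       UINT64_MAX, so all special encodings compare >= the doubled
       infinity pattern minus one.  */
    static int
    zeroinfnan_scalar (double x)
    {
      uint64_t ix;
      memcpy (&ix, &x, sizeof ix);
      return 2 * ix - 1 >= 2 * 0x7ff0000000000000ULL - 1;
    }
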
diff --git a/pl/math/v_atan2f_3u.c b/math/aarch64/advsimd/atan2f.c
index bbfc3cb552f6..632014249ab0 100644
--- a/pl/math/v_atan2f_3u.c
+++ b/math/aarch64/advsimd/atan2f.c
@@ -1,59 +1,64 @@
/*
* Single-precision vector atan2(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
- float32x4_t poly[8];
- float32x4_t pi_over_2;
+ float32x4_t c0, pi_over_2, c4, c6, c2;
+ float c1, c3, c5, c7;
+ uint32x4_t comp_const;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-128, 1.0].
Generated using fpminimax between FLT_MIN and 1. */
- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
- .pi_over_2 = V4 (0x1.921fb6p+0f),
+ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
+ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
+ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
+ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
+ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
};
#define SignMask v_u32 (0x80000000)
/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
+ uint32x4_t sign_xy, uint32x4_t cmp)
{
+ /* Account for the sign of y. */
+ ret = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
return v_call2_f32 (atan2f, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline uint32x4_t
-zeroinfnan (uint32x4_t i)
+zeroinfnan (uint32x4_t i, const struct data *d)
{
/* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
- v_u32 (2 * 0x7f800000lu - 1));
+ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
}
/* Fast implementation of vector atan2f. Maximum observed error is
2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
_ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
want 0x1.967f00p-1. */
-float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
{
- const struct data *data_ptr = ptr_barrier (&data);
+ const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t iy = vreinterpretq_u32_f32 (y);
- uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
+ uint32x4_t special_cases
+ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
uint32x4_t sign_x = vandq_u32 (ix, SignMask);
uint32x4_t sign_y = vandq_u32 (iy, SignMask);
@@ -67,14 +72,14 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
/* Set up z for call to atanf. */
float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
- float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
- float32x4_t z = vdivq_f32 (n, d);
+ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
+ float32x4_t z = vdivq_f32 (n, q);
/* Work out the correct shift. */
float32x4_t shift = vreinterpretq_f32_u32 (
vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
- shift = vmulq_f32 (shift, data_ptr->pi_over_2);
+ shift = vmulq_f32 (shift, d->pi_over_2);
/* Calculate the polynomial approximation.
Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
@@ -86,30 +91,37 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
float32x4_t z2 = vmulq_f32 (z, z);
float32x4_t z4 = vmulq_f32 (z2, z2);
- float32x4_t ret = vfmaq_f32 (
- v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
+ float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
+ float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
+
+ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
/* y = shift + z * P(z^2). */
ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
- /* Account for the sign of y. */
- ret = vreinterpretq_f32_u32 (
- veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
-
if (unlikely (v_any_u32 (special_cases)))
{
- return special_case (y, x, ret, special_cases);
+ return special_case (y, x, ret, sign_xy, special_cases);
}
- return ret;
+ /* Account for the sign of y. */
+ return vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
}
+HALF_WIDTH_ALIAS_F2 (atan2)
+
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
-PL_SIG (V, F, 2, atan2)
-PL_TEST_ULP (V_NAME_F2 (atan2), 2.46)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
+TEST_SIG (V, F, 2, atan2)
+TEST_DISABLE_FENV (V_NAME_F2 (atan2))
+TEST_ULP (V_NAME_F2 (atan2), 2.46)
+TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
+TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
diff --git a/pl/math/v_atanf_3u.c b/math/aarch64/advsimd/atanf.c
index f522d957c1cc..61927c9b261a 100644
--- a/pl/math/v_atanf_3u.c
+++ b/math/aarch64/advsimd/atanf.c
@@ -1,14 +1,14 @@
/*
* Single-precision vector atan(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
static const struct data
{
@@ -43,7 +43,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
_ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
-float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -98,10 +98,12 @@ float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
return y;
}
-PL_SIG (V, F, 1, atan, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (atan), 2.5)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
+HALF_WIDTH_ALIAS_F1 (atan)
+
+TEST_SIG (V, F, 1, atan, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (atan), 2.5)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
diff --git a/pl/math/v_atanh_3u5.c b/math/aarch64/advsimd/atanh.c
index f282826a3f32..c2f9585dd29b 100644
--- a/pl/math/v_atanh_3u5.c
+++ b/math/aarch64/advsimd/atanh.c
@@ -1,13 +1,13 @@
/*
* Double-precision vector atanh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define WANT_V_LOG1P_K0_SHORTCUT 0
#include "v_log1p_inline.h"
@@ -15,15 +15,19 @@
const static struct data
{
struct v_log1p_data log1p_consts;
- uint64x2_t one, half;
+ uint64x2_t one;
+ uint64x2_t sign_mask;
} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
.one = V2 (0x3ff0000000000000),
- .half = V2 (0x3fe0000000000000) };
+ .sign_mask = V2 (0x8000000000000000) };
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
+ uint64x2_t special, const struct data *d)
{
- return v_call_f64 (atanh, x, y, special);
+ y = log1p_inline (y, &d->log1p_consts);
+ return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
+ vmulq_f64 (halfsign, y), special);
}
/* Approximation for vector double-precision atanh(x) using modified log1p.
@@ -35,11 +39,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
+ float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
float64x2_t ax = vabsq_f64 (x);
uint64x2_t ia = vreinterpretq_u64_f64 (ax);
- uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
uint64x2_t special = vcgeq_u64 (ia, d->one);
- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
#if WANT_SIMD_EXCEPT
ax = v_zerofy_f64 (ax, special);
@@ -47,20 +50,26 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
float64x2_t y;
y = vaddq_f64 (ax, ax);
- y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
- y = log1p_inline (y, &d->log1p_consts);
+ y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
if (unlikely (v_any_u64 (special)))
- return special_case (x, vmulq_f64 (y, halfsign), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special, d);
+#else
+ return special_case (ax, halfsign, y, special, d);
+#endif
+
+ y = log1p_inline (y, &d->log1p_consts);
return vmulq_f64 (y, halfsign);
}
-PL_SIG (V, D, 1, atanh, -1.0, 1.0)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
-PL_TEST_ULP (V_NAME_D1 (atanh), 3.32)
+TEST_SIG (V, D, 1, atanh, -1.0, 1.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
+TEST_ULP (V_NAME_D1 (atanh), 3.32)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000)
+TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100)
/* atanh is asymptotic at 1, which is the default control value - have to set
-c 0 specially to ensure fp exceptions are triggered correctly (choice of
control lane is irrelevant if fp exceptions are disabled). */
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 1, inf, 100, 0)
+TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0)
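
The halfsign value folds the sign of x and the leading 1/2 of the
identity into a single multiplier (standard identity, restated):

    \[ \operatorname{atanh}(x)
       = \operatorname{sign}(x) \cdot \tfrac{1}{2}
         \log\frac{1 + |x|}{1 - |x|}
       = \operatorname{sign}(x) \cdot \tfrac{1}{2}
         \operatorname{log1p}\Bigl(\frac{2|x|}{1 - |x|}\Bigr), \]

so the routine forms y = 2|x| / (1 - |x|), applies log1p, and scales
the result by halfsign = +-0.5.
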
diff --git a/pl/math/v_atanhf_3u1.c b/math/aarch64/advsimd/atanhf.c
index f6a5f25eca9a..313d15ca6391 100644
--- a/pl/math/v_atanhf_3u1.c
+++ b/math/aarch64/advsimd/atanhf.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector atanh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#include "v_log1pf_inline.h"
const static struct data
@@ -30,16 +30,18 @@ const static struct data
#define Half v_u32 (0x3f000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+ uint32x4_t special)
{
- return v_call_f32 (atanhf, x, y, special);
+ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+ vmulq_f32 (halfsign, y), special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 3.08 ULP:
- __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
- want 0x1.ffcb82p-5. */
-VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
+ The maximum error is 2.93 ULP:
+ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+ want 0x1.f4dcf8p-5. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -58,20 +60,31 @@ VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
uint32x4_t special = vcgeq_u32 (iax, d->one);
#endif
- float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
- y = log1pf_inline (y, d->log1pf_consts);
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+ y = log1pf_inline (y, &d->log1pf_consts);
+ /* If exceptions not required, pass ax to special-case for shorter dependency
+ chain. If exceptions are required ax will have been zerofied, so have to
+ pass x. */
if (unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (halfsign, y), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special);
+#else
+ return special_case (ax, halfsign, y, special);
+#endif
return vmulq_f32 (halfsign, y);
}
-PL_SIG (V, F, 1, atanh, -1.0, 1.0)
-PL_TEST_ULP (V_NAME_F1 (atanh), 2.59)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
+HALF_WIDTH_ALIAS_F1 (atanh)
+
+TEST_SIG (V, F, 1, atanh, -1.0, 1.0)
+TEST_ULP (V_NAME_F1 (atanh), 2.44)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000)
+TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000)
/* atanh is asymptotic at 1, which is the default control value - have to set
-c 0 specially to ensure fp exceptions are triggered correctly (choice of
control lane is irrelevant if fp exceptions are disabled). */
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0, 0x1p-12, 500, 0)
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0x1p-12, 1, 200000, 0)
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 1, inf, 1000, 0)
+TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0)
diff --git a/pl/math/v_cbrt_2u.c b/math/aarch64/advsimd/cbrt.c
index cc7cff15dc0f..8e72e5b566fc 100644
--- a/pl/math/v_cbrt_2u.c
+++ b/math/aarch64/advsimd/cbrt.c
@@ -1,14 +1,14 @@
/*
* Double-precision vector cbrt(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f64.h"
const static struct data
{
@@ -40,13 +40,20 @@ special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
}
-/* Approximation for double-precision vector cbrt(x), using low-order polynomial
- and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+/* Approximation for double-precision vector cbrt(x), using low-order
+ polynomial and two Newton iterations.
+
+ The vector version of frexp does not handle subnormals
+ correctly. As a result these need to be handled by the scalar
+ fallback, where accuracy may be worse than that of the vector code
+ path.
+
+ Greatest observed error in the normal range is 1.79 ULP. Errors repeat
according to the exponent, for instance an error observed for double value
m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
integer.
- __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
- want 0x1.965fe72821e99p+0. */
+ _ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+ want 0x1.965fe72821e99p+0. */
VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -64,8 +71,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
- /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
- Newton iterations. */
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+ for Newton iterations. */
float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
float64x2_t one_third = d->one_third;
/* Two iterations of Newton's method for iteratively approximating cbrt. */
@@ -84,8 +91,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
- Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
- an integer in [-2, 2], and can be looked up in the table T. Hence the
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
result is assembled as:
cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
@@ -110,7 +117,11 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
return vbslq_f64 (d->abs_mask, y, x);
}
-PL_TEST_ULP (V_NAME_D1 (cbrt), 1.30)
-PL_SIG (V, D, 1, cbrt, -10.0, 10.0)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cbrt))
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
+/* Worst-case ULP error assumes that the scalar fallback is GLIBC 2.40 cbrt, which
+ has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error
+ in the vector path is 1.79 ULP.
+ [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
+ Functions in Single, Double, Double Extended, and Quadruple Precision. */
+TEST_ULP (V_NAME_D1 (cbrt), 3.17)
+TEST_SIG (V, D, 1, cbrt, -10.0, 10.0)
+TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
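
The exponent handling described in the comments splits e into a
multiple of three plus a small remainder (restated in formula form):

    \[ x = m \cdot 2^{e}, \quad e = 3q + i, \; i \in [-2, 2]
       \;\Rightarrow\;
       \operatorname{cbrt}(x) = \operatorname{sign}(x) \cdot
       \operatorname{cbrt}(m) \cdot 2^{i/3} \cdot 2^{q}, \]

with q = round(e / 3) and the five possible values of 2^(i/3) taken
from the small lookup table T.
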
diff --git a/pl/math/v_cbrtf_1u7.c b/math/aarch64/advsimd/cbrtf.c
index 74918765209f..4e76feb2dd8b 100644
--- a/pl/math/v_cbrtf_1u7.c
+++ b/math/aarch64/advsimd/cbrtf.c
@@ -1,14 +1,14 @@
/*
* Single-precision vector cbrt(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
const static struct data
{
@@ -49,7 +49,7 @@ shifted_lookup (const float *table, int32x4_t i)
0x1.85a2aa and the exponent is a multiple of 3, for example:
_ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
want 0x1.267932p+1. */
-VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
@@ -110,7 +110,8 @@ VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
return vbslq_f32 (SignMask, x, y);
}
-PL_SIG (V, F, 1, cbrt, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (cbrt), 1.15)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (cbrt))
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
+HALF_WIDTH_ALIAS_F1 (cbrt)
+
+TEST_SIG (V, F, 1, cbrt, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (cbrt), 1.15)
+TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
diff --git a/pl/math/v_cexpi_3u5.c b/math/aarch64/advsimd/cexpi.c
index 5163b15926b8..40ba5ff31f20 100644
--- a/pl/math/v_cexpi_3u5.c
+++ b/math/aarch64/advsimd/cexpi.c
@@ -1,13 +1,13 @@
/*
* Double-precision vector sincos function - return-by-value interface.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_sincos_common.h"
#include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"
static float64x2x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y)
@@ -34,11 +34,13 @@ _ZGVnN2v_cexpi (float64x2_t x)
return sc;
}
-PL_TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
-PL_TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
+TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin)
+TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
+TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
#define V_CEXPI_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
+ TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
V_CEXPI_INTERVAL (0, 0x1p23, 500000)
V_CEXPI_INTERVAL (-0, -0x1p23, 500000)
V_CEXPI_INTERVAL (0x1p23, inf, 10000)
diff --git a/pl/math/v_cexpif_1u8.c b/math/aarch64/advsimd/cexpif.c
index 4897018d3090..e55d99653a66 100644
--- a/pl/math/v_cexpif_1u8.c
+++ b/math/aarch64/advsimd/cexpif.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector cexpi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_sincosf_common.h"
#include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"
static float32x4x2_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y)
@@ -36,11 +36,13 @@ _ZGVnN4v_cexpif (float32x4_t x)
return sc;
}
-PL_TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
-PL_TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
+TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos)
+TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
+TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
#define V_CEXPIF_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
+ TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
V_CEXPIF_INTERVAL (0, 0x1p20, 500000)
V_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
V_CEXPIF_INTERVAL (0x1p20, inf, 10000)
diff --git a/math/aarch64/v_cos.c b/math/aarch64/advsimd/cos.c
index 9a73575bce89..9f3de4dd5c36 100644
--- a/math/aarch64/v_cos.c
+++ b/math/aarch64/advsimd/cos.c
@@ -1,17 +1,19 @@
/*
* Double-precision vector cos function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
static const struct data
{
float64x2_t poly[7];
- float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
@@ -19,11 +21,9 @@ static const struct data
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
V2 (-0x1.9e9540300a1p-41) },
.inv_pi = V2 (0x1.45f306dc9c883p-2),
- .half_pi = V2 (0x1.921fb54442d18p+0),
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
- .shift = V2 (0x1.8p52),
.range_val = V2 (0x1p23)
};
@@ -57,10 +57,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
- n = vsubq_f64 (n, d->shift);
- n = vsubq_f64 (n, v_f64 (0.5));
+ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+ n = vsubq_f64 (n, v_f64 (0.5));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
@@ -85,3 +84,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
+
+TEST_SIG (V, D, 1, cos, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (cos), 3.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000)
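
The hunk above drops the add-subtract shift trick (bias by 0x1.8p52 so rounding happens at the integer bit, then read the parity from the low mantissa bit) in favour of the dedicated round and convert instructions. A minimal sketch of the two idioms, assuming aarch64 NEON; note the shift trick rounds in the current FP mode, while vrndaq rounds ties away from zero.

#include <arm_neon.h>

/* Old idiom: adding 0x1.8p52 forces rounding at the integer bit; the
   rounded value is recovered by subtracting the constant again.  */
static inline float64x2_t
round_via_shift (float64x2_t x)
{
  float64x2_t shift = vdupq_n_f64 (0x1.8p52);
  return vsubq_f64 (vaddq_f64 (x, shift), shift);
}

/* New idiom: frinta rounds directly; fcvtzs then supplies integer
   bits from which the parity (odd) flag can be shifted out.  */
static inline float64x2_t
round_via_frinta (float64x2_t x)
{
  return vrndaq_f64 (x);
}
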
diff --git a/math/aarch64/v_cosf.c b/math/aarch64/advsimd/cosf.c
index b9890b2998ad..d2844e44e196 100644
--- a/math/aarch64/v_cosf.c
+++ b/math/aarch64/advsimd/cosf.c
@@ -1,17 +1,19 @@
/*
* Single-precision vector cos function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
static const struct data
{
float32x4_t poly[4];
- float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@@ -22,8 +24,6 @@ static const struct data
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
- .shift = V4 (0x1.8p+23f),
- .half_pi = V4 (0x1.921fb6p0f),
.range_val = V4 (0x1p20f)
};
@@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
return v_call_f32 (cosf, x, y, cmp);
}
-float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, r3, y;
@@ -58,9 +58,8 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
- n = vsubq_f32 (n, d->shift);
+ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5f), r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
n = vsubq_f32 (n, v_f32 (0.5f));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
@@ -80,3 +79,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
+
+HALF_WIDTH_ALIAS_F1 (cos)
+
+TEST_SIG (V, F, 1, cos, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (cos), 1.4)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000)
diff --git a/pl/math/v_cosh_2u.c b/math/aarch64/advsimd/cosh.c
index 649c390f4622..54407b23aa9d 100644
--- a/pl/math/v_cosh_2u.c
+++ b/math/aarch64/advsimd/cosh.c
@@ -1,18 +1,20 @@
/*
* Double-precision vector cosh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
float64x2_t poly[3];
- float64x2_t inv_ln2, ln2, shift, thres;
+ float64x2_t inv_ln2;
+ double ln2[2];
+ float64x2_t shift, thres;
uint64x2_t index_mask, special_bound;
} data = {
.poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
@@ -48,8 +50,9 @@ exp_inline (float64x2_t x)
float64x2_t n = vsubq_f64 (z, d->shift);
/* r = x - n*ln2/N. */
- float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
- r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+ float64x2_t ln2 = vld1q_f64 (d->ln2);
+ float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, ln2, 1);
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
uint64x2_t i = vandq_u64 (u, d->index_mask);
@@ -97,8 +100,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
return vaddq_f64 (half_t, half_over_t);
}
-PL_SIG (V, D, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (cosh), 1.43)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cosh))
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
+TEST_SIG (V, D, 1, cosh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (cosh), 1.43)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
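
The struct change above demotes ln2 from a vector field to a plain double[2], so both halves of the constant travel in one q-register and are consumed by lane-indexed FMAs. A sketch of the pattern with the standard ln2 hi/lo split; the values and signs here are illustrative, the library stores its own pre-scaled constants.

#include <arm_neon.h>

/* Two scalar constants packed into one q-register and consumed via
   lane-indexed FMA, freeing a vector-sized struct field.  */
static const double ln2[2] = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 };

static inline float64x2_t
reduce_ln2 (float64x2_t x, float64x2_t n)
{
  float64x2_t ln2v = vld1q_f64 (ln2);
  float64x2_t r = vfmsq_laneq_f64 (x, n, ln2v, 0); /* x - n*ln2_hi.  */
  return vfmsq_laneq_f64 (r, n, ln2v, 1);          /* ... - n*ln2_lo.  */
}
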
diff --git a/pl/math/v_coshf_2u4.c b/math/aarch64/advsimd/coshf.c
index c622b0b183f1..f1ed3e5161fd 100644
--- a/pl/math/v_coshf_2u4.c
+++ b/math/aarch64/advsimd/coshf.c
@@ -1,32 +1,39 @@
/*
* Single-precision vector cosh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_expf_inline.h"
#include "v_math.h"
-#include "mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
struct v_expf_data expf_consts;
- uint32x4_t tiny_bound, special_bound;
+ uint32x4_t tiny_bound;
+ float32x4_t bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special_bound;
+#endif
} data = {
.expf_consts = V_EXPF_DATA,
.tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .bound = V4 (0x1.5a92d8p+6),
+#if WANT_SIMD_EXCEPT
.special_bound = V4 (0x42ad496c),
+#endif
};
#if !WANT_SIMD_EXCEPT
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
+ uint32x4_t special)
{
- return v_call_f32 (coshf, x, y, special);
+ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
}
#endif
@@ -34,18 +41,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
Maximum error is 2.38 ULP:
_ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
want 0x1.6a4922p+4. */
-float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
-
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered correctly, fall back to the scalar
variant for all inputs if any input is a special value or above the bound
at which expf overflows. */
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
if (unlikely (v_any_u32 (special)))
return v_call_f32 (coshf, x, x, v_u32 (-1));
@@ -54,10 +60,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
input to 0, which will generate no exceptions. */
if (unlikely (v_any_u32 (tiny)))
ax = v_zerofy_f32 (ax, tiny);
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+#else
+ uint32x4_t special = vcageq_f32 (x, d->bound);
+ float32x4_t t = v_expf_inline (x, &d->expf_consts);
#endif
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- float32x4_t t = v_expf_inline (ax, &d->expf_consts);
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
@@ -66,15 +75,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
#else
if (unlikely (v_any_u32 (special)))
- return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+ return special_case (x, half_t, half_over_t, special);
#endif
return vaddq_f32 (half_t, half_over_t);
}
-PL_SIG (V, F, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (cosh), 1.89)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
+HALF_WIDTH_ALIAS_F1 (cosh)
+
+TEST_SIG (V, F, 1, cosh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (cosh), 1.89)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
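
The restructured coshf still evaluates a single exponential and derives both halves of the sum from it. A scalar model of the identity the vector code computes, with libm's expf standing in for v_expf_inline:

#include <math.h>

/* cosh(x) = e^x/2 + e^-x/2 = t/2 + 0.5/t with t = e^x, so one
   exponential, one halving and one division suffice.  */
static float
coshf_model (float x)
{
  float t = expf (x);
  return 0.5f * t + 0.5f / t;
}
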
diff --git a/pl/math/v_cospi_3u1.c b/math/aarch64/advsimd/cospi.c
index 3c2ee0b74c8e..e63201a55786 100644
--- a/pl/math/v_cospi_3u1.c
+++ b/math/aarch64/advsimd/cospi.c
@@ -1,15 +1,15 @@
/*
* Double-precision vector cospi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -31,7 +31,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
{
/* Fall back to scalar code. */
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
- return v_call_f64 (cospi, x, y, cmp);
+ return v_call_f64 (arm_math_cospi, x, y, cmp);
}
/* Approximation for vector double-precision cospi(x).
@@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
-PL_SIG (V, D, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (V_NAME_D1 (cospi), 2.56)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (cospi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
+#endif
diff --git a/pl/math/v_cospif_3u2.c b/math/aarch64/advsimd/cospif.c
index d88aa828439d..62f4b8122b2c 100644
--- a/pl/math/v_cospif_3u2.c
+++ b/math/aarch64/advsimd/cospif.c
@@ -1,15 +1,15 @@
/*
* Single-precision vector cospi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -26,14 +26,14 @@ static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
{
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
- return v_call_f32 (cospif, x, y, cmp);
+ return v_call_f32 (arm_math_cospif, x, y, cmp);
}
/* Approximation for vector single-precision cospi(x)
Maximum Error: 3.17 ULP:
_ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
want 0x1.f7cd5p-1. */
-float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -74,10 +74,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
-PL_SIG (V, F, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (V_NAME_F1 (cospi), 2.67)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
+HALF_WIDTH_ALIAS_F1 (cospi)
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_F1 (cospi), 2.67)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
+#endif
diff --git a/pl/math/v_erf_2u5.c b/math/aarch64/advsimd/erf.c
index e581ec5bb8a7..40717a660ce2 100644
--- a/pl/math/v_erf_2u5.c
+++ b/math/aarch64/advsimd/erf.c
@@ -1,30 +1,32 @@
/*
* Double-precision vector erf(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
float64x2_t third;
- float64x2_t tenth, two_over_five, two_over_fifteen;
- float64x2_t two_over_nine, two_over_fortyfive;
+ float64x2_t tenth, two_over_five, two_over_nine;
+ double two_over_fifteen, two_over_fortyfive;
float64x2_t max, shift;
+ uint64x2_t max_idx;
#if WANT_SIMD_EXCEPT
float64x2_t tiny_bound, huge_bound, scale_minus_one;
#endif
} data = {
+ .max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
- .two_over_fifteen = V2 (0x1.1111111111111p-3),
+ .two_over_fifteen = 0x1.1111111111111p-3,
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
- .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
#if WANT_SIMD_EXCEPT
@@ -46,8 +48,8 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
- e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
+ float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+ e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
e.erf = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
@@ -77,8 +79,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t a = vabsq_f64 (x);
/* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
to return expected results. */
- uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
- uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
+ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
#if WANT_SIMD_EXCEPT
/* |x| huge or tiny. */
@@ -105,7 +107,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
segfault. */
uint64x2_t i
= vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
- i = vbslq_u64 (a_le_max, i, v_u64 (768));
+ i = vbslq_u64 (a_le_max, i, dat->max_idx);
struct entry e = lookup (i);
float64x2_t r = vsubq_f64 (z, shift);
@@ -115,14 +117,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t d2 = vmulq_f64 (d, d);
float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t two_over_fifteen_and_fortyfive
+ = vld1q_f64 (&dat->two_over_fifteen);
+
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
float64x2_t p1 = r;
float64x2_t p2
= vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
- float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+ two_over_fifteen_and_fortyfive, 0);
p4 = vfmsq_f64 (dat->tenth, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+ two_over_fifteen_and_fortyfive, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
float64x2_t p34 = vfmaq_f64 (p3, d, p4);
@@ -150,9 +157,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
return y;
}
-PL_SIG (V, D, 1, erf, -6.0, 6.0)
-PL_TEST_ULP (V_NAME_D1 (erf), 1.79)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (erf), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
+TEST_SIG (V, D, 1, erf, -6.0, 6.0)
+TEST_ULP (V_NAME_D1 (erf), 1.79)
+/* The WANT_SIMD_EXCEPT fallback misses some cases; disable fenv checks.  */
+TEST_DISABLE_FENV (V_NAME_D1 (erf))
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
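
The new lookup() loads each table entry - now a struct of two adjacent doubles - with a single vld1q_f64 and regroups the lanes with vuzp1q/vuzp2q. A standalone sketch of the idiom; the struct and table contents are illustrative stand-ins for __v_erf_data.

#include <arm_neon.h>

struct pair { double f, g; }; /* Stand-ins for erf and scale.  */
static const struct pair tab[4]
    = { { 1, 10 }, { 2, 20 }, { 3, 30 }, { 4, 40 } };

static inline void
lookup_pairs (uint64x2_t i, float64x2_t *f, float64x2_t *g)
{
  float64x2_t e0 = vld1q_f64 (&tab[vgetq_lane_u64 (i, 0)].f);
  float64x2_t e1 = vld1q_f64 (&tab[vgetq_lane_u64 (i, 1)].f);
  *f = vuzp1q_f64 (e0, e1); /* { tab[i0].f, tab[i1].f }.  */
  *g = vuzp2q_f64 (e0, e1); /* { tab[i0].g, tab[i1].g }.  */
}
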
diff --git a/pl/math/v_erfc_1u8.c b/math/aarch64/advsimd/erfc.c
index 10ef7e6a3c34..97ef09ecc113 100644
--- a/pl/math/v_erfc_1u8.c
+++ b/math/aarch64/advsimd/erfc.c
@@ -1,21 +1,21 @@
/*
* Double-precision vector erfc(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
- float64x2_t p20, p40, p41, p42;
- float64x2_t p51, p52;
- float64x2_t qr5, qr6, qr7, qr8, qr9;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
+ double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
#endif
@@ -30,9 +30,9 @@ static const struct data
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@@ -57,8 +57,10 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
- e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
+ float64x2_t e1
+ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+ float64x2_t e2
+ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
e.erfc = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
@@ -144,22 +146,26 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
p4 = vfmsq_f64 (dat->p40, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
- float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
- p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
- float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
- p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
- float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
- p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
- float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
- p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
- float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
- p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
+ float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
+ qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
+ qr9 = vld1q_f64 (dat->qr9);
+ float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
+ p6 = vmulq_laneq_f64 (p6, qr5, 1);
+ float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
+ p7 = vmulq_laneq_f64 (p7, qr6, 1);
+ float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
+ p8 = vmulq_laneq_f64 (p8, qr7, 1);
+ float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
+ p9 = vmulq_laneq_f64 (p9, qr8, 1);
+ float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
+ p10 = vmulq_laneq_f64 (p10, qr9, 1);
/* Compute polynomial in d using pairwise Horner scheme. */
float64x2_t p90 = vfmaq_f64 (p9, d, p10);
float64x2_t p78 = vfmaq_f64 (p7, d, p8);
@@ -189,10 +195,11 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
return vfmaq_f64 (off, fac, y);
}
-PL_SIG (V, D, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (V_NAME_D1 (erfc), 1.21)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
+TEST_SIG (V, D, 1, erfc, -6.0, 28.0)
+TEST_ULP (V_NAME_D1 (erfc), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
+TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
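
The unrolled block above implements the recurrence from its comment, with the Q/R constants now loaded pairwise from plain arrays. A scalar model of the same recurrence; the array arguments stand in for qr5..qr9.

/* p[i+2] = (p[i] + r * Q[i+1] * p[i+1]) * R[i+1], five steps.  */
static void
erfc_tail_terms (double r, double p4, double p5, const double q[5],
                 const double rr[5], double out[5])
{
  double lo = p4, hi = p5;
  for (int i = 0; i < 5; i++)
    {
      double next = (lo + r * q[i] * hi) * rr[i];
      out[i] = next; /* p6 .. p10.  */
      lo = hi;
      hi = next;
    }
}
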
diff --git a/pl/math/v_erfcf_1u7.c b/math/aarch64/advsimd/erfcf.c
index c361d0704438..f420439ef8a3 100644
--- a/pl/math/v_erfcf_1u7.c
+++ b/math/aarch64/advsimd/erfcf.c
@@ -1,19 +1,20 @@
/*
* Single-precision vector erfc(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
uint32x4_t offset, table_scale;
float32x4_t max, shift;
- float32x4_t coeffs, third, two_over_five, tenth;
+ float coeffs[4];
+ float32x4_t third, two_over_five, tenth;
#if WANT_SIMD_EXCEPT
float32x4_t uflow_bound;
#endif
@@ -27,7 +28,7 @@ static const struct data
.shift = V4 (0x1p17f),
/* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
fmas. */
- .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+ .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
.third = V4 (0x1.555556p-2f),
.two_over_five = V4 (-0x1.99999ap-2f),
.tenth = V4 (-0x1.99999ap-4f),
@@ -50,12 +51,16 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
- float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
- float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
- float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
- float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
- float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
- float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+ float32x2_t t0
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+ float32x2_t t1
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+ float32x2_t t2
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+ float32x2_t t3
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
e.erfc = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
@@ -86,8 +91,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
_ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
want 0x1.f51216p-120. */
-VPCS_ATTR
-float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
+NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
@@ -130,10 +134,11 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p1 = r;
- float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
+ float32x4_t coeffs = vld1q_f32 (dat->coeffs);
+ float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
float32x4_t p3
- = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
- float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
+ = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
+ float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
p4 = vfmsq_f32 (dat->tenth, r2, p4);
float32x4_t y = vfmaq_f32 (p3, d, p4);
@@ -157,10 +162,13 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
return vfmaq_f32 (off, fac, y);
}
-PL_SIG (V, F, 1, erfc, -4.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (erfc), 1.14)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
+HALF_WIDTH_ALIAS_F1 (erfc)
+
+TEST_SIG (V, F, 1, erfc, -4.0, 10.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT)
+TEST_ULP (V_NAME_F1 (erfc), 1.14)
+TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
+TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
diff --git a/pl/math/v_erff_2u.c b/math/aarch64/advsimd/erff.c
index 502526407df2..508bc4c2f5e2 100644
--- a/pl/math/v_erff_2u.c
+++ b/math/aarch64/advsimd/erff.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector erf(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -37,12 +37,12 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
- float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
- float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
- float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
- float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
- float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
- float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+ float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+ float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+ float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+ float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
e.erf = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
@@ -61,7 +61,7 @@ lookup (uint32x4_t i)
Maximum error: 1.93 ULP
_ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
want 0x1.fd6868p-9. */
-float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
@@ -110,9 +110,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
return y;
}
-PL_SIG (V, F, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (V_NAME_F1 (erf), 1.43)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
+HALF_WIDTH_ALIAS_F1 (erf)
+
+TEST_SIG (V, F, 1, erf, -4.0, 4.0)
+TEST_ULP (V_NAME_F1 (erf), 1.43)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
+TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
diff --git a/math/aarch64/v_exp.c b/math/aarch64/advsimd/exp.c
index bc5609faf4fc..a928c35c9418 100644
--- a/math/aarch64/v_exp.c
+++ b/math/aarch64/advsimd/exp.c
@@ -1,12 +1,14 @@
/*
* Double-precision vector e^x function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
#define N (1 << V_EXP_TABLE_BITS)
#define IndexMask (N - 1)
@@ -123,3 +125,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
return vfmaq_f64 (s, y, s);
}
+
+TEST_SIG (V, D, 1, exp, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp), 1.9)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000)
diff --git a/pl/math/v_exp10_2u.c b/math/aarch64/advsimd/exp10.c
index 29072a60fb3a..24fdd1c7d257 100644
--- a/pl/math/v_exp10_2u.c
+++ b/math/aarch64/advsimd/exp10.c
@@ -1,14 +1,15 @@
/*
* Double-precision vector 10^x function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#define _GNU_SOURCE
#include "mathlib.h"
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
/* Value of |x| above which scale overflows without special treatment. */
#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
@@ -135,10 +136,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
return vfmaq_f64 (s, y, s);
}
-PL_SIG (S, D, 1, exp10, -9.9, 9.9)
-PL_SIG (V, D, 1, exp10, -9.9, 9.9)
-PL_TEST_ULP (V_NAME_D1 (exp10), 1.15)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
+#if WANT_EXP10_TESTS
+TEST_SIG (S, D, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, D, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp10), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
+#endif
diff --git a/pl/math/v_exp10f_2u4.c b/math/aarch64/advsimd/exp10f.c
index 0e91becfa612..eb0d5dd0d57c 100644
--- a/pl/math/v_exp10f_2u4.c
+++ b/math/aarch64/advsimd/exp10f.c
@@ -1,23 +1,24 @@
/*
* Single-precision vector 10^x function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "mathlib.h"
+#define _GNU_SOURCE
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
#define ScaleBound 192.0f
static const struct data
{
- float32x4_t poly[5];
- float32x4_t log10_2_and_inv, shift;
-
+ float32x4_t c0, c1, c3;
+ float log10_2_high, log10_2_low, c2, c4;
+ float32x4_t inv_log10_2, special_bound;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
#endif
@@ -27,19 +28,24 @@ static const struct data
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 1.85943 +0.5 ulp. */
- .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
- V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
- .shift = V4 (0x1.8p23f),
-
- /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
- .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
+ .c0 = V4 (0x1.26bb16p+1f),
+ .c1 = V4 (0x1.5350d2p+1f),
+ .c2 = 0x1.04744ap+1f,
+ .c3 = V4 (0x1.2d8176p+0f),
+ .c4 = 0x1.12b41ap-1f,
+ .inv_log10_2 = V4 (0x1.a934fp+1),
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+ .special_bound = V4 (126.0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
#endif
};
-#define ExponentBias v_u32 (0x3f800000)
-
#if WANT_SIMD_EXCEPT
# define SpecialBound 38.0f /* rint(log10(2^127)). */
@@ -57,17 +63,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
+# define SpecialBound 126.0f
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -84,7 +88,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
Algorithm is accurate to 2.36 ULP.
_ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
want 0x1.7e79cp+11. */
-float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
#if WANT_SIMD_EXCEPT
@@ -102,22 +106,23 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
- float32x4_t n = vsubq_f32 (z, d->shift);
- float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
- r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t poly
- = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
- v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+ float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
@@ -129,10 +134,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
-PL_SIG (S, F, 1, exp10, -9.9, 9.9)
-PL_SIG (V, F, 1, exp10, -9.9, 9.9)
-PL_TEST_ULP (V_NAME_F1 (exp10), 1.86)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
+HALF_WIDTH_ALIAS_F1 (exp10)
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, F, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, F, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp10), 1.86)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
+#endif
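
The rewritten reduction rounds n = x/log10(2) directly with vrndaq_f32 and applies 2^n by adding n << 23 to the exponent field. A scalar model of the decomposition, with libm calls standing in for the polynomial and the exponent-bit trick:

#include <math.h>

/* 10^x = 2^n * 10^r with n = round(x / log10(2)), which keeps 10^r in
   [1/sqrt(2), sqrt(2)] where the polynomial is accurate.  */
static float
exp10f_model (float x)
{
  const float log10_2 = 0x1.344136p-2f;
  float n = roundf (x / log10_2);
  float r = x - n * log10_2; /* |r| <= log10(2)/2.  */
  return scalbnf (powf (10.0f, r), (int) n);
}
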
diff --git a/pl/math/v_exp2_2u.c b/math/aarch64/advsimd/exp2.c
index de59779689f5..63448d806b82 100644
--- a/pl/math/v_exp2_2u.c
+++ b/math/aarch64/advsimd/exp2.c
@@ -1,19 +1,20 @@
/*
* Double-precision vector 2^x function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define N (1 << V_EXP_TABLE_BITS)
#define IndexMask (N - 1)
#define BigBound 1022.0
#define UOFlowBound 1280.0
+#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
static const struct data
{
@@ -38,7 +39,6 @@ lookup_sbits (uint64x2_t i)
#if WANT_SIMD_EXCEPT
-# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
/* Call scalar exp2 as a fallback. */
@@ -62,8 +62,8 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n,
/* 2^(n/N) may overflow, break it up into s1*s2. */
uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset));
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b));
- float64x2_t s2 = vreinterpretq_f64_u64 (
- vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound);
float64x2_t r1 = vmulq_f64 (s1, s1);
float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1);
@@ -119,10 +119,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
return vfmaq_f64 (s, s, y);
}
-PL_SIG (V, D, 1, exp2, -9.9, 9.9)
-PL_TEST_ULP (V_NAME_D1 (exp2), 1.15)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
+TEST_SIG (V, D, 1, exp2, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp2), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/advsimd/exp2f.c
index e402205e98e6..40f6170d3702 100644
--- a/math/aarch64/v_exp2f.c
+++ b/math/aarch64/advsimd/exp2f.c
@@ -1,33 +1,38 @@
/*
* Single-precision vector 2^x function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "mathlib.h"
#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
static const struct data
{
- float32x4_t poly[5];
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
- float32x4_t special_bound, scale_thresh;
+ float32x4_t scale_thresh, special_bound;
#endif
+ float c0, c2, c4, zero;
} data = {
/* maxerr: 1.962 ulp. */
- .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
- V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+ .c0 = 0x1.59977ap-10f,
+ .c1 = V4 (0x1.3ce9e4p-7f),
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = V4 (0x1.ebf9bcp-3f),
+ .c4 = 0x1.62e422p-1f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
#endif
};
-#define C(i) d->poly[i]
-
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
@@ -44,16 +49,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -66,16 +68,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
#endif
-float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special_case to fix special lanes later. This is only necessary if fenv
@@ -84,23 +84,24 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
x = vbslq_f32 (cmp, v_f32 (1), x);
#endif
- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
- n = vrndaq_f32 (x);
- r = vsubq_f32 (x, n);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = vsubq_f32 (x, n);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t c024 = vld1q_f32 (&d->c0);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_laneq_f32 (r, c024, 2);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
@@ -111,3 +112,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
+
+HALF_WIDTH_ALIAS_F1 (exp2)
+
+TEST_SIG (V, F, 1, exp2, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp2), 1.49)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000)
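
exp2f now keeps c0, c2 and c4 as adjacent scalar fields (with a zero pad) so one vld1q_f32 feeds three lane-indexed FMAs. The idiom in isolation, with illustrative names and values; it relies on the four floats being laid out contiguously, as the library's structs are.

#include <arm_neon.h>

struct packed { float k0, k1, k2, pad; };
static const struct packed cs = { 1.0f, 2.0f, 3.0f, 0.0f };

static inline float32x4_t
fma_by_lane (float32x4_t acc, float32x4_t r)
{
  float32x4_t k = vld1q_f32 (&cs.k0); /* { k0, k1, k2, pad }.  */
  acc = vfmaq_laneq_f32 (acc, r, k, 0);
  acc = vfmaq_laneq_f32 (acc, r, k, 1);
  return vfmaq_laneq_f32 (acc, r, k, 2);
}
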
diff --git a/math/aarch64/advsimd/exp2f_1u.c b/math/aarch64/advsimd/exp2f_1u.c
new file mode 100644
index 000000000000..1f8e89ab658f
--- /dev/null
+++ b/math/aarch64/advsimd/exp2f_1u.c
@@ -0,0 +1,73 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c1, c2, c3, c4, c5, shift;
+ uint32x4_t exponent_bias;
+ float32x4_t special_bound, scale_thresh;
+ uint32x4_t special_offset, special_bias;
+} data = {
+ .shift = V4 (0x1.8p23f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+ /* maxerr: 0.878 ulp. */
+ .c0 = V4 (0x1.416b5ep-13f),
+ .c1 = V4 (0x1.5f082ep-10f),
+ .c2 = V4 (0x1.3b2dep-7f),
+ .c3 = V4 (0x1.c6af7cp-5f),
+ .c4 = V4 (0x1.ebfbdcp-3f),
+ .c5 = V4 (0x1.62e43p-1f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r1 = vmulq_f32 (s1, s1);
+ float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_exp2f_1u (float32x4_t x)
+{
+ /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = x - n;
+ uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
+ float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+
+ float32x4_t p = vfmaq_f32 (d->c1, d->c0, r);
+ p = vfmaq_f32 (d->c2, p, r);
+ p = vfmaq_f32 (d->c3, p, r);
+ p = vfmaq_f32 (d->c4, p, r);
+ p = vfmaq_f32 (d->c5, p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (p, n, e, d);
+ return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u)
+TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000)
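
specialcase() splits the scale because, for n beyond the special bound, the biased exponent e no longer fits in a single float, while 2^n = s1 * s2 keeps each factor representable. A scalar model of the split; the real code derives s1 and s2 by bit arithmetic rather than ldexpf.

#include <math.h>

static float
scale_split_model (float poly, int n)
{
  /* 2^n = 2^(n/2) * 2^(n - n/2); for |n| <= 192 each half is at most
     2^96, comfortably inside float range.  */
  float s1 = ldexpf (1.0f, n / 2);
  float s2 = ldexpf (1.0f, n - n / 2);
  return (poly * s1) * s2;
}
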
diff --git a/math/aarch64/v_expf.c b/math/aarch64/advsimd/expf.c
index 34e8b6081bcd..e5b1f020d1a0 100644
--- a/math/aarch64/v_expf.c
+++ b/math/aarch64/advsimd/expf.c
@@ -1,30 +1,34 @@
/*
* Single-precision vector e^x function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-
-#include "mathlib.h"
#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
static const struct data
{
- float32x4_t poly[5];
- float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3, c4, inv_ln2;
+ float ln2_hi, ln2_lo, c0, c2;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
- .shift = V4 (0x1.8p23f),
+ .c0 = 0x1.0e4020p-7f,
+ .c1 = V4 (0x1.573e2ep-5f),
+ .c2 = 0x1.555e66p-3f,
+ .c3 = V4 (0x1.fffdb6p-2f),
+ .c4 = V4 (0x1.ffffecp-1f),
.inv_ln2 = V4 (0x1.715476p+0f),
- .ln2_hi = V4 (0x1.62e4p-1f),
- .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
@@ -49,19 +53,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
+ /* r1 = (s2 + poly * s2) * s1 = s2 * (poly + 1) * s1.  */
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
@@ -71,15 +73,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
#endif
-float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly, z;
- uint32x4_t cmp, e;
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
#if WANT_SIMD_EXCEPT
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
- cmp = vcgeq_u32 (
+ uint32x4_t cmp = vcgeq_u32 (
vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
TinyBound),
SpecialBound);
@@ -93,23 +94,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- z = vfmaq_f32 (d->shift, x, d->inv_ln2);
- n = vsubq_f32 (z, d->shift);
- r = vfmsq_f32 (x, n, d->ln2_hi);
- r = vfmsq_f32 (r, n, d->ln2_lo);
- e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
if (unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
@@ -120,3 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
return vfmaq_f32 (scale, poly, scale);
}
+
+HALF_WIDTH_ALIAS_F1 (exp)
+
+TEST_SIG (V, F, 1, exp, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp), 1.49)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000)
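
The reduction keeps the Cody-Waite split of ln2: a short hi part whose product with the small integer n is exact, plus a lo correction, so the constant contributes no rounding error to r. A scalar model using the same constants as the data struct above (the vector code fuses each step with FMA):

static float
reduce_by_ln2 (float x, float n)
{
  const float ln2_hi = 0x1.62e4p-1f;    /* Short: n * ln2_hi is exact.  */
  const float ln2_lo = 0x1.7f7d1cp-20f; /* Remainder of ln2.  */
  return (x - n * ln2_hi) - n * ln2_lo;
}
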
diff --git a/math/aarch64/advsimd/expf_1u.c b/math/aarch64/advsimd/expf_1u.c
new file mode 100644
index 000000000000..4e114d810e08
--- /dev/null
+++ b/math/aarch64/advsimd/expf_1u.c
@@ -0,0 +1,79 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t shift, inv_ln2;
+ uint32x4_t exponent_bias;
+ float32x4_t c1, c2, c3, c4;
+ float32x4_t special_bound, scale_thresh;
+ uint32x4_t special_offset, special_bias;
+ float ln2_hi, ln2_lo, c0, nothing; /* Padding; vld1q_f32 (&ln2_hi) reads 4 floats.  */
+} data = {
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+ .shift = V4 (0x1.8p23f),
+ .inv_ln2 = V4 (0x1.715476p+0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+ .special_offset = V4 (0x83000000),
+ .special_bias = V4 (0x7f000000),
+ /* maxerr: 0.36565 +0.5 ulp. */
+ .c0 = 0x1.6a6000p-10f,
+ .c1 = V4 (0x1.12718ep-7f),
+ .c2 = V4 (0x1.555af0p-5f),
+ .c3 = V4 (0x1.555430p-3f),
+ .c4 = V4 (0x1.fffff4p-2f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r1 = vmulq_f32 (s1, s1);
+ float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_expf_1u (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi);
+
+ /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t z = vmulq_f32 (x, d->inv_ln2);
+ float32x4_t n = vrndaq_f32 (z);
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c0, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2);
+ p = vfmaq_f32 (d->c2, p, r);
+ p = vfmaq_f32 (d->c3, p, r);
+ p = vfmaq_f32 (d->c4, p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r);
+ p = vfmaq_f32 (v_f32 (1.0f), p, r); /* Intentional repeat: degree-1 and degree-0 coefficients are both 1.  */
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (p, n, e, d);
+ return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_expf_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_expf_1u)
+TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000)
diff --git a/math/aarch64/advsimd/expm1.c b/math/aarch64/advsimd/expm1.c
new file mode 100644
index 000000000000..7535a1830427
--- /dev/null
+++ b/math/aarch64/advsimd/expm1.c
@@ -0,0 +1,77 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t thresh, tiny_bound;
+#else
+ float64x2_t oflow_bound;
+#endif
+} data = {
+ .d = V_EXPM1_DATA,
+#if WANT_SIMD_EXCEPT
+ /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
+ compare. */
+ .thresh = V2 (0x78c56fa6d34b552),
+ /* asuint64(0x1p-51) << 1. */
+ .tiny_bound = V2 (0x3cc0000000000000 << 1),
+#else
+ /* Value above which expm1(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
+#endif
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
+{
+ return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
+ special);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.05 ULP:
+ _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
+ want 0x1.a8897eef87b32p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
+#else
+ /* Large input, NaNs and Infs. */
+ uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
+#endif
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, special, d);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return expm1_inline (x, &d->d);
+}
+
+TEST_SIG (V, D, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (expm1), 1.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100)
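The WANT_SIMD_EXCEPT test above folds three conditions (|x| too small, |x| too large, Inf/NaN) into a single unsigned compare. A scalar model using the same constants as the vector code:

#include <stdint.h>
#include <string.h>

/* Model of the expm1 special-case test: ix + ix shifts the sign bit out,
   giving an absolute compare; the subtraction wraps for |x| < 0x1p-51, so
   tiny inputs, large inputs, Inf and NaN all compare >= thresh. */
static int
expm1_is_special (double x)
{
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint64_t tiny = 0x3cc0000000000000ULL << 1; /* asuint64(0x1p-51) << 1. */
  uint64_t thresh = 0x78c56fa6d34b552ULL;
  return (ix + ix) - tiny >= thresh;
}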
diff --git a/math/aarch64/advsimd/expm1f.c b/math/aarch64/advsimd/expm1f.c
new file mode 100644
index 000000000000..6d4431dcd8a5
--- /dev/null
+++ b/math/aarch64/advsimd/expm1f.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data d;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t thresh;
+#else
+ float32x4_t oflow_bound;
+#endif
+} data = {
+ .d = V_EXPM1F_DATA,
+#if !WANT_SIMD_EXCEPT
+ /* Value above which expm1f(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V4 (0x1.5ebc4p+6),
+#else
+ /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
+ compare. */
+ .thresh = V4 (0x1d5ebc40),
+#endif
+};
+
+/* asuint(0x1p-23), shifted by 1 for abs compare. */
+#define TinyBound v_u32 (0x34000000 << 1)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
+{
+ return v_call_f32 (
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
+}
+
+/* Single-precision vector exp(x) - 1 function.
+ The maximum error is 1.62 ULP:
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+ want 0x1.da9f44p-2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint32x4_t special
+ = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
+#else
+ /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
+ uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
+#endif
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, special, d);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return expm1f_inline (x, &d->d);
+}
+
+HALF_WIDTH_ALIAS_F1 (expm1)
+
+TEST_SIG (V, F, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (expm1), 1.13)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
diff --git a/pl/math/finite_pow.h b/math/aarch64/advsimd/finite_pow.h
index 8944d4fae625..0c8350a1a77b 100644
--- a/pl/math/finite_pow.h
+++ b/math/aarch64/advsimd/finite_pow.h
@@ -1,7 +1,7 @@
/*
* Double-precision x^y function.
*
- * Copyright (c) 2018-2023, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -108,7 +108,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
sbits -= 1009ull << 52;
scale = asdouble (sbits);
y = 0x1p1009 * (scale + scale * tmp);
- return check_oflow (eval_as_double (y));
+ return y;
}
/* k < 0, need special care in the subnormal range. */
sbits += 1022ull << 52;
@@ -128,7 +128,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
lo = scale - y + scale * tmp;
hi = one + y;
lo = one - hi + y + lo;
- y = eval_as_double (hi + lo) - one;
+ y = (hi + lo) - one;
/* Fix the sign of 0. */
if (y == 0.0)
y = asdouble (sbits & 0x8000000000000000);
@@ -137,7 +137,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki)
}
#endif
y = 0x1p-1022 * y;
- return check_uflow (eval_as_double (y));
+ return y;
}
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
@@ -192,7 +192,7 @@ exp_inline (double x, double xtail, uint32_t sign_bias)
double scale = asdouble (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
is no spurious underflow here even without fma. */
- return eval_as_double (scale + scale * tmp);
+ return scale + scale * tmp;
}
/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
@@ -239,7 +239,7 @@ exp_nosignbias (double x, double xtail)
double scale = asdouble (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
is no spurious underflow here even without fma. */
- return eval_as_double (scale + scale * tmp);
+ return scale + scale * tmp;
}
/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
@@ -267,7 +267,7 @@ zeroinfnan (uint64_t i)
}
static double NOINLINE
-__pl_finite_pow (double x, double y)
+pow_scalar_special_case (double x, double y)
{
uint32_t sign_bias = 0;
uint64_t ix, iy;
@@ -311,9 +311,7 @@ __pl_finite_pow (double x, double y)
if (2 * ix == 0 && iy >> 63)
return __math_divzero (sign_bias);
#endif
- /* Without the barrier some versions of clang hoist the 1/x2 and
- thus division by zero exception can be signaled spuriously. */
- return iy >> 63 ? opt_barrier_double (1 / x2) : x2;
+ return iy >> 63 ? 1 / x2 : x2;
}
/* Here x and y are non-zero finite. */
if (ix >> 63)
@@ -349,9 +347,7 @@ __pl_finite_pow (double x, double y)
if (topx == 0)
{
/* Normalize subnormal x so exponent becomes negative. */
- /* Without the barrier some versions of clang evalutate the mul
- unconditionally causing spurious overflow exceptions. */
- ix = asuint64 (opt_barrier_double (x) * 0x1p52);
+ ix = asuint64 (x * 0x1p52);
ix &= 0x7fffffffffffffff;
ix -= 52ULL << 52;
}
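The last hunk drops the opt_barrier but keeps the normalisation itself: multiplying a subnormal by 0x1p52 makes it normal, and subtracting 52 from the exponent field then yields a bit pattern whose (now below-minimum) exponent correctly describes the original input for the log stage. A scalar sketch of that manipulation, assuming 0 < |x| < 0x1p-1022:

#include <stdint.h>
#include <string.h>

/* Sketch: renormalise subnormal x so the exponent field can be used
   directly, as in the hunk above. */
static uint64_t
normalize_subnormal_bits (double x)
{
  uint64_t ix;
  double scaled = x * 0x1p52; /* now a normal number. */
  memcpy (&ix, &scaled, sizeof ix);
  ix &= 0x7fffffffffffffff;   /* drop the sign bit. */
  ix -= 52ULL << 52;          /* undo the *2^52 in the exponent field. */
  return ix;
}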
diff --git a/pl/math/v_hypot_1u5.c b/math/aarch64/advsimd/hypot.c
index d4ff7be89a8f..dc01ed5bac93 100644
--- a/pl/math/v_hypot_1u5.c
+++ b/math/aarch64/advsimd/hypot.c
@@ -1,13 +1,13 @@
/*
* Double-precision vector hypot(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#if WANT_SIMD_EXCEPT
static const struct data
@@ -15,7 +15,7 @@ static const struct data
uint64x2_t tiny_bound, thres;
} data = {
.tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
- .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
+ .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
};
#else
static const struct data
@@ -24,7 +24,7 @@ static const struct data
uint32x4_t thres;
} data = {
.tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
- .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
+ .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
};
#endif
@@ -75,9 +75,9 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
- uint32x2_t special = vcge_u32 (
- vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
- vget_low_u32 (d->thres));
+ uint32x2_t special
+ = vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
+ vget_low_u32 (d->thres));
if (unlikely (v_any_u32h (special)))
return special_case (x, y, sqsum, special);
@@ -86,10 +86,10 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
}
#endif
-PL_SIG (V, D, 2, hypot, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D2 (hypot), 1.21)
-PL_TEST_EXPECT_FENV (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
+TEST_SIG (V, D, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_D2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
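The reformatted compare above relies on vsubhn_u64, which subtracts and keeps only the high 32 bits of each 64-bit lane; since thres has a zero low word, the top words alone decide the range check. A scalar model for one lane, using the constants of the !WANT_SIMD_EXCEPT path:

#include <stdint.h>
#include <string.h>

/* Model of the narrowing special-case test for hypot. */
static int
hypot_is_special (double sqsum)
{
  uint64_t i;
  memcpy (&i, &sqsum, sizeof i);
  /* vsubhn_u64: high half of (asuint64 (sqsum) - tiny_bound). */
  uint32_t hi = (uint32_t) ((i - 0x0360000000000000ULL) >> 32);
  return hi >= 0x7c900000u; /* vcge_u32 against vget_low_u32 (thres). */
}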
diff --git a/pl/math/v_hypotf_1u5.c b/math/aarch64/advsimd/hypotf.c
index 3227b0a3fd8b..69634875be5a 100644
--- a/pl/math/v_hypotf_1u5.c
+++ b/math/aarch64/advsimd/hypotf.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector hypot(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#if WANT_SIMD_EXCEPT
static const struct data
@@ -15,7 +15,7 @@ static const struct data
uint32x4_t tiny_bound, thres;
} data = {
.tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
- .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
+ .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
};
#else
static const struct data
@@ -24,7 +24,7 @@ static const struct data
uint16x8_t thres;
} data = {
.tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
- .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
+ .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
};
#endif
@@ -41,7 +41,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
want 0x1.6a41dp-13. */
#if WANT_SIMD_EXCEPT
-float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
{
const struct data *d = ptr_barrier (&data);
@@ -68,15 +68,15 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
}
#else
-float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
{
const struct data *d = ptr_barrier (&data);
float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
- uint16x4_t special = vcge_u16 (
- vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
- vget_low_u16 (d->thres));
+ uint16x4_t special
+ = vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+ vget_low_u16 (d->thres));
if (unlikely (v_any_u16h (special)))
return special_case (x, y, sqsum, special);
@@ -85,10 +85,12 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
}
#endif
-PL_SIG (V, F, 2, hypot, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F2 (hypot), 1.21)
-PL_TEST_EXPECT_FENV (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
+HALF_WIDTH_ALIAS_F2 (hypot)
+
+TEST_SIG (V, F, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_F2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/math/aarch64/advsimd/log.c b/math/aarch64/advsimd/log.c
new file mode 100644
index 000000000000..94e3f4482079
--- /dev/null
+++ b/math/aarch64/advsimd/log.c
@@ -0,0 +1,118 @@
+/*
+ * Double-precision vector log(x) function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ float64x2_t c0, c2;
+ double c1, c3, ln2, c4;
+} data = {
+ /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ .c0 = V2 (-0x1.ffffffffffff7p-2),
+ .c1 = 0x1.55555555170d4p-2,
+ .c2 = V2 (-0x1.0000000399c27p-2),
+ .c3 = 0x1.999b2e90e94cap-3,
+ .c4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ .off = V2 (0x3fe6900900000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
+};
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Double-precision vector log routine.
+ The maximum observed error is 2.17 ULP:
+ _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+ want 0x1.ffffff1cca045p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
+ y = vfmaq_f64 (p, r2, y);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log), 1.67)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000)
+TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000)
+TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000)
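The routine above splits x = 2^k * z with z in [Off, 2*Off) by pure bit manipulation, then applies log(x) = log1p(z/c - 1) + log(c) + k*ln2 with c taken from a table entry near z. A scalar sketch of the decomposition, with libm log/log1p standing in for the table entries and polynomial:

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the v_log reduction for normal positive x. */
static double
model_vlog (double x)
{
  const uint64_t OFF = 0x3fe6900900000000ULL;
  uint64_t ix, u_off, iz;
  memcpy (&ix, &x, sizeof ix);
  u_off = ix - OFF;
  int64_t k = (int64_t) u_off >> 52;         /* arithmetic shift. */
  iz = ix - (u_off & 0xfff0000000000000ULL); /* z = x / 2^k, exact. */
  double z;
  memcpy (&z, &iz, sizeof z);
  /* The real code looks up c near z and reads precomputed 1/c and log(c);
     use c = z here so the identity is easy to check. */
  double c = z, invc = 1.0 / c, logc = log (c);
  double r = z * invc - 1.0; /* |r| < 1/N with a real table. */
  return log1p (r) + logc + (double) k * 0x1.62e42fefa39efp-1;
}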
diff --git a/math/aarch64/advsimd/log10.c b/math/aarch64/advsimd/log10.c
new file mode 100644
index 000000000000..c2b8f1c54f0e
--- /dev/null
+++ b/math/aarch64/advsimd/log10.c
@@ -0,0 +1,132 @@
+/*
+ * Double-precision vector log10(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ double invln10, log10_2;
+ double c1, c3;
+ float64x2_t c0, c2, c4;
+} data = {
+ /* Computed from log coefficients divided by log(10) then rounded to double
+ precision. */
+ .c0 = V2 (-0x1.bcb7b1526e506p-3),
+ .c1 = 0x1.287a7636be1d1p-3,
+ .c2 = V2 (-0x1.bcb7b158af938p-4),
+ .c3 = 0x1.63c78734e6d07p-4,
+ .c4 = V2 (-0x1.287461742fee4p-4),
+ .invln10 = 0x1.bcb7b1526e50ep-2,
+ .log10_2 = 0x1.34413509f79ffp-2,
+ .off = V2 (0x3fe6900900000000),
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
+};
+
+#define N (1 << V_LOG10_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t log10c;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ uint64_t i0
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ uint64_t i1
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.log10c = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Fast implementation of double-precision vector log10
+ is a slight modification of double-precision vector log.
+ Max ULP error: < 2.5 ulp (nearest rounding.)
+ Maximum measured at 2.46 ulp for x in [0.96, 0.97]
+ _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
+ want 0x1.fff6be3cae4b9p-6. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r / log(10) + log10(c) + k*log10(2).
+ Constants in v_log10_data.c are computed (in extended precision) as
+ e.log10c := e.logc * invln10. */
+ float64x2_t cte = vld1q_f64 (&d->invln10);
+ float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
+
+ /* y = log10(1+r) + n * log10(2). */
+ hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_f64 (y, d->c4, r2);
+ y = vfmaq_f64 (p, y, r2);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log10), 1.97)
+TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)
diff --git a/math/aarch64/advsimd/log10f.c b/math/aarch64/advsimd/log10f.c
new file mode 100644
index 000000000000..907c1051e086
--- /dev/null
+++ b/math/aarch64/advsimd/log10f.c
@@ -0,0 +1,106 @@
+/*
+ * Single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
+ uint32x4_t off, offset_lower_bound;
+ uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
+ float c1, c3, c5, c7;
+} data = {
+ /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+ [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
+ .c0 = V4 (-0x1.bcb79cp-3f),
+ .c1 = 0x1.2879c8p-3f,
+ .c2 = V4 (-0x1.bcd472p-4f),
+ .c3 = 0x1.6408f8p-4f,
+ .c4 = V4 (-0x1.246f8p-4f),
+ .c5 = 0x1.f0e514p-5f,
+ .c6 = V4 (-0x1.0fc92cp-4f),
+ .c7 = 0x1.f5f76ap-5f,
+ .ln2 = V4 (0x1.62e43p-1f),
+ .inv_ln10 = V4 (0x1.bcb7b2p-2f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
+}
+
+/* Fast implementation of AdvSIMD log10f,
+ using a similar approach to AdvSIMD logf, with the same offset (i.e. 2/3)
+ and an order-9 polynomial.
+ Maximum error: 3.305ulps (nearest rounding.)
+ _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+ want 0x1.ffe2f4p-4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u_off = vsubq_u32 (u_off, d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log10(1+r) + n * log10(2). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+
+ float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
+ float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
+ float32x4_t poly = vfmaq_f32 (c01, r2, p27);
+
+ /* y = (n*ln2 + r) * InvLn(10); the poly*r2 term is added on return. */
+ float32x4_t y = vfmaq_f32 (r, d->ln2, n);
+ y = vmulq_f32 (y, d->inv_ln10);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (y, u_off, poly, r2, special, d);
+ return vfmaq_f32 (y, poly, r2);
+}
+
+HALF_WIDTH_ALIAS_F1 (log10)
+
+TEST_SIG (V, F, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log10), 2.81)
+TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100)
+TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100)
+TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000)
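As in logf, the reduction above recovers x = 2^n * m with m in [2/3, 4/3) entirely from the bit pattern: subtracting asuint(2/3) moves the exponent split to 2/3 rather than 1. A scalar model, with libm standing in for the polynomial:

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Model of the log10f reduction for normal positive x. */
static float
model_log10f (float x)
{
  const uint32_t OFF = 0x3f2aaaabu; /* asuint (0.666667f). */
  uint32_t u, u_off, um;
  memcpy (&u, &x, sizeof u);
  u_off = u - OFF;
  int32_t n = (int32_t) u_off >> 23; /* sign-extending shift. */
  um = (u_off & 0x007fffffu) + OFF;  /* m = x / 2^n, exactly. */
  float m;
  memcpy (&m, &um, sizeof m);
  double r = (double) m - 1.0;
  /* log10(x) = (log1p(r) + n*ln2) / ln(10). */
  return (float) ((log1p (r) + n * 0x1.62e42fefa39efp-1)
                  / 0x1.26bb1bbb55516p+1);
}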
diff --git a/math/aarch64/advsimd/log1p.c b/math/aarch64/advsimd/log1p.c
new file mode 100644
index 000000000000..42a0c5793920
--- /dev/null
+++ b/math/aarch64/advsimd/log1p.c
@@ -0,0 +1,61 @@
+/*
+ * Double-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data d;
+ uint64x2_t inf, minus_one;
+} data = { .d = V_LOG1P_CONSTANTS_TABLE,
+ .inf = V2 (0x7ff0000000000000),
+ .minus_one = V2 (0xbff0000000000000) };
+
+#define BottomMask v_u64 (0xffffffff)
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
+{
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
+ return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
+}
+
+/* Vector log1p approximation using polynomial on reduced interval. Routine is
+ a modification of the algorithm used in scalar log1p, with no shortcut for
+ k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
+ _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
+ want 0x1.fd61d0727429fp+2 . */
+VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ uint64x2_t special_cases
+ = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
+
+ if (unlikely (v_any_u64 (special_cases)))
+ return special_case (x, special_cases, d);
+
+ return log1p_inline (x, &d->d);
+}
+
+TEST_SIG (V, D, 1, log1p, -0.9, 10.0)
+TEST_ULP (V_NAME_D1 (log1p), 1.95)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000)
+TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500)
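The special-case helper above zeroes the flagged lanes before running the shared inline path, so an Inf, NaN or out-of-domain lane cannot raise spurious fp exceptions, and only afterwards patches those lanes with scalar log1p. A sketch of that pattern with explicit lane handling, assuming a two-lane f64 vector and some vector_core that is only safe on non-special inputs:

#include <arm_neon.h>
#include <math.h>

/* Sketch of zerofy-then-patch for a two-lane double vector. */
static float64x2_t
zerofy_then_patch (float64x2_t x, uint64x2_t special,
                   float64x2_t (*vector_core) (float64x2_t))
{
  /* v_zerofy_f64 equivalent: clear lanes flagged in special. */
  float64x2_t x_safe = vreinterpretq_f64_u64 (
      vbicq_u64 (vreinterpretq_u64_f64 (x), special));
  float64x2_t y = vector_core (x_safe);
  /* v_call_f64 equivalent: redo flagged lanes with scalar log1p. */
  if (vgetq_lane_u64 (special, 0))
    y = vsetq_lane_f64 (log1p (vgetq_lane_f64 (x, 0)), y, 0);
  if (vgetq_lane_u64 (special, 1))
    y = vsetq_lane_f64 (log1p (vgetq_lane_f64 (x, 1)), y, 1);
  return y;
}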
diff --git a/math/aarch64/advsimd/log1pf.c b/math/aarch64/advsimd/log1pf.c
new file mode 100644
index 000000000000..94b90249128f
--- /dev/null
+++ b/math/aarch64/advsimd/log1pf.c
@@ -0,0 +1,92 @@
+/*
+ * Single-precision vector log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
+
+const static struct data
+{
+ uint32x4_t minus_one, thresh;
+ struct v_log1pf_data d;
+} data = {
+ .d = V_LOG1PF_CONSTANTS_TABLE,
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
+ .minus_one = V4 (0xbf800000),
+};
+
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+# define TinyBound v_u32 (0x34000000)
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
+{
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.69 ULP:
+ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+ want 0x1.cfcbdcp-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+ uint32x4_t special_cases
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
+ vcgeq_u32 (ix, d->minus_one));
+
+ if (unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases, d);
+
+ return log1pf_inline (x, &d->d);
+}
+
+#else
+
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.63 ULP:
+ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+ want 0x1.fdcb16p-3. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x)
+{
+ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+ vcaleq_f32 (x, v_f32 (0x1p127f)));
+
+ if (unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases);
+
+ return log1pf_inline (x, ptr_barrier (&data));
+}
+
+#endif
+
+HALF_WIDTH_ALIAS_F1 (log1p)
+
+TEST_SIG (V, F, 1, log1p, -0.9, 10.0)
+TEST_ULP (V_NAME_F1 (log1p), 1.20)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000)
+TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000)
+TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000)
+TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000)
diff --git a/math/aarch64/advsimd/log2.c b/math/aarch64/advsimd/log2.c
new file mode 100644
index 000000000000..7d2e44dad2c9
--- /dev/null
+++ b/math/aarch64/advsimd/log2.c
@@ -0,0 +1,123 @@
+/*
+ * Double-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
+ uint32x4_t special_bound;
+ float64x2_t c0, c2;
+ double c1, c3, invln2, c4;
+} data = {
+ /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
+ and N = 128, then scaled by log2(e) in extended precision and rounded back
+ to double precision. */
+ .c0 = V2 (-0x1.71547652b8300p-1),
+ .c1 = 0x1.ec709dc340953p-2,
+ .c2 = V2 (-0x1.71547651c8f35p-2),
+ .c3 = 0x1.2777ebe12dda5p-2,
+ .c4 = -0x1.ec738d616fe26p-3,
+ .invln2 = 0x1.71547652b82fep0,
+ .off = V2 (0x3fe6900900000000),
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
+};
+
+#define N (1 << V_LOG2_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t log2c;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ uint64_t i0
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ uint64_t i1
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.log2c = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
+{
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
+}
+
+/* Double-precision vector log2 routine. Implements the same algorithm as
+ vector log10, with coefficients and table entries scaled in extended
+ precision. The maximum observed error is 2.58 ULP:
+ _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+ want 0x1.fffb34198d9ddp-5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+
+ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
+ float64x2_t hi
+ = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
+
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
+ y = vfmaq_f64 (p, r2, y);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log2, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log2), 2.09)
+TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000)
diff --git a/math/aarch64/advsimd/log2f.c b/math/aarch64/advsimd/log2f.c
new file mode 100644
index 000000000000..3053c64bc552
--- /dev/null
+++ b/math/aarch64/advsimd/log2f.c
@@ -0,0 +1,102 @@
+/*
+ * Single-precision vector log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float32x4_t c0, c2, c4, c6, c8;
+ uint32x4_t off, offset_lower_bound;
+ uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
+ float c1, c3, c5, c7;
+} data = {
+ /* Coefficients generated using Remez algorithm approximate
+ log2(1+r)/r for r in [ -1/3, 1/3 ].
+ rel error: 0x1.c4c4b0cp-26. */
+ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
+ .c1 = -0x1.715458p-1f,
+ .c2 = V4 (0x1.ec701cp-2f),
+ .c3 = -0x1.7171a4p-2f,
+ .c4 = V4 (0x1.27a0b8p-2f),
+ .c5 = -0x1.e5143ep-3f,
+ .c6 = V4 (0x1.9d8ecap-3f),
+ .c7 = -0x1.c675bp-3f,
+ .c8 = V4 (0x1.9e495p-3f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
+}
+
+/* Fast implementation for single precision AdvSIMD log2,
+ relies on the same argument reduction as AdvSIMD logf.
+ Maximum error: 2.48 ULPs
+ _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
+ want 0x1.a9be8p-2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u_off = vsubq_u32 (u_off, d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log2(1+r) + n. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
+ float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
+ float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
+ float32x4_t p = vfmaq_f32 (c01, r2, p28);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (n, u_off, p, r, special, d);
+ return vfmaq_f32 (n, p, r);
+}
+
+HALF_WIDTH_ALIAS_F1 (log2)
+
+TEST_SIG (V, F, 1, log2, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log2), 1.99)
+TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000)
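The coefficient layout used by these routines stores even-index coefficients pre-broadcast and packs the odd ones as plain floats, so a single vld1q_f32 fetches four of them and vfmaq_laneq_f32 broadcasts each lane on the fly, reducing loads and live q-registers. A self-contained sketch of the trick for a degree-7 polynomial:

#include <arm_neon.h>

struct coeffs
{
  float32x4_t c0, c2, c4, c6; /* even coefficients, pre-broadcast. */
  float c1, c3, c5, c7;       /* odd coefficients, one load away. */
};

/* Pairwise (Estrin-style) evaluation of
   c0 + c1 r + c2 r^2 + ... + c7 r^7 with laneq FMAs. */
static float32x4_t
eval_poly_deg7 (float32x4_t r, const struct coeffs *d)
{
  float32x4_t odd = vld1q_f32 (&d->c1); /* lanes: c1 c3 c5 c7. */
  float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, odd, 0);
  float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, odd, 1);
  float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, odd, 2);
  float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, odd, 3);
  float32x4_t r2 = vmulq_f32 (r, r);
  float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
  float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
  return vfmaq_f32 (c01, r2, p27);
}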
diff --git a/math/aarch64/advsimd/logf.c b/math/aarch64/advsimd/logf.c
new file mode 100644
index 000000000000..84705fad05ee
--- /dev/null
+++ b/math/aarch64/advsimd/logf.c
@@ -0,0 +1,88 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+ float32x4_t c2, c4, c6, ln2;
+ uint32x4_t off, offset_lower_bound, mantissa_mask;
+ uint16x8_t special_bound;
+ float c1, c3, c5, c0;
+} data = {
+ /* 3.34 ulp error. */
+ .c0 = -0x1.3e737cp-3f,
+ .c1 = 0x1.5a9aa2p-3f,
+ .c2 = V4 (-0x1.4f9934p-3f),
+ .c3 = 0x1.961348p-3f,
+ .c4 = V4 (-0x1.00187cp-2f),
+ .c5 = 0x1.555d7cp-2f,
+ .c6 = V4 (-0x1.ffffc8p-2f),
+ .ln2 = V4 (0x1.62e43p-1f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff)
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t c1350 = vld1q_f32 (&d->c1);
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log(1+r) + n*ln2. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
+ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
+ p = vfmaq_laneq_f32 (p, r2, c1350, 3);
+
+ q = vfmaq_f32 (q, p, r2);
+ y = vfmaq_f32 (y, q, r2);
+ p = vfmaq_f32 (r, d->ln2, n);
+
+ if (unlikely (v_any_u16h (cmp)))
+ return special_case (p, u_off, y, r2, cmp, d);
+ return vfmaq_f32 (p, y, r2);
+}
+
+HALF_WIDTH_ALIAS_F1 (log)
+
+TEST_SIG (V, F, 1, log, 0.01, 11.1)
+TEST_ULP (V_NAME_F1 (log), 2.9)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT)
+TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000)
+TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000)
+TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000)
diff --git a/math/aarch64/advsimd/modf.c b/math/aarch64/advsimd/modf.c
new file mode 100644
index 000000000000..da2fcbff8514
--- /dev/null
+++ b/math/aarch64/advsimd/modf.c
@@ -0,0 +1,33 @@
+/*
+ * Double-precision vector modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modf algorithm. Produces exact values in all rounding modes. */
+float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int)
+{
+ /* Get integer component of x. */
+ float64x2_t rounded = vrndq_f64 (x);
+ vst1q_f64 (out_int, rounded);
+
+ /* Subtract integer component from input. */
+ uint64x2_t remaining = vreinterpretq_u64_f64 (vsubq_f64 (x, rounded));
+
+ /* Return +0 for integer x. */
+ uint64x2_t is_integer = vceqq_f64 (x, rounded);
+ return vreinterpretq_f64_u64 (vbicq_u64 (remaining, is_integer));
+}
+
+TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVnN2vl8_modf_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000)
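vrndq_f64 rounds toward zero, so the integral part matches scalar modf; the bit-clear with is_integer then forces the fractional part of an integral input to +0 instead of whatever x - rounded produced (including Inf - Inf = NaN). A scalar model of the same behaviour:

#include <math.h>

/* Model of the vector modf above (note: returns +0, not a sign-copied
   zero, for integral x, mirroring the vbicq_u64 in the vector code). */
static double
model_modf (double x, double *out_int)
{
  double rounded = trunc (x); /* vrndq_f64 counterpart. */
  *out_int = rounded;
  double frac = x - rounded;
  if (x == rounded) /* integral input (or +/-Inf): force +0. */
    frac = 0.0;
  return frac;
}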
diff --git a/math/aarch64/advsimd/modff.c b/math/aarch64/advsimd/modff.c
new file mode 100644
index 000000000000..0a646b24cb1a
--- /dev/null
+++ b/math/aarch64/advsimd/modff.c
@@ -0,0 +1,34 @@
+/*
+ * Single-precision vector modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modff algorithm. Produces exact values in all rounding modes. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x,
+ float *out_int)
+{
+ /* Get integer component of x. */
+ float32x4_t rounded = vrndq_f32 (x);
+ vst1q_f32 (out_int, rounded);
+
+ /* Subtract integer component from input. */
+ uint32x4_t remaining = vreinterpretq_u32_f32 (vsubq_f32 (x, rounded));
+
+ /* Return +0 for integer x. */
+ uint32x4_t is_integer = vceqq_f32 (x, rounded);
+ return vreinterpretq_f32_u32 (vbicq_u32 (remaining, is_integer));
+}
+
+TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVnN4vl4_modff_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000)
diff --git a/pl/math/v_pow_1u5.c b/math/aarch64/advsimd/pow.c
index 9053347d4e35..db9d6e9ba14b 100644
--- a/pl/math/v_pow_1u5.c
+++ b/math/aarch64/advsimd/pow.c
@@ -1,20 +1,17 @@
/*
* Double-precision vector pow function.
*
- * Copyright (c) 2020-2023, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
/* Defines parameters of the approximation and scalar fallback. */
#include "finite_pow.h"
-#define VecSmallExp v_u64 (SmallExp)
-#define VecThresExp v_u64 (ThresExp)
-
#define VecSmallPowX v_u64 (SmallPowX)
#define VecThresPowX v_u64 (ThresPowX)
#define VecSmallPowY v_u64 (SmallPowY)
@@ -22,34 +19,49 @@
static const struct data
{
- float64x2_t log_poly[7];
- float64x2_t exp_poly[3];
- float64x2_t ln2_hi, ln2_lo;
- float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n;
+ uint64x2_t inf;
+ float64x2_t small_powx;
+ uint64x2_t offset, mask;
+ uint64x2_t mask_sub_0, mask_sub_1;
+ float64x2_t log_c0, log_c2, log_c4, log_c5;
+ double log_c1, log_c3;
+ double ln2_lo, ln2_hi;
+ uint64x2_t small_exp, thres_exp;
+ double ln2_lo_n, ln2_hi_n;
+ double inv_ln2_n, exp_c2;
+ float64x2_t exp_c0, exp_c1;
} data = {
+ /* Power threshold. */
+ .inf = V2 (0x7ff0000000000000),
+ .small_powx = V2 (0x1p-126),
+ .offset = V2 (Off),
+ .mask = V2 (0xfffULL << 52),
+ .mask_sub_0 = V2 (1ULL << 52),
+ .mask_sub_1 = V2 (52ULL << 52),
/* Coefficients copied from v_pow_log_data.c
relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
Coefficients are scaled to match the scaling during evaluation. */
- .log_poly = { V2 (-0x1p-1), V2 (0x1.555555555556p-2 * -2),
- V2 (-0x1.0000000000006p-2 * -2), V2 (0x1.999999959554ep-3 * 4),
- V2 (-0x1.555555529a47ap-3 * 4), V2 (0x1.2495b9b4845e9p-3 * -8),
- V2 (-0x1.0002b8b263fc3p-3 * -8) },
- .ln2_hi = V2 (0x1.62e42fefa3800p-1),
- .ln2_lo = V2 (0x1.ef35793c76730p-45),
+ .log_c0 = V2 (0x1.555555555556p-2 * -2),
+ .log_c1 = -0x1.0000000000006p-2 * -2,
+ .log_c2 = V2 (0x1.999999959554ep-3 * 4),
+ .log_c3 = -0x1.555555529a47ap-3 * 4,
+ .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
+ .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
/* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
(0.550 without fma) if |x| < ln2/512. */
- .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
- V2 (0x1.5555576a5adcep-5) },
- .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */
- .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */
- .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */
- .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
+ .exp_c0 = V2 (0x1.fffffffffffd4p-2),
+ .exp_c1 = V2 (0x1.5555571d6ef9p-3),
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .small_exp = V2 (0x3c90000000000000),
+ .thres_exp = V2 (0x03f0000000000000),
+ .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
+ .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
+ .ln2_lo_n = -0x1.c610ca86c3899p-45,
};
-#define A(i) data.log_poly[i]
-#define C(i) data.exp_poly[i]
-
-/* This version implements an algorithm close to AOR scalar pow but
+/* This version implements an algorithm close to scalar pow but
- does not implement the trick in the exp's specialcase subroutine to avoid
double-rounding,
- does not use a tail in the exponential core computation,
@@ -78,10 +90,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
- int64x2_t k
- = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
+ uint64x2_t tmp = vsubq_u64 (ix, d->offset);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
float64x2_t kd = vcvtq_f64_s64 (k);
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
@@ -92,12 +103,13 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
/* k*Ln2 + log(c) + r. */
- float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
+ float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
+ float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
float64x2_t t2 = vaddq_f64 (t1, r);
- float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
+ float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
/* Evaluation is optimized assuming superscalar pipelined execution. */
- float64x2_t ar = vmulq_f64 (A (0), r);
+ float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
float64x2_t ar2 = vmulq_f64 (r, ar);
float64x2_t ar3 = vmulq_f64 (r, ar2);
/* k*Ln2 + log(c) + r + A[0]*r*r. */
@@ -105,9 +117,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
/* p = log1p(r) - r - A[0]*r*r. */
- float64x2_t a56 = vfmaq_f64 (A (5), r, A (6));
- float64x2_t a34 = vfmaq_f64 (A (3), r, A (4));
- float64x2_t a12 = vfmaq_f64 (A (1), r, A (2));
+ float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
+ float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
+ float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
+ float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
float64x2_t p = vfmaq_f64 (a34, ar2, a56);
p = vfmaq_f64 (a12, ar2, p);
p = vmulq_f64 (ar3, p);
@@ -118,29 +131,37 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
return y;
}
+static float64x2_t VPCS_ATTR NOINLINE
+exp_special_case (float64x2_t x, float64x2_t xtail)
+{
+ return (float64x2_t){ exp_nosignbias (x[0], xtail[0]),
+ exp_nosignbias (x[1], xtail[1]) };
+}
+
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
static inline float64x2_t
-v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
+v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
{
/* Fallback to scalar exp_inline for all lanes if any lane
contains value of x s.t. |x| <= 2^-54 or >= 512. */
- uint64x2_t abstop
- = vandq_u64 (vshrq_n_u64 (vreinterpretq_u64_f64 (x), 52), v_u64 (0x7ff));
- uint64x2_t uoflowx
- = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
+ uint64x2_t uoflowx = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
+ d->thres_exp);
if (unlikely (v_any_u64 (uoflowx)))
- return v_call2_f64 (exp_nosignbias, x, xtail, x, v_u64 (-1));
+ return exp_special_case (x, vnegq_f64 (neg_xtail));
+
/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
/* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
- float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
- float64x2_t kd = vaddq_f64 (z, d->shift);
- uint64x2_t ki = vreinterpretq_u64_f64 (kd);
- kd = vsubq_f64 (kd, d->shift);
- float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
- r = vfmsq_f64 (r, kd, d->ln2_lo_n);
+ float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
+ float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
+ float64x2_t kd = vrndnq_f64 (z);
+ uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
+ float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
+ float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
+ r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- r = vaddq_f64 (r, xtail);
+ r = vsubq_f64 (r, neg_xtail);
/* 2^(k/N) ~= scale. */
uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
@@ -149,8 +170,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
sbits = vaddq_u64 (sbits, top);
/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
- tmp = vfmaq_f64 (C (0), r, tmp);
+ float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
+ tmp = vfmaq_f64 (d->exp_c0, r, tmp);
tmp = vfmaq_f64 (r, r2, tmp);
float64x2_t scale = vreinterpretq_f64_u64 (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
@@ -158,54 +179,59 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
return vfmaq_f64 (scale, scale, tmp);
}
+static float64x2_t NOINLINE VPCS_ATTR
+scalar_fallback (float64x2_t x, float64x2_t y)
+{
+ return (float64x2_t){ pow_scalar_special_case (x[0], y[0]),
+ pow_scalar_special_case (x[1], y[1]) };
+}
+
float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
{
const struct data *d = ptr_barrier (&data);
/* Case of x <= 0 is too complicated to be vectorised efficiently here,
fallback to scalar pow for all lanes if any x < 0 detected. */
if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x))))
- return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
+ return scalar_fallback (x, y);
uint64x2_t vix = vreinterpretq_u64_f64 (x);
uint64x2_t viy = vreinterpretq_u64_f64 (y);
- uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
- uint64x2_t vtopy = vshrq_n_u64 (viy, 52);
- uint64x2_t vabstopx = vandq_u64 (vtopx, v_u64 (0x7ff));
- uint64x2_t vabstopy = vandq_u64 (vtopy, v_u64 (0x7ff));
+ uint64x2_t iay = vandq_u64 (viy, d->inf);
/* Special cases of x or y. */
#if WANT_SIMD_EXCEPT
/* Small or large. */
+ uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
+ uint64x2_t vabstopy = vshrq_n_u64 (iay, 52);
uint64x2_t specialx
= vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
uint64x2_t specialy
= vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
#else
- /* Inf or nan. */
- uint64x2_t specialx = vcgeq_u64 (vabstopx, v_u64 (0x7ff));
- uint64x2_t specialy = vcgeq_u64 (vabstopy, v_u64 (0x7ff));
/* The case y==0 does not trigger a special case, since in this case it is
necessary to fix the result only if x is a signalling nan, which already
triggers a special case. We test y==0 directly in the scalar fallback. */
+ uint64x2_t iax = vandq_u64 (vix, d->inf);
+ uint64x2_t specialx = vcgeq_u64 (iax, d->inf);
+ uint64x2_t specialy = vcgeq_u64 (iay, d->inf);
#endif
uint64x2_t special = vorrq_u64 (specialx, specialy);
/* Fallback to scalar on all lanes if any lane is inf or nan. */
if (unlikely (v_any_u64 (special)))
- return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
+ return scalar_fallback (x, y);
/* Small cases of x: |x| < 0x1p-126. */
- uint64x2_t smallx = vcltq_u64 (vabstopx, VecSmallPowX);
+ uint64x2_t smallx = vcaltq_f64 (x, d->small_powx);
if (unlikely (v_any_u64 (smallx)))
{
/* Update ix if top 12 bits of x are 0. */
- uint64x2_t sub_x = vceqzq_u64 (vtopx);
+ uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52));
if (unlikely (v_any_u64 (sub_x)))
{
/* Normalize subnormal x so exponent becomes negative. */
- uint64x2_t vix_norm
- = vreinterpretq_u64_f64 (vmulq_f64 (x, v_f64 (0x1p52)));
- vix_norm = vandq_u64 (vix_norm, v_u64 (0x7fffffffffffffff));
- vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
+ uint64x2_t vix_norm = vreinterpretq_u64_f64 (
+ vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
+ vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
vix = vbslq_u64 (sub_x, vix_norm, vix);
}
}
@@ -216,21 +242,20 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
/* Vector Exp(y_loghi, y_loglo). */
float64x2_t vehi = vmulq_f64 (y, vhi);
- float64x2_t velo = vmulq_f64 (y, vlo);
float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
- velo = vsubq_f64 (velo, vemi);
- return v_exp_inline (vehi, velo, d);
+ float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
+ return v_exp_inline (vehi, neg_velo, d);
}
-PL_SIG (V, D, 2, pow)
-PL_TEST_ULP (V_NAME_D2 (pow), 0.55)
-PL_TEST_EXPECT_FENV (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
+TEST_SIG (V, D, 2, pow)
+TEST_ULP (V_NAME_D2 (pow), 0.55)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
/* Wide intervals spanning the whole domain but shared between x and y. */
-#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
- PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
#define EXPAND(str) str##000000000
#define SHL52(str) EXPAND (str)
V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
@@ -248,12 +273,12 @@ V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer. */
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* 1.0^y. */
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
-PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
diff --git a/math/aarch64/advsimd/powf.c b/math/aarch64/advsimd/powf.c
new file mode 100644
index 000000000000..47f74cf38ab0
--- /dev/null
+++ b/math/aarch64/advsimd/powf.c
@@ -0,0 +1,209 @@
+/*
+ * Single-precision vector powf function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Thresh v_u32 (0x7f000000) /* Max - Min. */
+#define MantissaMask v_u32 (0x007fffff)
+
+#define A d->log2_poly
+#define C d->exp2f_poly
+
+/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
+#define Off v_u32 (0x3f35d000)
+
+#define V_POWF_LOG2_TABLE_BITS 5
+#define V_EXP2F_TABLE_BITS 5
+#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
+#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
+
+static const struct data
+{
+ struct
+ {
+ double invc, logc;
+ } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
+ float64x2_t log2_poly[4];
+ uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
+ float64x2_t exp2f_poly[3];
+} data = {
+ .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
+ {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
+ {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
+ {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
+ {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
+ {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
+ {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
+ {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
+ {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
+ {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
+ {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
+ {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
+ {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
+ {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
+ {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
+ {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
+ {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
+ {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
+ {0x1p+0, 0x0p+0 * Scale},
+ {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
+ {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
+ {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
+ {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
+ {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
+ {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
+ {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
+ {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
+ {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
+ {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
+ {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
+ {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
+ {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
+ .log2_poly = { /* rel err: 1.5 * 2^-30. */
+ V2 (-0x1.6ff5daa3b3d7cp-2 * Scale),
+ V2 (0x1.ec81d03c01aebp-2 * Scale),
+ V2 (-0x1.71547bb43f101p-1 * Scale),
+ V2 (0x1.7154764a815cbp0 * Scale)},
+ .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
+ 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
+ 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
+ 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+ 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
+ 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
+ 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
+ 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+ 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
+ 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
+ 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
+ .exp2f_poly = { /* rel err: 1.69 * 2^-34. */
+ V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale),
+ V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale),
+ V2 (0x1.62e42ff0c52d6p-1 / Scale)}};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
+{
+ return v_call2_f32 (powf, x, y, ret, cmp);
+}
+
+static inline float64x2_t
+ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k,
+ float64x2_t invc, float64x2_t logc, float64x2_t y)
+{
+
+ /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc);
+ float64x2_t y0 = vaddq_f64 (logc, k);
+
+ /* Polynomial to approximate log1p(r)/ln2. */
+ float64x2_t logx = vfmaq_f64 (A[1], r, A[0]);
+ logx = vfmaq_f64 (A[2], logx, r);
+ logx = vfmaq_f64 (A[3], logx, r);
+ logx = vfmaq_f64 (y0, logx, r);
+
+ return vmulq_f64 (logx, y);
+}
+
+static inline float64x2_t
+log2_lookup (const struct data *d, uint32_t i)
+{
+ return vld1q_f64 (
+ &d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc);
+}
+
+static inline uint64x1_t
+exp2f_lookup (const struct data *d, uint64_t i)
+{
+ return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]);
+}
+
+static inline float32x2_t
+powf_core (const struct data *d, float64x2_t ylogx)
+{
+ /* N*x = k + r with r in [-1/2, 1/2]. */
+ float64x2_t kd = vrndnq_f64 (ylogx);
+ int64x2_t ki = vcvtaq_s64_f64 (ylogx);
+ float64x2_t r = vsubq_f64 (ylogx, kd);
+
+ /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
+ uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)),
+ exp2f_lookup (d, vgetq_lane_s64 (ki, 1)));
+ t = vaddq_u64 (
+ t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS)));
+ float64x2_t s = vreinterpretq_f64_u64 (t);
+ float64x2_t p = vfmaq_f64 (C[1], r, C[0]);
+ p = vfmaq_f64 (C[2], r, p);
+ p = vfmaq_f64 (s, p, vmulq_f64 (s, r));
+ return vcvt_f32_f64 (p);
+}
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t u = vreinterpretq_u32_f32 (x);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
+ uint32x4_t tmp = vsubq_u32 (u, Off);
+ uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
+ float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top));
+ int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
+ 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
+
+ /* Use double precision for each lane: split input vectors into lo and hi
+ halves and promote. */
+ float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)),
+ tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)),
+ tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)),
+ tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3));
+
+ float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)),
+ iz_hi = vcvt_high_f64_f32 (iz);
+
+ float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))),
+ k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k));
+
+ float64x2_t invc_lo = vzip1q_f64 (tab0, tab1),
+ invc_hi = vzip1q_f64 (tab2, tab3),
+ logc_lo = vzip2q_f64 (tab0, tab1),
+ logc_hi = vzip2q_f64 (tab2, tab3);
+
+ float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)),
+ y_hi = vcvt_high_f64_f32 (y);
+
+ float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo);
+ float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi);
+
+ uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo),
+ vreinterpretq_u32_f64 (ylogx_hi));
+
+ cmp = vorrq_u32 (
+ cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)),
+ vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS))
+ >> 47)));
+
+ float32x2_t p_lo = powf_core (d, ylogx_lo);
+ float32x2_t p_hi = powf_core (d, ylogx_hi);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp);
+ return vcombine_f32 (p_lo, p_hi);
+}
+
+HALF_WIDTH_ALIAS_F2 (pow)
+
+TEST_SIG (V, F, 2, pow)
+TEST_ULP (V_NAME_F2 (pow), 2.1)
+TEST_DISABLE_FENV (V_NAME_F2 (pow))
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000)
+TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000)
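
For reference, the routine above evaluates powf (x, y) as 2^(y * log2 (x)):
log2 is computed in double precision against a 32-entry invc/logc table, and
exp2 is rebuilt from a 32-entry 2^(k/32) table. A minimal scalar sketch of the
same decomposition, with plain libm calls standing in for the tables,
polynomials and special-case paths (helper names here are illustrative, not
part of the patch):

/* Sketch only: pow (x, y) = 2^(y * log2 (x)), widened to double so the
   single rounding back to float absorbs intermediate error.  */
#include <math.h>
#include <stdio.h>

static float
powf_sketch (float x, float y)
{
  double ylogx = (double) y * log2 ((double) x);
  return (float) exp2 (ylogx);
}

int
main (void)
{
  printf ("%g\n", powf_sketch (2.5f, 3.0f)); /* 15.625.  */
  return 0;
}
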
diff --git a/math/aarch64/v_sin.c b/math/aarch64/advsimd/sin.c
index 04129c31133d..0461bbb99405 100644
--- a/math/aarch64/v_sin.c
+++ b/math/aarch64/advsimd/sin.c
@@ -1,17 +1,19 @@
/*
* Double-precision vector sin function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#include "test_defs.h"
+#include "test_sig.h"
#include "mathlib.h"
#include "v_math.h"
static const struct data
{
float64x2_t poly[7];
- float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
@@ -23,12 +25,13 @@ static const struct data
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
- .shift = V2 (0x1.8p52),
};
#if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
-# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
+/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */
+# define TinyBound v_u64 (0x3020000000000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u64 (0x1160000000000000)
#endif
#define C(i) d->poly[i]
@@ -61,16 +64,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
fenv). These lanes will be fixed by special-case handler later. */
uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
- r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
+ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
#else
r = x;
cmp = vcageq_f64 (x, d->range_val);
#endif
/* n = rint(|x|/pi). */
- n = vfmaq_f64 (d->shift, d->inv_pi, r);
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
- n = vsubq_f64 (n, d->shift);
+ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
@@ -95,3 +97,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
+
+TEST_SIG (V, D, 1, sin, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (sin), 3.0)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000)
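
The hunk above replaces the shift-based rounding trick (add then subtract
0x1.8p52) with the vrndaq_f64/vcvtq_s64_f64 intrinsics, which map to the
FRINTA and FCVTAS instructions. A scalar sketch of the two idioms, assuming
|t| is well below 2^51 (helper names illustrative):

#include <math.h>

/* Old idiom: adding 0x1.8p52 forces rounding at the units place
   (round to nearest, ties to even in the default mode); the rounded
   value survives the subtraction.  */
static double
rint_via_shift (double t)
{
  const double shift = 0x1.8p52;
  return (t + shift) - shift;
}

/* New idiom: one rounding instruction.  C round () matches FRINTA
   (round to nearest, ties away from zero).  */
static double
rint_via_insn (double t)
{
  return round (t);
}
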
diff --git a/pl/math/v_sincos_3u5.c b/math/aarch64/advsimd/sincos.c
index 6fc014c120b8..83bfa45efa98 100644
--- a/pl/math/v_sincos_3u5.c
+++ b/math/aarch64/advsimd/sincos.c
@@ -1,7 +1,7 @@
/*
* Double-precision vector sincos function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -10,12 +10,21 @@
be linked against the scalar sincosf from math/. */
#define _GNU_SOURCE
#include <math.h>
-#undef _GNU_SOURCE
#include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"
#include "v_sincos_common.h"
+/* sincos is not available in all scalar libm implementations. */
+#if defined(_MSC_VER) || !defined(__GLIBC__)
+static void
+sincos (double x, double *out_sin, double *out_cos)
+{
+ *out_sin = sin (x);
+ *out_cos = cos (x);
+}
+#endif
+
static void VPCS_ATTR NOINLINE
special_case (float64x2_t x, uint64x2_t special, double *out_sin,
double *out_cos)
@@ -46,12 +55,13 @@ _ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos)
special_case (x, special, out_sin, out_cos);
}
-PL_TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
-PL_TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
+TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin)
+TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
+TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
#define V_SINCOS_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
-V_SINCOS_INTERVAL (0, 0x1p23, 500000)
-V_SINCOS_INTERVAL (-0, -0x1p23, 500000)
+ TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
+V_SINCOS_INTERVAL (0, 0x1p-31, 50000)
+V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000)
V_SINCOS_INTERVAL (0x1p23, inf, 10000)
-V_SINCOS_INTERVAL (-0x1p23, -inf, 10000)
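
The fallback above supplies sincos on libcs that lack the GNU extension. A
usage sketch for the vector entry point follows; it assumes an AArch64 target,
and note the real declaration in this library also carries the vector PCS
attribute (helper name illustrative):

#include <arm_neon.h>

void _ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos);

static void
sincos_pair (const double in[2], double s[2], double c[2])
{
  /* One call does the shared argument reduction and stores both
     results, two lanes each.  */
  _ZGVnN2vl8l8_sincos (vld1q_f64 (in), s, c);
}
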
diff --git a/pl/math/v_sincosf_1u8.c b/math/aarch64/advsimd/sincosf.c
index bf77afaa14db..cd482f38d5f6 100644
--- a/pl/math/v_sincosf_1u8.c
+++ b/math/aarch64/advsimd/sincosf.c
@@ -1,7 +1,7 @@
/*
* Single-precision vector sincos function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -10,11 +10,20 @@
be linked against the scalar sincosf from math/. */
#define _GNU_SOURCE
#include <math.h>
-#undef _GNU_SOURCE
#include "v_sincosf_common.h"
#include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"
+
+/* sincosf is not available in all scalar libm implementations. */
+#if defined(_MSC_VER) || !defined(__GLIBC__)
+static void
+sincosf (float x, float *out_sin, float *out_cos)
+{
+ *out_sin = sinf (x);
+ *out_cos = cosf (x);
+}
+#endif
static void VPCS_ATTR NOINLINE
special_case (float32x4_t x, uint32x4_t special, float *out_sin,
@@ -47,12 +56,13 @@ _ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos)
special_case (x, special, out_sin, out_cos);
}
-PL_TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
-PL_TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
+TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos)
+TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
+TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
#define V_SINCOSF_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
-V_SINCOSF_INTERVAL (0, 0x1p20, 500000)
-V_SINCOSF_INTERVAL (-0, -0x1p20, 500000)
+ TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
+V_SINCOSF_INTERVAL (0, 0x1p-31, 50000)
+V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000)
V_SINCOSF_INTERVAL (0x1p20, inf, 10000)
-V_SINCOSF_INTERVAL (-0x1p20, -inf, 10000)
diff --git a/math/aarch64/advsimd/sincospi.c b/math/aarch64/advsimd/sincospi.c
new file mode 100644
index 000000000000..fd425202ce67
--- /dev/null
+++ b/math/aarch64/advsimd/sincospi.c
@@ -0,0 +1,44 @@
+/*
+ * Double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_sincospi_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+
+/* Double-precision vector function allowing calculation of both sinpi and
+   cospi in one function call, using separate argument reduction and shared
+   low-order polynomials.
+ Approximation for vector double-precision sincospi(x).
+ Maximum Error 3.09 ULP:
+ _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+ want 0x1.fd54d0b327cf4p-1
+ Maximum Error 3.16 ULP:
+ _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+ want 0x1.fd2da484ff402p-1. */
+VPCS_ATTR void
+_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos)
+{
+ const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data);
+
+ float64x2x2_t sc = v_sincospi_inline (x, d);
+
+ vst1q_f64 (out_sin, sc.val[0]);
+ vst1q_f64 (out_cos, sc.val[1]);
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos)
+TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin)
+TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59)
+TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66)
+# define V_SINCOSPI_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n)
+V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000)
+V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000)
+V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000)
+V_SINCOSPI_INTERVAL (0x1p63, inf, 10000)
+#endif
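
A scalar model of what the sincospi routine above computes; libm sin/cos on
pi*r stand in for the shared polynomials, and the vector code avoids the
explicit multiply by pi (helper name illustrative):

#include <math.h>

static void
sincospi_ref (double x, double *s, double *c)
{
  const double pi = 0x1.921fb54442d18p1;
  double n = nearbyint (x); /* round to nearest, ties to even.  */
  double r = x - n;         /* r in [-1/2, 1/2], exact.  */
  /* sin (pi (n + r)) = (-1)^n sin (pi r), and likewise for cos.  */
  double sign = fmod (n, 2.0) != 0.0 ? -1.0 : 1.0;
  *s = sign * sin (pi * r);
  *c = sign * cos (pi * r);
}
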
diff --git a/math/aarch64/advsimd/sincospif.c b/math/aarch64/advsimd/sincospif.c
new file mode 100644
index 000000000000..760ea3d4f5e1
--- /dev/null
+++ b/math/aarch64/advsimd/sincospif.c
@@ -0,0 +1,43 @@
+/*
+ * Single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincospif_common.h"
+#include "v_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+
+/* Single-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.04 ULP:
+ _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ Worst-case error for cos is 3.18 ULP:
+ _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+VPCS_ATTR void
+_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos)
+{
+ const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data);
+
+ float32x4x2_t sc = v_sincospif_inline (x, d);
+
+ vst1q_f32 (out_sin, sc.val[0]);
+ vst1q_f32 (out_cos, sc.val[1]);
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin)
+TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos)
+TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54)
+TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68)
+# define V_SINCOSPIF_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n)
+V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000)
+V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000)
+V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000)
+V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000)
+#endif
diff --git a/math/aarch64/v_sinf.c b/math/aarch64/advsimd/sinf.c
index 336879844459..0764434039a0 100644
--- a/math/aarch64/v_sinf.c
+++ b/math/aarch64/advsimd/sinf.c
@@ -1,17 +1,19 @@
/*
* Single-precision vector sin function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
static const struct data
{
float32x4_t poly[4];
- float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@@ -22,13 +24,14 @@ static const struct data
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
- .shift = V4 (0x1.8p+23f),
.range_val = V4 (0x1p20f)
};
#if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
-# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
+/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
+# define TinyBound v_u32 (0x22000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u32 (0x27800000)
#endif
#define C(i) d->poly[i]
@@ -41,7 +44,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
return v_call_f32 (sinf, x, y, cmp);
}
-float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, r, r2, y;
@@ -53,23 +56,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
special-case handler later. */
- r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
+ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
#else
r = x;
cmp = vcageq_f32 (x, d->range_val);
#endif
- /* n = rint(|x|/pi) */
- n = vfmaq_f32 (d->shift, d->inv_pi, r);
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
- n = vsubq_f32 (n, d->shift);
+ /* n = rint(|x|/pi). */
+ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f32 (r, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
- /* y = sin(r) */
+ /* y = sin(r). */
r2 = vmulq_f32 (r, r);
y = vfmaq_f32 (C (2), C (3), r2);
y = vfmaq_f32 (C (1), y, r2);
@@ -80,3 +82,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
return special_case (x, y, odd, cmp);
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
+
+HALF_WIDTH_ALIAS_F1 (sin)
+
+TEST_SIG (V, F, 1, sin, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (sin), 1.4)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000)
diff --git a/math/aarch64/advsimd/sinh.c b/math/aarch64/advsimd/sinh.c
new file mode 100644
index 000000000000..f65ccd0c6270
--- /dev/null
+++ b/math/aarch64/advsimd/sinh.c
@@ -0,0 +1,80 @@
+/*
+ * Double-precision vector sinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+ uint64x2_t halff;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny_bound, thresh;
+#else
+ float64x2_t large_bound;
+#endif
+} data = {
+ .d = V_EXPM1_DATA,
+ .halff = V2 (0x3fe0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* 2^-26, below which sinh(x) rounds to x. */
+ .tiny_bound = V2 (0x3e50000000000000),
+ /* asuint(large_bound) - asuint(tiny_bound). */
+ .thresh = V2 (0x0230000000000000),
+#else
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x1p+9),
+#endif
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
+{
+ return v_call_f64 (sinh, x, x, v_u64 (-1));
+}
+
+/* Approximation for vector double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.52 ULP:
+ _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
+ want -0x1.ac2f05bb66fc9p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (
+ vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t special = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
+#else
+ uint64x2_t special = vcageq_f64 (x, d->large_bound);
+#endif
+
+ /* Fall back to scalar variant for all lanes if any of them are special. */
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ float64x2_t t = expm1_inline (ax, &d->d);
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ return vmulq_f64 (t, halfsign);
+}
+
+TEST_SIG (V, D, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (sinh), 2.02)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
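
The reconstruction above follows from a small identity: with t = e^|x| - 1,
e^|x| - e^-|x| = (t + 1) - 1/(t + 1) = t + t/(t + 1), so
sinh (x) = +/-0.5 * (t + t/(t + 1)), which does not cancel for small |x| since
t ~ |x| there. A scalar sketch (helper name illustrative):

#include <math.h>

static double
sinh_sketch (double x)
{
  /* t = e^|x| - 1; t + t/(t + 1) = e^|x| - e^-|x|.  */
  double t = expm1 (fabs (x));
  return copysign (0.5, x) * (t + t / (t + 1.0));
}
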
diff --git a/pl/math/v_sinhf_2u3.c b/math/aarch64/advsimd/sinhf.c
index cd8c0f08f784..12dbe26b425b 100644
--- a/pl/math/v_sinhf_2u3.c
+++ b/math/aarch64/advsimd/sinhf.c
@@ -1,28 +1,25 @@
/*
* Single-precision vector sinh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
+#include "test_sig.h"
+#include "test_defs.h"
#include "v_expm1f_inline.h"
static const struct data
{
struct v_expm1f_data expm1f_consts;
- uint32x4_t halff;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound, thresh;
#else
- uint32x4_t oflow_bound;
+ float32x4_t oflow_bound;
#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
- .halff = V4 (0x3f000000),
#if WANT_SIMD_EXCEPT
/* 0x1.6a09e8p-32, below which expm1f underflows. */
.tiny_bound = V4 (0x2fb504f4),
@@ -30,14 +27,15 @@ static const struct data
.thresh = V4 (0x12fbbbb3),
#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
- .oflow_bound = V4 (0x42b0c0a7),
+ .oflow_bound = V4 (0x1.61814ep+6),
#endif
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+ uint32x4_t special)
{
- return v_call_f32 (sinhf, x, y, special);
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
}
/* Approximation for vector single-precision sinh(x) using expm1.
@@ -45,21 +43,21 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
The maximum error is 2.26 ULP:
_ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
want 0x1.e469e4p-4. */
-float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t sign = veorq_u32 (ix, iax);
- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
#if WANT_SIMD_EXCEPT
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ uint32x4_t special = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
ax = v_zerofy_f32 (ax, special);
#else
- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
@@ -71,14 +69,16 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
/* Fall back to the scalar variant for any lanes that should trigger an
exception. */
if (unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (t, halfsign), special);
+ return special_case (x, t, halfsign, special);
return vmulq_f32 (t, halfsign);
}
-PL_SIG (V, F, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (sinh), 1.76)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
+HALF_WIDTH_ALIAS_F1 (sinh)
+
+TEST_SIG (V, F, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (sinh), 1.76)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
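
The halfsign value above is now built with one bit-select instead of an OR of
extracted sign bits. A scalar equivalent of
vbslq_u32 (0x80000000, ix, asuint (0.5f)) (helper name illustrative):

#include <stdint.h>
#include <string.h>

static float
halfsign_sketch (float x)
{
  uint32_t ix, bits;
  float h;
  memcpy (&ix, &x, sizeof ix);
  /* Take the sign bit from x, every other bit from 0.5f.  */
  bits = (ix & 0x80000000u) | 0x3f000000u; /* 0x3f000000 = asuint (0.5f). */
  memcpy (&h, &bits, sizeof h);
  return h;
}
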
diff --git a/pl/math/v_sinpi_3u1.c b/math/aarch64/advsimd/sinpi.c
index 8d2917ff8ecd..f86d167a2ac3 100644
--- a/pl/math/v_sinpi_3u1.c
+++ b/math/aarch64/advsimd/sinpi.c
@@ -1,15 +1,15 @@
/*
* Double-precision vector sinpi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -34,7 +34,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
{
/* Fall back to scalar code. */
y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
- return v_call_f64 (sinpi, x, y, cmp);
+ return v_call_f64 (arm_math_sinpi, x, y, cmp);
}
#endif
@@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}
-PL_SIG (V, D, 1, sinpi, -0.9, 0.9)
-PL_TEST_ULP (V_NAME_D1 (sinpi), 3.06)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (sinpi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#endif
diff --git a/pl/math/v_sinpif_3u.c b/math/aarch64/advsimd/sinpif.c
index 3d6eeff333f7..98ba9d84d2fb 100644
--- a/pl/math/v_sinpif_3u.c
+++ b/math/aarch64/advsimd/sinpif.c
@@ -1,15 +1,15 @@
/*
* Single-precision vector sinpi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -29,7 +29,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
{
/* Fall back to scalar code. */
y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
- return v_call_f32 (sinpif, x, y, cmp);
+ return v_call_f32 (arm_math_sinpif, x, y, cmp);
}
#endif
@@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
Maximum Error 3.03 ULP:
_ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
want 0x1.f7cd5p-1. */
-float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -72,10 +72,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
}
-PL_SIG (V, F, 1, sinpi, -0.9, 0.9)
-PL_TEST_ULP (V_NAME_F1 (sinpi), 2.54)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
+HALF_WIDTH_ALIAS_F1 (sinpi)
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_F1 (sinpi), 2.54)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
+#endif
diff --git a/pl/math/v_tan_3u5.c b/math/aarch64/advsimd/tan.c
index c431c8c4889e..957f9aba3a1e 100644
--- a/pl/math/v_tan_3u5.c
+++ b/math/aarch64/advsimd/tan.c
@@ -1,19 +1,20 @@
/*
* Double-precision vector tan(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
float64x2_t poly[9];
- float64x2_t half_pi, two_over_pi, shift;
+ double half_pi[2];
+ float64x2_t two_over_pi, shift;
#if !WANT_SIMD_EXCEPT
float64x2_t range_val;
#endif
@@ -71,8 +72,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
/* Use q to reduce x to r in [-pi/4, pi/4], by:
r = x - q * pi/2, in extended precision. */
float64x2_t r = x;
- r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
- r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
+ float64x2_t half_pi = vld1q_f64 (dat->half_pi);
+ r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+ r = vfmsq_laneq_f64 (r, q, half_pi, 1);
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
formula. */
r = vmulq_n_f64 (r, 0.5);
@@ -112,9 +114,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
vbslq_f64 (no_recip, d, n));
}
-PL_SIG (V, D, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (V_NAME_D1 (tan), 2.99)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
+TEST_SIG (V, D, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (tan), 2.99)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
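
The half_pi pair loaded above holds pi/2 split into a head and a tail so that
r = x - q*pi/2 retains bits a single multiply would lose. A scalar sketch; the
constants are assumed by halving the pi_1/pi_2 split used by sin elsewhere in
this patch:

static double
reduce_pio2_sketch (double x, double q)
{
  const double pio2_hi = 0x1.921fb54442d18p+0;  /* head of pi/2.  */
  const double pio2_lo = 0x1.1a62633145c06p-54; /* tail of pi/2.  */
  double r = x - q * pio2_hi; /* fused (vfmsq) in the vector code.  */
  return r - q * pio2_lo;
}
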
diff --git a/pl/math/v_tanf_3u5.c b/math/aarch64/advsimd/tanf.c
index 98948b0a9ecf..ed5448649f6c 100644
--- a/pl/math/v_tanf_3u5.c
+++ b/math/aarch64/advsimd/tanf.c
@@ -1,19 +1,19 @@
/*
* Single-precision vector tan(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
float32x4_t poly[6];
- float32x4_t pi_consts;
+ float pi_consts[4];
float32x4_t shift;
#if !WANT_SIMD_EXCEPT
float32x4_t range_val;
@@ -64,7 +64,7 @@ eval_poly (float32x4_t z, const struct data *d)
Maximum error is 3.45 ULP:
__v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
want 0x1.ff9850p-1. */
-float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t special_arg = x;
@@ -85,16 +85,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
#endif
/* n = rint(x/(pi/2)). */
- float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
+ float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
+ float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
float32x4_t n = vsubq_f32 (q, d->shift);
/* Determine if x lives in an interval where |tan(x)| grows to infinity. */
uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
/* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
float32x4_t r;
- r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
- r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
- r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
+ r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 2);
/* If x lives in an interval, where |tan(x)|
- is finite, then use a polynomial approximation of the form
@@ -119,9 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
return vbslq_f32 (pred_alt, inv_y, y);
}
-PL_SIG (V, F, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (V_NAME_F1 (tan), 2.96)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
+HALF_WIDTH_ALIAS_F1 (tan)
+
+TEST_SIG (V, F, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_F1 (tan), 2.96)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
diff --git a/math/aarch64/advsimd/tanh.c b/math/aarch64/advsimd/tanh.c
new file mode 100644
index 000000000000..3dc6e5527ffc
--- /dev/null
+++ b/math/aarch64/advsimd/tanh.c
@@ -0,0 +1,67 @@
+/*
+ * Double-precision vector tanh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+ struct v_expm1_data d;
+ uint64x2_t thresh, tiny_bound;
+} data = {
+ .d = V_EXPM1_DATA,
+ .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = V2 (0x01f241bf835f9d5f),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
+ uint64x2_t special)
+{
+ return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
+}
+
+/* Vector approximation for double-precision tanh(x), using a simplified
+ version of expm1. The greatest observed error is 2.70 ULP:
+ _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
+ want -0x1.be5452a6459fbp-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ float64x2_t u = x;
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
+#if WANT_SIMD_EXCEPT
+ /* To trigger fp exceptions correctly, set special lanes to a neutral value.
+ They will be fixed up later by the special-case handler. */
+ if (unlikely (v_any_u64 (special)))
+ u = v_zerofy_f64 (u, special);
+#endif
+
+ u = vaddq_f64 (u, u);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ float64x2_t q = expm1_inline (u, &d->d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, q, qp2, special);
+ return vdivq_f64 (q, qp2);
+}
+
+TEST_SIG (V, D, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_D1 (tanh), 2.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
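
The q/(q + 2) form above comes from a direct identity; a scalar sketch
(helper name illustrative):

#include <math.h>

static double
tanh_sketch (double x)
{
  /* tanh (x) = (e^(2x) - 1) / (e^(2x) + 1) = q / (q + 2),
     with q = expm1 (2x).  */
  double q = expm1 (2.0 * x);
  return q / (q + 2.0);
}
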
diff --git a/pl/math/v_tanhf_2u6.c b/math/aarch64/advsimd/tanhf.c
index d1cb9fb6eeb3..18fe93c7e7ba 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/math/aarch64/advsimd/tanhf.c
@@ -1,14 +1,13 @@
/*
* Single-precision vector tanh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
+#include "test_sig.h"
+#include "test_defs.h"
#include "v_expm1f_inline.h"
static const struct data
@@ -20,20 +19,23 @@ static const struct data
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
.boring_bound = V4 (0x41102cb3),
.large_bound = V4 (0x7f800000),
- .onef = V4 (0x3f800000),
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+ float32x4_t q, uint32x4_t special)
{
- return v_call_f32 (tanhf, x, y, special);
+ return v_call_f32 (
+ tanhf, x,
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+ special);
}
/* Approximation for single-precision vector tanh(x), using a simplified
version of expm1f. The maximum error is 2.58 ULP:
_ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
want 0x1.f9ba08p-5. */
-float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -42,7 +44,9 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+  /* The expm1 exponent bias is 1.0f reinterpreted as an integer. */
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
@@ -58,16 +62,20 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+
if (unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vbslq_f32 (is_boring, boring, y), special);
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+ special);
+
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
return vbslq_f32 (is_boring, boring, y);
}
-PL_SIG (V, F, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (tanh), 2.09)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
+HALF_WIDTH_ALIAS_F1 (tanh)
+
+TEST_SIG (V, F, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (tanh), 2.09)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
diff --git a/math/aarch64/advsimd/tanpi.c b/math/aarch64/advsimd/tanpi.c
new file mode 100644
index 000000000000..16de00ad5556
--- /dev/null
+++ b/math/aarch64/advsimd/tanpi.c
@@ -0,0 +1,88 @@
+/*
+ * Double-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpi_data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12;
+ double c1, c3, c5, c7, c9, c11, c13, c14;
+} tanpi_data = {
+  /* Coefficients for tan(pi * x) computed with fpminimax
+     on [ 0x1p-1022, 0x1p-2 ]
+ approx rel error: 0x1.7eap-55
+ approx abs error: 0x1.7eap-55. */
+ .c0 = V2 (0x1.921fb54442d18p1), /* pi. */
+ .c1 = 0x1.4abbce625be52p3, .c2 = V2 (0x1.466bc6775b0f9p5),
+ .c3 = 0x1.45fff9b426f5ep7, .c4 = V2 (0x1.45f4730dbca5cp9),
+ .c5 = 0x1.45f3265994f85p11, .c6 = V2 (0x1.45f4234b330cap13),
+ .c7 = 0x1.45dca11be79ebp15, .c8 = V2 (0x1.47283fc5eea69p17),
+ .c9 = 0x1.3a6d958cdefaep19, .c10 = V2 (0x1.927896baee627p21),
+ .c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27),
+ .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32,
+};
+
+/* Approximation for double-precision vector tanpi(x)
+ The maximum error is 3.06 ULP:
+ _ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
+ want -0x1.fa30112702c95p+3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x)
+{
+ const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
+
+ float64x2_t n = vrndnq_f64 (x);
+
+ /* inf produces nan that propagates. */
+ float64x2_t xr = vsubq_f64 (x, n);
+ float64x2_t ar = vabdq_f64 (x, n);
+ uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25));
+ float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar);
+
+ /* Order-14 pairwise Horner. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+
+ float64x2_t c_1_3 = vld1q_f64 (&d->c1);
+ float64x2_t c_5_7 = vld1q_f64 (&d->c5);
+ float64x2_t c_9_11 = vld1q_f64 (&d->c9);
+ float64x2_t c_13_14 = vld1q_f64 (&d->c13);
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0);
+
+ float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1);
+ p = vfmaq_f64 (p1011, r4, p);
+ p = vfmaq_f64 (p89, r4, p);
+ p = vfmaq_f64 (p67, r4, p);
+ p = vfmaq_f64 (p45, r4, p);
+ p = vfmaq_f64 (p23, r4, p);
+ p = vfmaq_f64 (p01, r4, p);
+ p = vmulq_f64 (r, p);
+
+ float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p);
+ float64x2_t y = vbslq_f64 (flip, p_recip, p);
+
+ uint64x2_t sign
+ = veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar));
+ return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_D1 (tanpi))
+TEST_ULP (V_NAME_D1 (tanpi), 2.57)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000)
+#endif
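
The flip/reciprocal step above relies on the cotangent identity
tan (pi*r) = sign (r) / tan (pi*(1/2 - |r|)) for |r| > 1/4, which confines the
polynomial argument to [0, 1/4]. A scalar sketch with libm tan standing in for
the order-14 polynomial (helper name illustrative):

#include <math.h>

static double
tanpi_sketch (double x)
{
  const double pi = 0x1.921fb54442d18p1;
  double r = x - nearbyint (x); /* r in [-1/2, 1/2].  */
  double a = fabs (r);
  double s = copysign (1.0, r);
  if (a > 0.25)
    return s / tan (pi * (0.5 - a));
  return s * tan (pi * a);
}
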
diff --git a/math/aarch64/advsimd/tanpif.c b/math/aarch64/advsimd/tanpif.c
new file mode 100644
index 000000000000..7bd6d206819f
--- /dev/null
+++ b/math/aarch64/advsimd/tanpif.c
@@ -0,0 +1,70 @@
+/*
+ * Single-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpif_data
+{
+ float32x4_t c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+} tanpif_data = {
+  /* Coefficients for tan(pi * x). */
+ .c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f),
+ .c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f,
+ .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f,
+};
+
+/* Approximation for single-precision vector tanpi(x)
+ The maximum error is 3.34 ULP:
+ _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
+ want 0x1.f70aa6p+2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x)
+{
+ const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
+
+ float32x4_t n = vrndnq_f32 (x);
+
+ /* inf produces nan that propagates. */
+ float32x4_t xr = vsubq_f32 (x, n);
+ float32x4_t ar = vabdq_f32 (x, n);
+ uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f));
+ float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar);
+
+ /* Order-7 pairwise Horner polynomial evaluation scheme. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+
+ float32x4_t odd_coeffs = vld1q_f32 (&d->c1);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3);
+ float32x4_t p = vfmaq_f32 (p45, r4, p67);
+ p = vfmaq_f32 (p23, r4, p);
+ p = vfmaq_f32 (p01, r4, p);
+
+ p = vmulq_f32 (r, p);
+ float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p);
+ float32x4_t y = vbslq_f32 (flip, p_recip, p);
+
+ uint32x4_t sign
+ = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar));
+ return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign));
+}
+
+HALF_WIDTH_ALIAS_F1 (tanpi)
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_F1 (tanpi))
+TEST_ULP (V_NAME_F1 (tanpi), 2.84)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
+#endif
diff --git a/math/aarch64/advsimd/v_expf_inline.h b/math/aarch64/advsimd/v_expf_inline.h
new file mode 100644
index 000000000000..797d217820c3
--- /dev/null
+++ b/math/aarch64/advsimd/v_expf_inline.h
@@ -0,0 +1,58 @@
+/*
+ * Helper for single-precision routines which calculate exp(ax) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPF_INLINE_H
+#define MATH_V_EXPF_INLINE_H
+
+#include "v_math.h"
+
+struct v_expf_data
+{
+ float ln2_hi, ln2_lo, c0, c2;
+ float32x4_t inv_ln2, c1, c3, c4;
+ /* asuint(1.0f). */
+ uint32x4_t exponent_bias;
+};
+
+/* maxerr: 1.45358 +0.5 ulp. */
+#define V_EXPF_DATA \
+ { \
+ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
+ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
+ }
+
+static inline float32x4_t
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
+{
+ /* Helper routine for calculating exp(ax).
+ Copied from v_expf.c, with all special-case handling removed - the
+ calling routine should handle special values if required. */
+
+ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+ /* Custom order-4 Estrin avoids building high order monomial. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+#endif // MATH_V_EXPF_INLINE_H
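
The scale value above is 2^n built directly in the exponent field: add n to
the biased exponent of 1.0f. A scalar sketch, valid while the result stays a
normal float (roughly -126 <= n <= 127, which callers of this helper are
expected to ensure):

#include <stdint.h>
#include <string.h>

static float
scale_exp2_sketch (int32_t n)
{
  uint32_t bits = ((uint32_t) n << 23) + 0x3f800000u; /* asuint (1.0f). */
  float s;
  memcpy (&s, &bits, sizeof s);
  return s;
}
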
diff --git a/math/aarch64/advsimd/v_expm1_inline.h b/math/aarch64/advsimd/v_expm1_inline.h
new file mode 100644
index 000000000000..82d2e9415d93
--- /dev/null
+++ b/math/aarch64/advsimd/v_expm1_inline.h
@@ -0,0 +1,86 @@
+/*
+ * Helper for double-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1_INLINE_H
+#define MATH_V_EXPM1_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1_data
+{
+ float64x2_t c2, c4, c6, c8;
+ float64x2_t invln2;
+ int64x2_t exponent_bias;
+ double c1, c3, c5, c7, c9, c10;
+ double ln2[2];
+};
+
+/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
+#define V_EXPM1_DATA \
+ { \
+ .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
+ .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
+ .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
+ .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
+ .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
+ .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
+ .invln2 = V2 (0x1.71547652b82fep0), \
+ .exponent_bias = V2 (0x3ff0000000000000), \
+ }
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct v_expm1_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+ float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+ float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+ float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+ float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+ p = vfmaq_f64 (p47, f4, p);
+ p = vfmaq_f64 (p03, f4, p);
+
+ p = vfmaq_f64 (f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+#endif // MATH_V_EXPM1_INLINE_H
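
The final fma above implements the reconstruction
expm1 (x) = 2^i * (p + 1) - 1 = p*t + (t - 1) with t = 2^i, which avoids
cancellation when t is near 1. A scalar sketch (helper name illustrative):

#include <math.h>

static double
expm1_reconstruct_sketch (double p, int i)
{
  double t = ldexp (1.0, i); /* exact 2^i.  */
  return p * t + (t - 1.0);
}
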
diff --git a/math/aarch64/advsimd/v_expm1f_inline.h b/math/aarch64/advsimd/v_expm1f_inline.h
new file mode 100644
index 000000000000..463b07aa7705
--- /dev/null
+++ b/math/aarch64/advsimd/v_expm1f_inline.h
@@ -0,0 +1,62 @@
+/*
+ * Helper for single-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1F_INLINE_H
+#define MATH_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1f_data
+{
+ float32x4_t c0, c2;
+ int32x4_t exponent_bias;
+ float c1, c3, inv_ln2, c4;
+ float ln2_hi, ln2_lo;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+ log(2)/2]. Exponent bias is asuint(1.0f). */
+#define V_EXPM1F_DATA \
+ { \
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
+
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
+ float32x4_t f2 = vmulq_f32 (f, f);
+ float32x4_t f4 = vmulq_f32 (f2, f2);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
+ p = vfmaq_f32 (f, f2, p);
+
+ /* t = 2^i. */
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t t = vreinterpretq_f32_s32 (u);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+}
+
+#endif // MATH_V_EXPM1F_INLINE_H
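
The two vfmsq steps above are a Cody-Waite style reduction: ln2_hi has a short mantissa, so j * ln2_hi rounds exactly, and ln2_lo restores the discarded low bits. A scalar equivalent using the same constants as V_EXPM1F_DATA:

#include <math.h>

/* Scalar equivalent of the argument reduction in expm1f_inline.  */
static float
expm1f_reduce (float x, int *i)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f;    /* short mantissa: j*ln2_hi exact. */
  const float ln2_lo = 0x1.7f7d1cp-20f; /* low bits of ln2.                */
  float j = roundf (x * inv_ln2);
  *i = (int) j;
  float f = fmaf (-j, ln2_hi, x);
  return fmaf (-j, ln2_lo, f);
}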
diff --git a/math/aarch64/advsimd/v_log1p_inline.h b/math/aarch64/advsimd/v_log1p_inline.h
new file mode 100644
index 000000000000..ef906ae4b603
--- /dev/null
+++ b/math/aarch64/advsimd/v_log1p_inline.h
@@ -0,0 +1,119 @@
+/*
+ * Helper for vector double-precision routines which calculate log(1 + x) and
+ * do not need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef MATH_V_LOG1P_INLINE_H
+#define MATH_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+
+struct v_log1p_data
+{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
+ int64x2_t one_top;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+ double ln2[2];
+};
+
+/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+#define V_LOG1P_CONSTANTS_TABLE \
+ { \
+ .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
+ .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
+ .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
+ .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
+ .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
+ .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
+ .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
+ .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
+ .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
+ .c18 = -0x1.cfa7385bdb37ep-6, \
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
+ .hf_rt2_top = V2 (0x3fe6a09e00000000), \
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
+ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
+ }
+
+#define BottomMask v_u64 (0xffffffff)
+
+static inline float64x2_t
+eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1718 = vld1q_f64 (&d->c17);
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
+ p = vfmaq_f64 (p1415, m2, p);
+ p = vfmaq_f64 (p1213, m2, p);
+ p = vfmaq_f64 (p1011, m2, p);
+ p = vfmaq_f64 (p89, m2, p);
+ p = vfmaq_f64 (p67, m2, p);
+ p = vfmaq_f64 (p45, m2, p);
+ p = vfmaq_f64 (p23, m2, p);
+ return vfmaq_f64 (p01, m2, p);
+}
+
+static inline float64x2_t
+log1p_inline (float64x2_t x, const struct v_log1p_data *d)
+{
+ /* Helper for calculating log(x + 1):
+ - No special-case handling - this should be dealt with by the caller.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using v_sel, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
+
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
+
+ /* Correction term c/m. */
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+# error \
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ uint64x2_t k0 = vceqzq_f64 (k);
+ cm = v_zerofy_f64 (cm, k0);
+ f = vbslq_f64 (k0, x, f);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = eval_poly (f, f2, d);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+ float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
+ float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
+ return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
+}
+
+#endif // MATH_V_LOG1P_INLINE_H
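
The exponent extraction above is easier to follow in scalar form: adding one_m_hf_rt2_top (asuint64(1.0) - hf_rt2_top) before reading the exponent field moves the rounding cut from 1.0 down to sqrt(2)/2, so k is chosen such that 1+x = 2^k * (1+f) with 1+f in [sqrt(2)/2, sqrt(2)). A sketch, using an equivalent subtract-k-from-the-exponent formulation of the utop/u_red masking:

#include <stdint.h>
#include <string.h>

/* Scalar sketch of log1p_inline's reduction:
   log1p(x) = k*ln2 + log1p(f) + c/m.  */
static int64_t
log1p_reduce (double x, double *f, double *cm)
{
  double m = x + 1.0;
  uint64_t mi;
  memcpy (&mi, &m, sizeof mi);

  uint64_t u = mi + 0x00095f6200000000ULL; /* one_m_hf_rt2_top.      */
  int64_t k = (int64_t) (u >> 52) - 0x3ff; /* one_top.               */

  /* Scale m by 2^-k in the exponent field: the result lands in
     [sqrt(2)/2, sqrt(2)).  */
  uint64_t fi = mi - ((uint64_t) k << 52);
  double t;
  memcpy (&t, &fi, sizeof t);
  *f = t - 1.0;

  /* Correction c/m: the low bits of x that rounding m = 1+x lost.   */
  *cm = (x - (m - 1.0)) / m;
  return k;
}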
diff --git a/math/aarch64/advsimd/v_log1pf_inline.h b/math/aarch64/advsimd/v_log1pf_inline.h
new file mode 100644
index 000000000000..e81fa24486ae
--- /dev/null
+++ b/math/aarch64/advsimd/v_log1pf_inline.h
@@ -0,0 +1,94 @@
+/*
+ * Helper for single-precision routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_LOG1PF_INLINE_H
+#define MATH_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+struct v_log1pf_data
+{
+ uint32x4_t four;
+ int32x4_t three_quarters;
+ float c0, c3, c5, c7;
+ float32x4_t c4, c6, c1, c2, ln2;
+};
+
+/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more efficiently. */
+#define V_LOG1PF_CONSTANTS_TABLE \
+ { \
+ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
+ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
+ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
+ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
+ }
+
+static inline float32x4_t
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float32x4_t c0357 = vld1q_f32 (&d->c0);
+ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
+ float32x4_t m2 = vmulq_f32 (m, m);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+ float32x4_t p = vfmaq_f32 (p45, m2, p67);
+ p = vfmaq_f32 (p23, m2, p);
+ p = vfmaq_f32 (d->c1, m, p);
+ p = vmulq_f32 (m2, p);
+ p = vfmaq_f32 (m, m2, p);
+ return vfmaq_f32 (p, m2, q);
+}
+
+static inline float32x4_t
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
+{
+ /* Helper for calculating log(x + 1). */
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+ /* Scale x by exponent manipulation. */
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d);
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
+
+ /* Apply the scaling back. */
+ return vfmaq_f32 (p, scale_back, d->ln2);
+}
+
+#endif // MATH_V_LOG1PF_INLINE_H
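
In single precision the same reduction has the extra wrinkle noted in the comments above: 2^-k alone may be subnormal, so the scale is carried as s = 4 * 2^-k (formed as bits(4.0) - k) and 0.25 * s is used wherever 2^-k is needed. A scalar sketch mirroring the bit constants in V_LOG1PF_CONSTANTS_TABLE:

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch of log1pf_inline's reduction: choose k so that
   m_scale = (1+x)*2^-k - 1 lies in [-0.25, 0.5].  */
static float
log1pf_reduce (float x, float *k_out)
{
  uint32_t xi, mi;
  float m = x + 1.0f;
  memcpy (&xi, &x, sizeof xi);
  memcpy (&mi, &m, sizeof mi);

  /* k, kept as exponent bits: top bits of bits(m) - bits(0.75).     */
  uint32_t k = (mi - 0x3f400000u) & 0xff800000u;

  float s; /* s = 4 * 2^-k, always a normal float.                   */
  uint32_t si = 0x40800000u - k;
  memcpy (&s, &si, sizeof s);

  float m_scale; /* x * 2^-k by exponent manipulation.               */
  uint32_t msi = xi - k;
  memcpy (&m_scale, &msi, sizeof m_scale);
  m_scale += fmaf (0.25f, s, -1.0f); /* add (2^-k - 1).              */

  *k_out = (float) (int32_t) k * 0x1.0p-23f; /* unbiased exponent.   */
  return m_scale;
}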
diff --git a/pl/math/v_log_inline.h b/math/aarch64/advsimd/v_log_inline.h
index 2df00cf4ddf4..770f9e81c195 100644
--- a/pl/math/v_log_inline.h
+++ b/math/aarch64/advsimd/v_log_inline.h
@@ -1,7 +1,7 @@
/*
* Double-precision vector log(x) function - inline version
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -57,8 +57,8 @@ log_lookup (uint64x2_t i)
{
/* Since N is a power of 2, n % N = n & (N - 1). */
struct entry e;
- uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
- uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);
diff --git a/pl/math/v_math.h b/math/aarch64/advsimd/v_math.h
index 1b10929faccc..75cd71cc87a7 100644
--- a/pl/math/v_math.h
+++ b/math/aarch64/advsimd/v_math.h
@@ -1,36 +1,63 @@
/*
* Vector math abstractions.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _V_MATH_H
#define _V_MATH_H
-#ifndef WANT_VMATH
-/* Enable the build of vector math code. */
-# define WANT_VMATH 1
+#if !__aarch64__
+# error "Cannot build without AArch64"
#endif
-#if WANT_VMATH
-
-# if __aarch64__
-# define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-# else
-# error "Cannot build without AArch64"
-# endif
-
-# include <stdint.h>
-# include "math_config.h"
-# if __aarch64__
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f
+#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun
+
+#if USE_GLIBC_ABI
+
+# define HALF_WIDTH_ALIAS_F1(fun) \
+ float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \
+ { \
+ return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \
+ }
+
+# define HALF_WIDTH_ALIAS_F2(fun) \
+ float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \
+ { \
+ return vget_low_f32 ( \
+ _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \
+ }
+
+#else
+# define HALF_WIDTH_ALIAS_F1(fun)
+# define HALF_WIDTH_ALIAS_F2(fun)
+#endif
-# include <arm_neon.h>
+#include <stdint.h>
+#include "math_config.h"
+#include <arm_neon.h>
/* Shorthand helpers for declaring constants. */
-# define V2(X) { X, X }
-# define V4(X) { X, X, X, X }
-# define V8(X) { X, X, X, X, X, X, X, X }
+#define V2(X) \
+ { \
+ X, X \
+ }
+#define V4(X) \
+ { \
+ X, X, X, X \
+ }
+#define V8(X) \
+ { \
+ X, X, X, X, X, X, X, X \
+ }
static inline int
v_any_u16h (uint16x4_t x)
@@ -38,6 +65,12 @@ v_any_u16h (uint16x4_t x)
return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
}
+static inline int
+v_lanes32 (void)
+{
+ return 4;
+}
+
static inline float32x4_t
v_f32 (float x)
{
@@ -54,7 +87,7 @@ v_s32 (int32_t x)
return (int32x4_t) V4 (x);
}
-/* true if any elements of a vector compare result is non-zero. */
+/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u32 (uint32x4_t x)
{
@@ -97,6 +130,11 @@ v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
}
+static inline int
+v_lanes64 (void)
+{
+ return 2;
+}
static inline float64x2_t
v_f64 (double x)
{
@@ -113,20 +151,13 @@ v_s64 (int64_t x)
return (int64x2_t) V2 (x);
}
-/* true if any elements of a vector compare result is non-zero. */
+/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u64 (uint64x2_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (x) != 0;
}
-/* true if all elements of a vector compare result is 1. */
-static inline int
-v_all_u64 (uint64x2_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
-}
static inline float64x2_t
v_lookup_f64 (const double *tab, uint64x2_t idx)
{
@@ -137,7 +168,6 @@ v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
}
-
static inline float64x2_t
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
@@ -169,7 +199,4 @@ v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
}
-# endif
-#endif
-
#endif
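
For readers unfamiliar with the mangled names: the V_NAME_* macros follow the AArch64 vector function ABI, where _ZGV is the prefix, 'n' selects AdvSIMD, 'N' means unmasked, the digit is the lane count, and each 'v' marks a vector argument. A hypothetical routine "foo" (a placeholder, not a function in this library) would use them as follows:

/* V_NAME_F1 (foo) expands to _ZGVnN4v_foof: AdvSIMD, unmasked,
   4 lanes, one vector argument.  */
float32x4_t VPCS_ATTR V_NAME_F1 (foo) (float32x4_t x)
{
  return x;
}

/* Under USE_GLIBC_ABI this emits the 2-lane _ZGVnN2v_foof wrapper,
   implemented by widening to 4 lanes and taking the low half.  */
HALF_WIDTH_ALIAS_F1 (foo)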
diff --git a/pl/math/poly_advsimd_f32.h b/math/aarch64/advsimd/v_poly_f32.h
index 438e153dff90..9a9c5c1ac15b 100644
--- a/pl/math/poly_advsimd_f32.h
+++ b/math/aarch64/advsimd/v_poly_f32.h
@@ -2,12 +2,12 @@
* Helpers for evaluating polynomials on single-precision AdvSIMD input, using
* various schemes.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifndef PL_MATH_POLY_ADVSIMD_F32_H
-#define PL_MATH_POLY_ADVSIMD_F32_H
+#ifndef MATH_POLY_ADVSIMD_F32_H
+#define MATH_POLY_ADVSIMD_F32_H
#include <arm_neon.h>
diff --git a/pl/math/poly_advsimd_f64.h b/math/aarch64/advsimd/v_poly_f64.h
index 7ea249a91225..4331bfbd03b0 100644
--- a/pl/math/poly_advsimd_f64.h
+++ b/math/aarch64/advsimd/v_poly_f64.h
@@ -2,12 +2,12 @@
* Helpers for evaluating polynomials on double-precision AdvSIMD input, using
* various schemes.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifndef PL_MATH_POLY_ADVSIMD_F64_H
-#define PL_MATH_POLY_ADVSIMD_F64_H
+#ifndef MATH_POLY_ADVSIMD_F64_H
+#define MATH_POLY_ADVSIMD_F64_H
#include <arm_neon.h>
diff --git a/pl/math/v_sincos_common.h b/math/aarch64/advsimd/v_sincos_common.h
index ee7937e0785a..14227d9339a8 100644
--- a/pl/math/v_sincos_common.h
+++ b/math/aarch64/advsimd/v_sincos_common.h
@@ -1,12 +1,12 @@
/*
* Core approximation for double-precision vector sincos
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_poly_f64.h"
static const struct v_sincos_data
{
diff --git a/pl/math/v_sincosf_common.h b/math/aarch64/advsimd/v_sincosf_common.h
index 8239bd9f0176..7c29eded14d6 100644
--- a/pl/math/v_sincosf_common.h
+++ b/math/aarch64/advsimd/v_sincosf_common.h
@@ -1,7 +1,7 @@
/*
* Core approximation for single-precision vector sincos
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/math/aarch64/advsimd/v_sincospi_common.h b/math/aarch64/advsimd/v_sincospi_common.h
new file mode 100644
index 000000000000..438b141b9174
--- /dev/null
+++ b/math/aarch64/advsimd/v_sincospi_common.h
@@ -0,0 +1,64 @@
+/*
+ * Helper for Double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "v_poly_f64.h"
+
+static const struct v_sincospi_data
+{
+ float64x2_t poly[10], range_val;
+} v_sincospi_data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+ .range_val = V2 (0x1p63),
+};
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using separate argument reduction and shared low-order
+ polynomials.
+ Approximation for vector double-precision sincospi(x).
+ Maximum Error 3.09 ULP:
+ _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+ want 0x1.fd54d0b327cf4p-1
+ Maximum Error 3.16 ULP:
+ _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+ want 0x1.fd2da484ff402p-1. */
+static inline float64x2x2_t
+v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d)
+{
+ /* If r is odd, the sign of the result should be inverted for sinpi
+ and reintroduced for cospi. */
+ uint64x2_t cmp = vcgeq_f64 (x, d->range_val);
+ uint64x2_t odd = vshlq_n_u64 (
+ vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63);
+
+ /* r = x - rint(x). */
+ float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x));
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float64x2_t sr2 = vmulq_f64 (sr, sr);
+ float64x2_t sr4 = vmulq_f64 (sr2, sr2);
+ float64x2_t cr2 = vmulq_f64 (cr, cr);
+ float64x2_t cr4 = vmulq_f64 (cr2, cr2);
+
+ float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr);
+ float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr);
+
+ float64x2_t sinpix
+ = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd));
+
+ float64x2_t cospix
+ = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd));
+
+ return (float64x2x2_t){ sinpix, cospix };
+}
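
A scalar sketch of the shared reduction may help here, with libm's sin standing in for the pairwise-Horner polynomial (illustrative only; the |x| >= range_val masking is omitted):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Illustrative scalar version of v_sincospi_inline.  */
static void
sincospi_sketch (double x, double *sinpix, double *cospix)
{
  double n = round (x); /* nearest, ties away, like vrndaq/vcvtaq.   */
  /* If round(x) is odd, sinpi changes sign; the same bit fixes cospi. */
  uint64_t odd = ((uint64_t) (int64_t) n & 1) << 63;

  double sr = x - n;           /* r = x - rint(x), in [-1/2, 1/2].   */
  double cr = 0.5 - fabs (sr); /* cospi(x) = sinpi(0.5 - |r|).       */

  double ss = sin (M_PI * sr); /* placeholder for the polynomial.    */
  double cc = sin (M_PI * cr);

  uint64_t si, ci;
  memcpy (&si, &ss, sizeof si);
  memcpy (&ci, &cc, sizeof ci);
  si ^= odd;
  ci ^= odd;
  memcpy (sinpix, &si, sizeof *sinpix);
  memcpy (cospix, &ci, sizeof *cospix);
}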
diff --git a/math/aarch64/advsimd/v_sincospif_common.h b/math/aarch64/advsimd/v_sincospif_common.h
new file mode 100644
index 000000000000..8d4177dd871e
--- /dev/null
+++ b/math/aarch64/advsimd/v_sincospif_common.h
@@ -0,0 +1,57 @@
+/*
+ * Helper for Single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+const static struct v_sincospif_data
+{
+ float32x4_t poly[6], range_val;
+} v_sincospif_data = {
+  /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+ .range_val = V4 (0x1p31f),
+};
+
+/* Single-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.04 ULP:
+ _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ Worst-case error for cos is 3.18 ULP:
+ _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+static inline float32x4x2_t
+v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d)
+{
+ /* If r is odd, the sign of the result should be inverted for sinpi and
+ reintroduced for cospi. */
+ uint32x4_t cmp = vcgeq_f32 (x, d->range_val);
+ uint32x4_t odd = vshlq_n_u32 (
+ vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31);
+
+ /* r = x - rint(x). */
+ float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x));
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t sr2 = vmulq_f32 (sr, sr);
+ float32x4_t sr4 = vmulq_f32 (sr2, sr2);
+ float32x4_t cr2 = vmulq_f32 (cr, cr);
+ float32x4_t cr4 = vmulq_f32 (cr2, cr2);
+
+ float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr);
+ float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr);
+
+ float32x4_t sinpix
+ = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd));
+ float32x4_t cospix
+ = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd));
+
+ return (float32x4x2_t){ sinpix, cospix };
+}
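
For context, a hypothetical caller of this helper might look like the following sketch; the real _ZGVnN4v_sincospif entry points also add special-case handling for large and non-finite lanes before storing the results.

/* Hypothetical caller sketch for v_sincospif_inline.  */
static void
sincospif_example (float32x4_t x, float out_sin[4], float out_cos[4])
{
  const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data);
  float32x4x2_t sc = v_sincospif_inline (x, d);
  vst1q_f32 (out_sin, sc.val[0]);
  vst1q_f32 (out_cos, sc.val[1]);
}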
diff --git a/pl/math/cospi_3u1.c b/math/aarch64/cospi_3u5.c
index 4a688a076829..4131f6c816a1 100644
--- a/pl/math/cospi_3u1.c
+++ b/math/aarch64/cospi_3u5.c
@@ -1,14 +1,14 @@
/*
* Double-precision scalar cospi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#include "poly_scalar_f64.h"
/* Taylor series coefficients for sin(pi * x).
@@ -29,9 +29,9 @@ static const double poly[]
cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1
want 0x1.fffffffffd16ep-1. */
double
-cospi (double x)
+arm_math_cospi (double x)
{
- if (isinf (x))
+ if (isinf (x) || isnan (x))
return __math_invalid (x);
double ax = asdouble (asuint64 (x) & ~0x8000000000000000);
@@ -81,9 +81,18 @@ cospi (double x)
return asdouble (asuint64 (y) ^ sign);
}
-PL_SIG (S, D, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (cospi, 2.63)
-PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000)
-PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000)
+#if WANT_EXPERIMENTAL_MATH
+double
+cospi (double x)
+{
+ return arm_math_cospi (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_cospi, 2.63)
+TEST_SYM_INTERVAL (arm_math_cospi, 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0.5, 0x1p51f, 10000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0x1p51f, inf, 10000)
+#endif
diff --git a/pl/math/cospif_2u6.c b/math/aarch64/cospif_2u6.c
index d78a98ed8b2d..eb5b75402a63 100644
--- a/pl/math/cospif_2u6.c
+++ b/math/aarch64/cospif_2u6.c
@@ -1,14 +1,14 @@
/*
* Single-precision scalar cospi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
/* Taylor series coefficients for sin(pi * x). */
#define C0 0x1.921fb6p1f
@@ -25,9 +25,9 @@
cospif(0x1.37e844p-4) got 0x1.f16b3p-1
want 0x1.f16b2ap-1. */
float
-cospif (float x)
+arm_math_cospif (float x)
{
- if (isinf (x))
+ if (isinf (x) || isnan (x))
return __math_invalidf (x);
float ax = asfloat (asuint (x) & ~0x80000000);
@@ -76,9 +76,18 @@ cospif (float x)
return asfloat (asuint (y * r) ^ sign);
}
-PL_SIG (S, F, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (cospif, 2.15)
-PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000)
-PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000)
+#if WANT_EXPERIMENTAL_MATH
+float
+cospif (float x)
+{
+ return arm_math_cospif (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_cospif, 2.15)
+TEST_SYM_INTERVAL (arm_math_cospif, 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (arm_math_cospif, 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (arm_math_cospif, 0.5, 0x1p22f, 10000)
+TEST_SYM_INTERVAL (arm_math_cospif, 0x1p22f, inf, 10000)
+#endif
diff --git a/pl/README.contributors b/math/aarch64/experimental/README.contributors
index 3af9b1fc7741..abb749485ba3 100644
--- a/pl/README.contributors
+++ b/math/aarch64/experimental/README.contributors
@@ -5,7 +5,6 @@ glibc-specific conventions need not be followed.
The requirements for portable code apply to non-portable code with the
following differences:
-
1. Worst-case ULP error should be encoded in filenames (e.g. sin_u35.c). There
are no specific restrictions on acceptable ULP error, but if functions
provide significantly less accuracy than portable equivalents then a clear
@@ -15,9 +14,3 @@ following differences:
2. Functions are assumed to support round-to-nearest mode by default, unless
stated; other rounding modes are not required to be provided.
-
-3. Handling of special cases may be relaxed for vector functions. Checking
- whether each vector lane contains special values such as NaN, Inf or
- denormal numbers can prove too costly for vector functions. This is often
- not required since vector functions are typically used along with aggressive
- compiler optimization flags.
diff --git a/pl/math/acos_2u.c b/math/aarch64/experimental/acos_2u.c
index 9ec6894f1d81..062215c92248 100644
--- a/pl/math/acos_2u.c
+++ b/math/aarch64/experimental/acos_2u.c
@@ -1,23 +1,23 @@
/*
* Double-precision acos(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "poly_scalar_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define AbsMask (0x7fffffffffffffff)
-#define Half (0x3fe0000000000000)
-#define One (0x3ff0000000000000)
-#define PiOver2 (0x1.921fb54442d18p+0)
-#define Pi (0x1.921fb54442d18p+1)
-#define Small (0x3c90000000000000) /* 2^-53. */
-#define Small16 (0x3c90)
-#define QNaN (0x7ff8)
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+#define One 0x3ff0000000000000
+#define PiOver2 0x1.921fb54442d18p+0
+#define Pi 0x1.921fb54442d18p+1
+#define Small 0x3c90000000000000 /* 2^-53. */
+#define Small16 0x3c90
+#define QNaN 0x7ff8
/* Fast implementation of double-precision acos(x) based on polynomial
approximation of double-precision asin(x).
@@ -29,8 +29,8 @@
acos(x) = pi/2 - asin(x)
- and use an order 11 polynomial P such that the final approximation of asin is
- an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
+ and use an order 11 polynomial P such that the final approximation of asin
+ is an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
The largest observed error in this region is 1.18 ulps,
acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
@@ -90,11 +90,11 @@ acos (double x)
return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p;
}
-PL_SIG (S, D, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (acos, 1.02)
-PL_TEST_INTERVAL (acos, 0, Small, 5000)
-PL_TEST_INTERVAL (acos, Small, 0.5, 50000)
-PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (acos, -0, -inf, 20000)
+TEST_SIG (S, D, 1, acos, -1.0, 1.0)
+TEST_ULP (acos, 1.02)
+TEST_INTERVAL (acos, 0, Small, 5000)
+TEST_INTERVAL (acos, Small, 0.5, 50000)
+TEST_INTERVAL (acos, 0.5, 1.0, 50000)
+TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (acos, 0x1p11, inf, 20000)
+TEST_INTERVAL (acos, -0, -inf, 20000)
diff --git a/pl/math/acosf_1u4.c b/math/aarch64/experimental/acosf_1u4.c
index 6dde422ef85a..d207f5e89f26 100644
--- a/pl/math/acosf_1u4.c
+++ b/math/aarch64/experimental/acosf_1u4.c
@@ -1,23 +1,23 @@
/*
* Single-precision acos(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define AbsMask (0x7fffffff)
-#define Half (0x3f000000)
-#define One (0x3f800000)
-#define PiOver2f (0x1.921fb6p+0f)
-#define Pif (0x1.921fb6p+1f)
-#define Small (0x32800000) /* 2^-26. */
-#define Small12 (0x328)
-#define QNaN (0x7fc)
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define PiOver2f 0x1.921fb6p+0f
+#define Pif 0x1.921fb6p+1f
+#define Small 0x32800000 /* 2^-26. */
+#define Small12 0x328
+#define QNaN 0x7fc
/* Fast implementation of single-precision acos(x) based on polynomial
approximation of single-precision asin(x).
@@ -89,11 +89,11 @@ acosf (float x)
return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p;
}
-PL_SIG (S, F, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (acosf, 0.82)
-PL_TEST_INTERVAL (acosf, 0, Small, 5000)
-PL_TEST_INTERVAL (acosf, Small, 0.5, 50000)
-PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (acosf, -0, -inf, 20000)
+TEST_SIG (S, F, 1, acos, -1.0, 1.0)
+TEST_ULP (acosf, 0.82)
+TEST_INTERVAL (acosf, 0, Small, 5000)
+TEST_INTERVAL (acosf, Small, 0.5, 50000)
+TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
+TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
+TEST_INTERVAL (acosf, -0, -inf, 20000)
diff --git a/pl/math/acosh_3u.c b/math/aarch64/experimental/acosh_3u.c
index 4e2cb6737ba8..19da82f4f3e5 100644
--- a/pl/math/acosh_3u.c
+++ b/math/aarch64/experimental/acosh_3u.c
@@ -1,31 +1,26 @@
/*
* Double-precision acosh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#include "mathlib.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Ln2 (0x1.62e42fefa39efp-1)
#define MinusZero (0x8000000000000000)
#define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511). */
#define Two (0x4000000000000000) /* asuint64(2.0). */
-double
-optr_aor_log_f64 (double);
-
-double
-log1p (double);
-
/* acosh approximation using a variety of approaches on different intervals:
acosh(x) = ln(x + sqrt(x * x - 1)).
- x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
- close enough to x that we can calculate the result by ln(2x) == ln(x) +
+ x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1)
+ is close enough to x that we can calculate the result by ln(2x) == ln(x) +
ln(2). The greatest observed error in this region is 0.98 ULP:
acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9
want 0x1.28066a11a7c8p+9.
@@ -48,19 +43,19 @@ acosh (double x)
return __math_invalid (x);
if (unlikely (ix >= SquareLim))
- return optr_aor_log_f64 (x) + Ln2;
+ return log (x) + Ln2;
if (ix >= Two)
- return optr_aor_log_f64 (x + sqrt (x * x - 1));
+ return log (x + sqrt (x * x - 1));
double xm1 = x - 1;
return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1));
}
-PL_SIG (S, D, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (acosh, 2.19)
-PL_TEST_INTERVAL (acosh, 0, 1, 10000)
-PL_TEST_INTERVAL (acosh, 1, 2, 100000)
-PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
-PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
-PL_TEST_INTERVAL (acosh, -0, -inf, 10000)
+TEST_SIG (S, D, 1, acosh, 1.0, 10.0)
+TEST_ULP (acosh, 2.19)
+TEST_INTERVAL (acosh, 0, 1, 10000)
+TEST_INTERVAL (acosh, 1, 2, 100000)
+TEST_INTERVAL (acosh, 2, 0x1p511, 100000)
+TEST_INTERVAL (acosh, 0x1p511, inf, 100000)
+TEST_INTERVAL (acosh, -0, -inf, 10000)
diff --git a/pl/math/acoshf_2u8.c b/math/aarch64/experimental/acoshf_2u8.c
index c9cded7fd2ff..a46b310ee312 100644
--- a/pl/math/acoshf_2u8.c
+++ b/math/aarch64/experimental/acoshf_2u8.c
@@ -1,27 +1,19 @@
/*
* Single-precision acosh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Ln2 (0x1.62e4p-1f)
#define MinusZero 0x80000000
#define SquareLim 0x5f800000 /* asuint(0x1p64). */
#define Two 0x40000000
-/* Single-precision log from math/. */
-float
-optr_aor_log_f32 (float);
-
-/* Single-precision log(1+x) from pl/math. */
-float
-log1pf (float);
-
/* acoshf approximation using a variety of approaches on different intervals:
x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
@@ -45,19 +37,19 @@ acoshf (float x)
return __math_invalidf (x);
if (unlikely (ix >= SquareLim))
- return optr_aor_log_f32 (x) + Ln2;
+ return logf (x) + Ln2;
if (ix > Two)
- return optr_aor_log_f32 (x + sqrtf (x * x - 1));
+ return logf (x + sqrtf (x * x - 1));
float xm1 = x - 1;
return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1));
}
-PL_SIG (S, F, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (acoshf, 2.30)
-PL_TEST_INTERVAL (acoshf, 0, 1, 100)
-PL_TEST_INTERVAL (acoshf, 1, 2, 10000)
-PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
-PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
-PL_TEST_INTERVAL (acoshf, -0, -inf, 10000)
+TEST_SIG (S, F, 1, acosh, 1.0, 10.0)
+TEST_ULP (acoshf, 2.30)
+TEST_INTERVAL (acoshf, 0, 1, 100)
+TEST_INTERVAL (acoshf, 1, 2, 10000)
+TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
+TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
+TEST_INTERVAL (acoshf, -0, -inf, 10000)
diff --git a/pl/math/v_erfinv_25u.c b/math/aarch64/experimental/advsimd/erfinv_25u.c
index 654a7336e85b..2fa2f0beb8b7 100644
--- a/pl/math/v_erfinv_25u.c
+++ b/math/aarch64/experimental/advsimd/erfinv_25u.c
@@ -1,15 +1,15 @@
/*
* Double-precision inverse error function (AdvSIMD variant).
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_test.h"
+#include "test_defs.h"
#include "mathlib.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "poly_advsimd_f64.h"
+#include "test_sig.h"
+#include "v_poly_f64.h"
#define V_LOG_INLINE_POLY_ORDER 4
#include "v_log_inline.h"
@@ -22,7 +22,7 @@ const static struct data
can be taken. */
double P[8][2], Q[7][2];
float64x2_t tailshift;
- uint8x16_t idx;
+ uint8_t idx[16];
struct v_log_inline_data log_tbl;
float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
} data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
@@ -58,7 +58,7 @@ const static struct data
V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
.tailshift = V2 (-0.87890625),
- .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
.log_tbl = V_LOG_CONSTANTS };
static inline float64x2_t
@@ -128,7 +128,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375));
uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8));
- uint8x16_t idx = vaddq_u8 (d->idx, off);
+ uint8x16_t idx = vaddq_u8 (vld1q_u8 (d->idx), off);
float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625));
t = vfmaq_f64 (t, x, x);
@@ -150,12 +150,17 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
return vdivq_f64 (p, q);
}
-PL_SIG (V, D, 1, erfinv, -0.99, 0.99)
-PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8)
+#if USE_MPFR
+# warning Not generating tests for _ZGVnN2v_erfinv, as MPFR has no suitable reference
+#else
+TEST_SIG (V, D, 1, erfinv, -0.99, 0.99)
+TEST_ULP (V_NAME_D1 (erfinv), 24.8)
+TEST_DISABLE_FENV (V_NAME_D1 (erfinv))
+TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
/* Test with control lane in each interval. */
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
- 0.5)
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
- 0.8)
-PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
- 0.95)
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.95)
+#endif
diff --git a/pl/math/v_erfinvf_5u.c b/math/aarch64/experimental/advsimd/erfinvf_5u.c
index 5a6800b86ae9..254d50feb289 100644
--- a/pl/math/v_erfinvf_5u.c
+++ b/math/aarch64/experimental/advsimd/erfinvf_5u.c
@@ -1,13 +1,13 @@
/*
* Single-precision inverse error function (AdvSIMD variant).
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
#include "v_logf_inline.h"
const static struct data
@@ -24,14 +24,15 @@ const static struct data
P_10 and Q_10 are also stored in homogeneous vectors to allow better
memory access when no lanes are in a tail region. */
- float32x4_t Plo, PQ, Qhi, P29_3, tailshift;
+ float Plo[4], PQ[4], Qhi[4];
+ float32x4_t P29_3, tailshift;
float32x4_t P_50[6], Q_50[2];
float32x4_t P_10[3], Q_10[3];
- uint8x16_t idxhi, idxlo;
+ uint8_t idxhi[16], idxlo[16];
struct v_logf_data logf_tbl;
} data = {
- .idxlo = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
- .idxhi = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
+ .idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 },
+ .idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 },
.P29_3 = V4 (0x1.b13626p-2),
.tailshift = V4 (-0.87890625),
.Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 },
@@ -86,7 +87,7 @@ lookup (float32x4_t tbl, uint8x16_t idx)
tail region:
_ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0
want 0x1.b4793ap+0 . */
-float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erfinv) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -124,18 +125,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores
two pairs of coeffs, so we need two idx vectors - one for each pair. */
uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4));
- uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off);
- uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off);
+ uint8x16_t idx_lo = vaddq_u8 (vld1q_u8 (d->idxlo), off);
+ uint8x16_t idx_hi = vaddq_u8 (vld1q_u8 (d->idxhi), off);
/* Load the tables. */
- float32x4_t p_lo = d->Plo;
- float32x4_t pq = d->PQ;
- float32x4_t qhi = d->Qhi;
+ float32x4_t plo = vld1q_f32 (d->Plo);
+ float32x4_t pq = vld1q_f32 (d->PQ);
+ float32x4_t qhi = vld1q_f32 (d->Qhi);
/* Do the lookup (and calculate p3 by masking non-tail lanes). */
float32x4_t p3 = vreinterpretq_f32_u32 (
vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3)));
- float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi),
+ float32x4_t p0 = lookup (plo, idx_lo), p1 = lookup (plo, idx_hi),
p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi),
q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi);
@@ -155,9 +156,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
return vdivq_f32 (p, q);
}
-PL_SIG (V, F, 1, erfinv, -0.99, 0.99)
-PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49)
+HALF_WIDTH_ALIAS_F1 (erfinv)
+
+#if USE_MPFR
+# warning Not generating tests for _ZGVnN4v_erfinvf, as MPFR has no suitable reference
+#else
+TEST_SIG (V, F, 1, erfinv, -0.99, 0.99)
+TEST_DISABLE_FENV (V_NAME_F1 (erfinv))
+TEST_ULP (V_NAME_F1 (erfinv), 4.49)
+TEST_SYM_INTERVAL (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000)
/* Test with control lane in each interval. */
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5)
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8)
-PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95)
+TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.95)
+#endif
diff --git a/pl/math/v_logf_inline.h b/math/aarch64/experimental/advsimd/v_logf_inline.h
index c00fe0909afc..3f4534173289 100644
--- a/pl/math/v_logf_inline.h
+++ b/math/aarch64/experimental/advsimd/v_logf_inline.h
@@ -1,7 +1,7 @@
/*
* Single-precision vector log function - inline version
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/asin_3u.c b/math/aarch64/experimental/asin_3u.c
index 0b50995449ce..56e63e451ba1 100644
--- a/pl/math/asin_3u.c
+++ b/math/aarch64/experimental/asin_3u.c
@@ -1,22 +1,22 @@
/*
* Double-precision asin(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f64.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
-#define AbsMask (0x7fffffffffffffff)
-#define Half (0x3fe0000000000000)
-#define One (0x3ff0000000000000)
-#define PiOver2 (0x1.921fb54442d18p+0)
-#define Small (0x3e50000000000000) /* 2^-26. */
-#define Small16 (0x3e50)
-#define QNaN (0x7ff8)
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+#define One 0x3ff0000000000000
+#define PiOver2 0x1.921fb54442d18p+0
+#define Small 0x3e50000000000000 /* 2^-26. */
+#define Small16 0x3e50
+#define QNaN 0x7ff8
/* Fast implementation of double-precision asin(x) based on polynomial
approximation.
@@ -54,8 +54,8 @@
asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
The largest observed error in this region is 2.69 ulps,
- asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
- want 0x1.110d7e85fdd53p-1. */
+ asin(0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
+ want 0x1.1111dd54ddf99p-1. */
double
asin (double x)
{
@@ -96,11 +96,11 @@ asin (double x)
return asdouble (asuint64 (y) | sign);
}
-PL_SIG (S, D, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (asin, 2.19)
-PL_TEST_INTERVAL (asin, 0, Small, 5000)
-PL_TEST_INTERVAL (asin, Small, 0.5, 50000)
-PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (asin, -0, -inf, 20000)
+TEST_SIG (S, D, 1, asin, -1.0, 1.0)
+TEST_ULP (asin, 2.20)
+TEST_INTERVAL (asin, 0, Small, 5000)
+TEST_INTERVAL (asin, Small, 0.5, 50000)
+TEST_INTERVAL (asin, 0.5, 1.0, 50000)
+TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (asin, 0x1p11, inf, 20000)
+TEST_INTERVAL (asin, -0, -inf, 20000)
diff --git a/pl/math/asin_data.c b/math/aarch64/experimental/asin_data.c
index b5517731c7f4..60ab476e7ec9 100644
--- a/pl/math/asin_data.c
+++ b/math/aarch64/experimental/asin_data.c
@@ -1,7 +1,7 @@
/*
* Coefficients for single-precision asin(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/asinf_2u5.c b/math/aarch64/experimental/asinf_2u5.c
index ec608146ff66..1136da01550e 100644
--- a/pl/math/asinf_2u5.c
+++ b/math/aarch64/experimental/asinf_2u5.c
@@ -1,22 +1,22 @@
/*
* Single-precision asin(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
-#define AbsMask (0x7fffffff)
-#define Half (0x3f000000)
-#define One (0x3f800000)
-#define PiOver2f (0x1.921fb6p+0f)
-#define Small (0x39800000) /* 2^-12. */
-#define Small12 (0x398)
-#define QNaN (0x7fc)
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define PiOver2f 0x1.921fb6p+0f
+#define Small 0x39800000 /* 2^-12. */
+#define Small12 0x398
+#define QNaN 0x7fc
/* Fast implementation of single-precision asin(x) based on polynomial
approximation.
@@ -90,11 +90,11 @@ asinf (float x)
return asfloat (asuint (y) | sign);
}
-PL_SIG (S, F, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (asinf, 1.91)
-PL_TEST_INTERVAL (asinf, 0, Small, 5000)
-PL_TEST_INTERVAL (asinf, Small, 0.5, 50000)
-PL_TEST_INTERVAL (asinf, 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (asinf, 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (asinf, -0, -inf, 20000)
+TEST_SIG (S, F, 1, asin, -1.0, 1.0)
+TEST_ULP (asinf, 1.91)
+TEST_INTERVAL (asinf, 0, Small, 5000)
+TEST_INTERVAL (asinf, Small, 0.5, 50000)
+TEST_INTERVAL (asinf, 0.5, 1.0, 50000)
+TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (asinf, 0x1p11, inf, 20000)
+TEST_INTERVAL (asinf, -0, -inf, 20000)
diff --git a/pl/math/asinf_data.c b/math/aarch64/experimental/asinf_data.c
index 1652025e2920..15f331dde5a7 100644
--- a/pl/math/asinf_data.c
+++ b/math/aarch64/experimental/asinf_data.c
@@ -1,7 +1,7 @@
/*
* Coefficients for single-precision asin(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/asinh_2u5.c b/math/aarch64/experimental/asinh_2u5.c
index b7fc81a2b94f..9d2d160a1453 100644
--- a/pl/math/asinh_2u5.c
+++ b/math/aarch64/experimental/asinh_2u5.c
@@ -1,13 +1,14 @@
/*
* Double-precision asinh(x) function
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#include "mathlib.h"
#include "poly_scalar_f64.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define AbsMask 0x7fffffffffffffff
#define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */
@@ -15,9 +16,6 @@
#define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */
#define Ln2 0x1.62e42fefa39efp-1
-double
-optr_aor_log_f64 (double);
-
/* Scalar double-precision asinh implementation. This routine uses different
approaches on different intervals:
@@ -67,19 +65,18 @@ asinh (double x)
if (unlikely (ia >= Exp511))
{
- return asdouble (asuint64 (optr_aor_log_f64 (ax) + Ln2) | sign);
+ return asdouble (asuint64 (log (ax) + Ln2) | sign);
}
- return asdouble (asuint64 (optr_aor_log_f64 (ax + sqrt (ax * ax + 1)))
- | sign);
+ return asdouble (asuint64 (log (ax + sqrt (ax * ax + 1))) | sign);
}
-PL_SIG (S, D, 1, asinh, -10.0, 10.0)
-PL_TEST_ULP (asinh, 1.54)
-PL_TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000)
-PL_TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000)
-PL_TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000)
-PL_TEST_INTERVAL (asinh, 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (asinh, -1.0, -100.0, 10000)
-PL_TEST_INTERVAL (asinh, 100.0, inf, 50000)
-PL_TEST_INTERVAL (asinh, -100.0, -inf, 10000)
+TEST_SIG (S, D, 1, asinh, -10.0, 10.0)
+TEST_ULP (asinh, 1.54)
+TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000)
+TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000)
+TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000)
+TEST_INTERVAL (asinh, 1.0, 100.0, 40000)
+TEST_INTERVAL (asinh, -1.0, -100.0, 10000)
+TEST_INTERVAL (asinh, 100.0, inf, 50000)
+TEST_INTERVAL (asinh, -100.0, -inf, 10000)
diff --git a/pl/math/asinh_data.c b/math/aarch64/experimental/asinh_data.c
index 073b19799bda..7afaf6960130 100644
--- a/pl/math/asinh_data.c
+++ b/math/aarch64/experimental/asinh_data.c
@@ -1,7 +1,7 @@
/*
* Double-precision polynomial coefficients for scalar asinh(x)
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -13,10 +13,11 @@
Note P is evaluated on even powers of x only. See tools/asinh.sollya for the
algorithm used to generate these coefficients. */
const struct asinh_data __asinh_data
- = {.poly
- = {-0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
- 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
- -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
- 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
- -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
- 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18}};
+ = { .poly
+ = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
+ 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
+ -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
+ 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
+ -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
+ 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14,
+ 0x1.93d4ba83d34dap-18 } };
diff --git a/pl/math/asinhf_3u5.c b/math/aarch64/experimental/asinhf_3u5.c
index ec26b80ec2ec..92c6dfd9b43d 100644
--- a/pl/math/asinhf_3u5.c
+++ b/math/aarch64/experimental/asinhf_3u5.c
@@ -1,14 +1,14 @@
/*
* Single-precision asinh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define AbsMask (0x7fffffff)
#define SqrtFltMax (0x1.749e96p+10f)
@@ -16,9 +16,6 @@
#define One (0x3f8)
#define ExpM12 (0x398)
-float
-optr_aor_log_f32 (float);
-
/* asinhf approximation using a variety of approaches on different intervals:
|x| < 2^-12: Return x. Function is exactly rounded in this region.
@@ -62,15 +59,15 @@ asinhf (float x)
if (unlikely (ax > SqrtFltMax))
{
- return asfloat (asuint (optr_aor_log_f32 (ax) + Ln2) | sign);
+ return asfloat (asuint (logf (ax) + Ln2) | sign);
}
- return asfloat (asuint (optr_aor_log_f32 (ax + sqrtf (ax * ax + 1))) | sign);
+ return asfloat (asuint (logf (ax + sqrtf (ax * ax + 1))) | sign);
}
-PL_SIG (S, F, 1, asinh, -10.0, 10.0)
-PL_TEST_ULP (asinhf, 2.9)
-PL_TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000)
-PL_TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000)
-PL_TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000)
+TEST_SIG (S, F, 1, asinh, -10.0, 10.0)
+TEST_ULP (asinhf, 2.9)
+TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000)
+TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000)
+TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000)
diff --git a/math/aarch64/experimental/asinhf_data.c b/math/aarch64/experimental/asinhf_data.c
new file mode 100644
index 000000000000..5ed261ba835b
--- /dev/null
+++ b/math/aarch64/experimental/asinhf_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients for single-precision asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asinhf(x) directly in [2^-12, 1]. See tools/asinhf.sollya for
+   details of how these coeffs were generated. */
+const struct asinhf_data __asinhf_data
+ = { .coeffs = { -0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f,
+ 0x1.3a81dcp-4f, 0x1.65bbaap-10f, -0x1.057f1p-4f,
+ 0x1.6c1d46p-5f, -0x1.4cafe8p-7f } };
diff --git a/pl/math/atan2_2u5.c b/math/aarch64/experimental/atan2_2u5.c
index c909ac99fa22..518e34589e5b 100644
--- a/pl/math/atan2_2u5.c
+++ b/math/aarch64/experimental/atan2_2u5.c
@@ -1,7 +1,7 @@
/*
* Double-precision scalar atan2(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,8 +9,8 @@
#include "atan_common.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Pi (0x1.921fb54442d18p+1)
#define PiOver2 (0x1.921fb54442d18p+0)
@@ -79,8 +79,8 @@ atan2 (double y, double x)
if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND))
return sign_y ? -PiOver2 : PiOver2;
- /* Special case for either x is INF or (x, y) is very close to x axis and x is
- negative. */
+ /* Special case for either x is INF or (x, y) is very close to x axis and x
+ is negative. */
if (unlikely (iax == 0x7ff0000000000000
|| (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2)))
{
@@ -150,10 +150,10 @@ atan2 (double y, double x)
}
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
-PL_SIG (S, D, 2, atan2)
-PL_TEST_ULP (atan2, 1.78)
-PL_TEST_INTERVAL (atan2, -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (atan2, -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (atan2, 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (atan2, 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (atan2, 1e6, 1e32, 40000)
+TEST_SIG (S, D, 2, atan2)
+TEST_ULP (atan2, 1.78)
+TEST_INTERVAL (atan2, -10.0, 10.0, 50000)
+TEST_INTERVAL (atan2, -1.0, 1.0, 40000)
+TEST_INTERVAL (atan2, 0.0, 1.0, 40000)
+TEST_INTERVAL (atan2, 1.0, 100.0, 40000)
+TEST_INTERVAL (atan2, 1e6, 1e32, 40000)
diff --git a/pl/math/atan2f_3u.c b/math/aarch64/experimental/atan2f_3u.c
index 38e1df59c102..245ba551566c 100644
--- a/pl/math/atan2f_3u.c
+++ b/math/aarch64/experimental/atan2f_3u.c
@@ -1,7 +1,7 @@
/*
* Single-precision scalar atan2(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,8 +9,8 @@
#include "atanf_common.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Pi (0x1.921fb6p+1f)
#define PiOver2 (0x1.921fb6p+0f)
@@ -19,8 +19,8 @@
/* We calculate atan2f by P(n/d), where n and d are similar to the input
arguments, and P is a polynomial. The polynomial may underflow.
- POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and d
- for which P underflows, and is used to special-case such inputs. */
+ POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and
+ d for which P underflows, and is used to special-case such inputs. */
#define POLY_UFLOW_BOUND 24
static inline int32_t
@@ -158,10 +158,10 @@ atan2f (float y, float x)
}
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
-PL_SIG (S, F, 2, atan2)
-PL_TEST_ULP (atan2f, 2.4)
-PL_TEST_INTERVAL (atan2f, -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (atan2f, -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (atan2f, 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (atan2f, 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (atan2f, 1e6, 1e32, 40000)
+TEST_SIG (S, F, 2, atan2)
+TEST_ULP (atan2f, 2.4)
+TEST_INTERVAL (atan2f, -10.0, 10.0, 50000)
+TEST_INTERVAL (atan2f, -1.0, 1.0, 40000)
+TEST_INTERVAL (atan2f, 0.0, 1.0, 40000)
+TEST_INTERVAL (atan2f, 1.0, 100.0, 40000)
+TEST_INTERVAL (atan2f, 1e6, 1e32, 40000)
diff --git a/pl/math/atan_2u5.c b/math/aarch64/experimental/atan_2u5.c
index ee4770101758..9c9c77d98cd3 100644
--- a/pl/math/atan_2u5.c
+++ b/math/aarch64/experimental/atan_2u5.c
@@ -1,12 +1,12 @@
/*
* Double-precision atan(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#include "atan_common.h"
#define AbsMask 0x7fffffffffffffff
@@ -63,11 +63,11 @@ atan (double x)
return asdouble (asuint64 (y) ^ sign);
}
-PL_SIG (S, D, 1, atan, -10.0, 10.0)
-PL_TEST_ULP (atan, 1.78)
-PL_TEST_INTERVAL (atan, 0, 0x1p-30, 10000)
-PL_TEST_INTERVAL (atan, -0, -0x1p-30, 1000)
-PL_TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000)
-PL_TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000)
-PL_TEST_INTERVAL (atan, 0x1p53, inf, 10000)
-PL_TEST_INTERVAL (atan, -0x1p53, -inf, 1000)
+TEST_SIG (S, D, 1, atan, -10.0, 10.0)
+TEST_ULP (atan, 1.78)
+TEST_INTERVAL (atan, 0, 0x1p-30, 10000)
+TEST_INTERVAL (atan, -0, -0x1p-30, 1000)
+TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000)
+TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000)
+TEST_INTERVAL (atan, 0x1p53, inf, 10000)
+TEST_INTERVAL (atan, -0x1p53, -inf, 1000)
diff --git a/pl/math/atan_common.h b/math/aarch64/experimental/atan_common.h
index 798cc22cc40a..1fd83860219b 100644
--- a/pl/math/atan_common.h
+++ b/math/aarch64/experimental/atan_common.h
@@ -2,7 +2,7 @@
* Double-precision polynomial evaluation function for scalar
* atan(x) and atan2(y,x).
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/math/aarch64/experimental/atan_data.c b/math/aarch64/experimental/atan_data.c
new file mode 100644
index 000000000000..5d24fa912d02
--- /dev/null
+++ b/math/aarch64/experimental/atan_data.c
@@ -0,0 +1,23 @@
+/*
+ * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x).
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct atan_poly_data __atan_poly_data
+ = { .poly = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2)
+ on [2**-1022, 1.0]. See atan.sollya for details of how
+ these were generated. */
+ -0x1.5555555555555p-2, 0x1.99999999996c1p-3,
+ -0x1.2492492478f88p-3, 0x1.c71c71bc3951cp-4,
+ -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
+ -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5,
+ -0x1.aebfe7b418581p-5, 0x1.842dbe9b0d916p-5,
+ -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
+ -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6,
+ -0x1.0051381722a59p-6, 0x1.14e9dc19a4a4ep-7,
+ -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
+ -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16 } };
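
For reference, a minimal sketch of how such coefficients are consumed, per the comment's atan(x) ~ x + x*P(x^2) form; the library evaluates with its own pairwise poly helpers, so this plain Horner loop is illustrative only:

#include <stddef.h>

/* Evaluate atan(x) ~ x + x * z * (c0 + c1*z + ...) with z = x*x,
   where c[] are the 20 coefficients above (Horner's rule, sketch). */
static double
atan_poly_sketch (double x, const double *c, size_t n)
{
  double z = x * x;
  double p = c[n - 1];
  for (size_t i = n - 1; i-- > 0;)
    p = p * z + c[i];
  return x + x * z * p;
}
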
diff --git a/pl/math/atanf_2u9.c b/math/aarch64/experimental/atanf_2u9.c
index ba6f68089de1..518415ded634 100644
--- a/pl/math/atanf_2u9.c
+++ b/math/aarch64/experimental/atanf_2u9.c
@@ -1,13 +1,13 @@
/*
* Single-precision atan(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "atanf_common.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define PiOver2 0x1.921fb6p+0f
#define AbsMask 0x7fffffff
@@ -64,9 +64,9 @@ atanf (float x)
return asfloat (asuint (y) ^ sign);
}
-PL_SIG (S, F, 1, atan, -10.0, 10.0)
-PL_TEST_ULP (atanf, 2.38)
-PL_TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000)
-PL_TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000)
-PL_TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000)
-PL_TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000)
+TEST_SIG (S, F, 1, atan, -10.0, 10.0)
+TEST_ULP (atanf, 2.38)
+TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000)
+TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000)
+TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000)
+TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000)
diff --git a/pl/math/atanf_common.h b/math/aarch64/experimental/atanf_common.h
index 8952e7e0078b..3e6542047309 100644
--- a/pl/math/atanf_common.h
+++ b/math/aarch64/experimental/atanf_common.h
@@ -2,7 +2,7 @@
* Single-precision polynomial evaluation function for scalar
* atan(x) and atan2(y,x).
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/math/aarch64/experimental/atanf_data.c b/math/aarch64/experimental/atanf_data.c
new file mode 100644
index 000000000000..f4d607c2a12d
--- /dev/null
+++ b/math/aarch64/experimental/atanf_data.c
@@ -0,0 +1,17 @@
+/*
+ * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x).
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0].
+ */
+const struct atanf_poly_data __atanf_poly_data
+ = { .poly
+ = { /* See atanf.sollya for details of how these were generated. */
+ -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
+ -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f,
+ 0x1.01fd88p-8f } };
diff --git a/pl/math/atanh_3u.c b/math/aarch64/experimental/atanh_3u.c
index dcfbe8192a22..d01b8bacd46a 100644
--- a/pl/math/atanh_3u.c
+++ b/math/aarch64/experimental/atanh_3u.c
@@ -1,21 +1,21 @@
/*
* Double-precision atanh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "poly_scalar_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define AbsMask 0x7fffffffffffffff
#define Half 0x3fe0000000000000
#define One 0x3ff0000000000000
#define Ln2Hi 0x1.62e42fefa3800p-1
#define Ln2Lo 0x1.ef35793c76730p-45
-#define OneMHfRt2Top \
+#define OneMHfRt2Top \
0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */
#define OneTop12 0x3ff
#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */
@@ -76,8 +76,8 @@ atanh (double x)
return halfsign * log1p_inline ((2 * ax) / (1 - ax));
}
-PL_SIG (S, D, 1, atanh, -1.0, 1.0)
-PL_TEST_ULP (atanh, 3.00)
-PL_TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000)
-PL_TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000)
-PL_TEST_SYM_INTERVAL (atanh, 1, inf, 100)
+TEST_SIG (S, D, 1, atanh, -1.0, 1.0)
+TEST_ULP (atanh, 3.00)
+TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000)
+TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000)
+TEST_SYM_INTERVAL (atanh, 1, inf, 100)
diff --git a/pl/math/atanhf_3u1.c b/math/aarch64/experimental/atanhf_3u1.c
index e99d5a9900a9..c452bab91f97 100644
--- a/pl/math/atanhf_3u1.c
+++ b/math/aarch64/experimental/atanhf_3u1.c
@@ -1,14 +1,14 @@
/*
* Single-precision atanh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define AbsMask 0x7fffffff
#define Half 0x3f000000
@@ -79,8 +79,8 @@ atanhf (float x)
return halfsign * log1pf_inline ((2 * ax) / (1 - ax));
}
-PL_SIG (S, F, 1, atanh, -1.0, 1.0)
-PL_TEST_ULP (atanhf, 2.59)
-PL_TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500)
-PL_TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000)
-PL_TEST_SYM_INTERVAL (atanhf, 1, inf, 1000)
+TEST_SIG (S, F, 1, atanh, -1.0, 1.0)
+TEST_ULP (atanhf, 2.59)
+TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500)
+TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000)
+TEST_SYM_INTERVAL (atanhf, 1, inf, 1000)
diff --git a/pl/math/cbrt_2u.c b/math/aarch64/experimental/cbrt_2u.c
index 80be83c4470c..cf31627e43dc 100644
--- a/pl/math/cbrt_2u.c
+++ b/math/aarch64/experimental/cbrt_2u.c
@@ -1,15 +1,15 @@
/*
* Double-precision cbrt(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
-PL_SIG (S, D, 1, cbrt, -10.0, 10.0)
+TEST_SIG (S, D, 1, cbrt, -10.0, 10.0)
#define AbsMask 0x7fffffffffffffff
#define TwoThirds 0x1.5555555555555p-1
@@ -39,8 +39,8 @@ cbrt (double x)
int e;
double m = frexp (asdouble (iax), &e);
- /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
- Newton iterations. */
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+ for Newton iterations. */
double p_01 = fma (C (1), m, C (0));
double p_23 = fma (C (3), m, C (2));
double p = fma (p_23, m * m, p_01);
@@ -65,5 +65,5 @@ cbrt (double x)
return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign);
}
-PL_TEST_ULP (cbrt, 1.30)
-PL_TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000)
+TEST_ULP (cbrt, 1.30)
+TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000)
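
The "Newton iterations" the comment refers to are the classical cube-root update; a sketch of one step under that assumption (the routine's fma-based formulation differs in detail):

/* One Newton step for f(a) = a^3 - m:
   a' = a - f(a)/f'(a) = (2*a + m / (a*a)) / 3, quadratically
   convergent near cbrt(m).  Illustration only. */
static double
cbrt_newton_step (double a, double m)
{
  return (2 * a + m / (a * a)) / 3;
}
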
diff --git a/pl/math/cbrt_data.c b/math/aarch64/experimental/cbrt_data.c
index 3d484c2779e2..dabcb6aff2d4 100644
--- a/pl/math/cbrt_data.c
+++ b/math/aarch64/experimental/cbrt_data.c
@@ -1,7 +1,7 @@
/*
* Coefficients and table entries for double-precision cbrt(x).
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/cbrtf_1u5.c b/math/aarch64/experimental/cbrtf_1u5.c
index 88fcb7162ef6..5f0288e6d27a 100644
--- a/pl/math/cbrtf_1u5.c
+++ b/math/aarch64/experimental/cbrtf_1u5.c
@@ -1,14 +1,14 @@
/*
* Single-precision cbrt(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define AbsMask 0x7fffffff
#define SignMask 0x80000000
@@ -18,8 +18,8 @@
/* Approximation for single-precision cbrt(x), using low-order polynomial and
one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This
- is observed for every value where the mantissa is 0x1.81410e and the exponent
- is a multiple of 3, for example:
+ is observed for every value where the mantissa is 0x1.81410e and the
+ exponent is a multiple of 3, for example:
cbrtf(0x1.81410ep+30) got 0x1.255d96p+10
want 0x1.255d92p+10. */
float
@@ -61,6 +61,6 @@ cbrtf (float x)
return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign);
}
-PL_SIG (S, F, 1, cbrt, -10.0, 10.0)
-PL_TEST_ULP (cbrtf, 1.03)
-PL_TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000)
+TEST_SIG (S, F, 1, cbrt, -10.0, 10.0)
+TEST_ULP (cbrtf, 1.03)
+TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000)
diff --git a/pl/math/cbrtf_data.c b/math/aarch64/experimental/cbrtf_data.c
index c6cdb4de0d65..7b5c53f4a606 100644
--- a/pl/math/cbrtf_data.c
+++ b/math/aarch64/experimental/cbrtf_data.c
@@ -1,7 +1,7 @@
/*
* Coefficients and table entries for single-precision cbrt(x).
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/cosh_2u.c b/math/aarch64/experimental/cosh_2u.c
index 2240a9c56f15..f5bc73b85df8 100644
--- a/pl/math/cosh_2u.c
+++ b/math/aarch64/experimental/cosh_2u.c
@@ -1,21 +1,19 @@
/*
* Double-precision cosh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "exp_inline.h"
#define AbsMask 0x7fffffffffffffff
-#define SpecialBound \
+#define SpecialBound \
0x40861da04cbafe44 /* 0x1.61da04cbafe44p+9, above which exp overflows. */
-double
-__exp_dd (double, double);
-
static double
specialcase (double x, uint64_t iax)
{
@@ -23,9 +21,9 @@ specialcase (double x, uint64_t iax)
return INFINITY;
if (iax > 0x7ff0000000000000)
return __math_invalid (x);
- /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated by
- exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */
- double t = __exp_dd (asdouble (iax) / 2, 0);
+ /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated
+ by exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */
+ double t = exp_inline (asdouble (iax) / 2, 0);
return (0.5 * t) * t;
}
@@ -44,20 +42,20 @@ cosh (double x)
uint64_t ix = asuint64 (x);
uint64_t iax = ix & AbsMask;
- /* exp overflows a little bit before cosh, so use special-case handler for the
- gap, as well as special values. */
+ /* exp overflows a little bit before cosh, so use special-case handler for
+ the gap, as well as special values. */
if (unlikely (iax >= SpecialBound))
return specialcase (x, iax);
double ax = asdouble (iax);
/* Use double-precision exp helper to calculate exp(x), then:
cosh(x) = exp(|x|) / 2 + 1 / (exp(|x|) * 2). */
- double t = __exp_dd (ax, 0);
+ double t = exp_inline (ax, 0);
return 0.5 * t + 0.5 / t;
}
-PL_SIG (S, D, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (cosh, 1.43)
-PL_TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
-PL_TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
-PL_TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100)
+TEST_SIG (S, D, 1, cosh, -10.0, 10.0)
+TEST_ULP (cosh, 1.43)
+TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
+TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
+TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100)
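
The special case above rests on the identity cosh(x) ~= exp(|x|) / 2 for large |x|: exp(|x|) itself overflows before cosh does, but squaring exp(|x|/2) reaches the same magnitude with no intermediate overflow. A sketch with the standard exp (the routine uses exp_inline for its error bound):

#include <math.h>

/* cosh(x) ~= (exp(|x|/2))^2 / 2 for SpecialBound <= |x|, written so
   no intermediate exceeds exp(|x|/2).  Illustration only. */
static double
cosh_large (double ax) /* ax = |x|. */
{
  double t = exp (ax / 2);
  return (0.5 * t) * t;
}
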
diff --git a/pl/math/coshf_1u9.c b/math/aarch64/experimental/coshf_1u9.c
index cf737840e0d6..b7e7720a472e 100644
--- a/pl/math/coshf_1u9.c
+++ b/math/aarch64/experimental/coshf_1u9.c
@@ -1,22 +1,19 @@
/*
* Single-precision cosh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#include "mathlib.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define AbsMask 0x7fffffff
#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */
-#define SpecialBound \
- 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \
- special case. */
-
-float
-optr_aor_exp_f32 (float);
+/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+#define SpecialBound 0x42ad496c
static NOINLINE float
specialcase (float x, uint32_t iax)
@@ -32,7 +29,7 @@ specialcase (float x, uint32_t iax)
without overflow, so use exp(|x|/2) instead. For large x cosh(x) is
dominated by exp(x), so return:
cosh(x) ~= (exp(|x|/2))^2 / 2. */
- float t = optr_aor_exp_f32 (asfloat (iax) / 2);
+ float t = expf (asfloat (iax) / 2);
return (0.5 * t) * t;
}
@@ -57,12 +54,12 @@ coshf (float x)
/* Compute cosh using the definition:
coshf(x) = exp(x) / 2 + exp(-x) / 2. */
- float t = optr_aor_exp_f32 (ax);
+ float t = expf (ax);
return 0.5f * t + 0.5f / t;
}
-PL_SIG (S, F, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (coshf, 1.89)
-PL_TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100)
-PL_TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
-PL_TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)
+TEST_SIG (S, F, 1, cosh, -10.0, 10.0)
+TEST_ULP (coshf, 1.89)
+TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100)
+TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
+TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)
diff --git a/pl/math/erf_2u5.c b/math/aarch64/experimental/erf_2u5.c
index 3ca2a1332c1f..0bbe3e9548f8 100644
--- a/pl/math/erf_2u5.c
+++ b/math/aarch64/experimental/erf_2u5.c
@@ -1,13 +1,13 @@
/*
* Double-precision erf(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
#define Shift 0x1p45
@@ -42,7 +42,7 @@
erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
want -0x1.20dd59132ebafp-8. */
double
-erf (double x)
+arm_math_erf (double x)
{
/* Get absolute value and sign. */
uint64_t ix = asuint64 (x);
@@ -62,8 +62,8 @@ erf (double x)
double r = z - Shift;
/* Lookup erf(r) and scale(r) in table.
Set erf(r) to 0 and scale to 2/sqrt(pi) for |x| <= 0x1.cp-9. */
- double erfr = __erf_data.tab[i].erf;
- double scale = __erf_data.tab[i].scale;
+ double erfr = __v_erf_data.tab[i].erf;
+ double scale = __v_erf_data.tab[i].scale;
/* erf(x) ~ erf(r) + scale * d * poly (d, r). */
double d = a - r;
@@ -95,8 +95,7 @@ erf (double x)
return asdouble (sign | asuint64 (1.0));
}
-PL_SIG (S, D, 1, erf, -6.0, 6.0)
-PL_TEST_ULP (erf, 1.79)
-PL_TEST_SYM_INTERVAL (erf, 0, 5.9921875, 40000)
-PL_TEST_SYM_INTERVAL (erf, 5.9921875, inf, 40000)
-PL_TEST_SYM_INTERVAL (erf, 0, inf, 40000)
+TEST_ULP (arm_math_erf, 1.79)
+TEST_SYM_INTERVAL (arm_math_erf, 0, 5.9921875, 40000)
+TEST_SYM_INTERVAL (arm_math_erf, 5.9921875, inf, 40000)
+TEST_SYM_INTERVAL (arm_math_erf, 0, inf, 40000)
diff --git a/pl/math/erfc_1u8.c b/math/aarch64/experimental/erfc_1u8.c
index 7f2004e9335d..5357e9329433 100644
--- a/pl/math/erfc_1u8.c
+++ b/math/aarch64/experimental/erfc_1u8.c
@@ -1,13 +1,13 @@
/*
* Double-precision erfc(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Shift 0x1p45
#define P20 0x1.5555555555555p-2 /* 1/3. */
@@ -86,11 +86,11 @@ erfc (double x)
/* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale
to 2/sqrt(pi), when x reduced to r = 0. */
double z = a + Shift;
- uint64_t i = asuint64 (z);
+ uint64_t i = asuint64 (z) - asuint64 (Shift);
double r = z - Shift;
/* These values are scaled by 2^128. */
- double erfcr = __erfc_data.tab[i].erfc;
- double scale = __erfc_data.tab[i].scale;
+ double erfcr = __v_erfc_data.tab[i].erfc;
+ double scale = __v_erfc_data.tab[i].scale;
/* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */
double d = a - r;
@@ -144,10 +144,10 @@ erfc (double x)
return __math_uflow (0);
}
-PL_SIG (S, D, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (erfc, 1.21)
-PL_TEST_SYM_INTERVAL (erfc, 0, 0x1p-26, 40000)
-PL_TEST_INTERVAL (erfc, 0x1p-26, 28.0, 100000)
-PL_TEST_INTERVAL (erfc, -0x1p-26, -6.0, 100000)
-PL_TEST_INTERVAL (erfc, 28.0, inf, 40000)
-PL_TEST_INTERVAL (erfc, -6.0, -inf, 40000)
+TEST_SIG (S, D, 1, erfc, -6.0, 28.0)
+TEST_ULP (erfc, 1.21)
+TEST_SYM_INTERVAL (erfc, 0, 0x1p-26, 40000)
+TEST_INTERVAL (erfc, 0x1p-26, 28.0, 100000)
+TEST_INTERVAL (erfc, -0x1p-26, -6.0, 100000)
+TEST_INTERVAL (erfc, 28.0, inf, 40000)
+TEST_INTERVAL (erfc, -6.0, -inf, 40000)
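
The index fix in this hunk (subtracting asuint64 (Shift)) completes the usual shift-based quantisation: with Shift = 0x1p45 and 0 <= a < 2^6, z = a + Shift pins the exponent so the low mantissa bits of z hold round(a * 128), and r = z - Shift is a rounded to the table spacing of 1/128. A scalar sketch of the idiom, with hypothetical names:

#include <stdint.h>
#include <string.h>

/* Extract table index i = round(a * 128) and quantised point r from a
   via the Shift trick; subtracting asuint64(Shift) strips the shift's
   own exponent bits so i is a clean index.  Illustration only. */
static inline uint64_t
erfc_table_index (double a, double *r)
{
  const double Shift = 0x1p45;
  double z = a + Shift;
  uint64_t zi, si;
  memcpy (&zi, &z, sizeof (zi));
  memcpy (&si, &Shift, sizeof (si));
  *r = z - Shift;
  return zi - si;
}
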
diff --git a/pl/math/erfcf_1u7.c b/math/aarch64/experimental/erfcf_1u7.c
index c8ce95cca058..e56193c8a103 100644
--- a/pl/math/erfcf_1u7.c
+++ b/math/aarch64/experimental/erfcf_1u7.c
@@ -1,13 +1,13 @@
/*
* Single-precision erfc(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Shift 0x1p17f
#define OneThird 0x1.555556p-2f
@@ -59,8 +59,8 @@ erfcf (float x)
float r = z - Shift;
/* These values are scaled by 2^-47. */
- float erfcr = __erfcf_data.tab[i].erfc;
- float scale = __erfcf_data.tab[i].scale;
+ float erfcr = __v_erfcf_data.tab[i].erfc;
+ float scale = __v_erfcf_data.tab[i].scale;
/* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */
float d = a - r;
@@ -94,10 +94,10 @@ erfcf (float x)
return sign ? 2.0f : __math_uflowf (0);
}
-PL_SIG (S, F, 1, erfc, -4.0, 10.0)
-PL_TEST_ULP (erfcf, 1.14)
-PL_TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000)
-PL_TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000)
-PL_TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000)
-PL_TEST_INTERVAL (erfcf, 10.0625, inf, 40000)
-PL_TEST_INTERVAL (erfcf, -4.0, -inf, 40000)
+TEST_SIG (S, F, 1, erfc, -4.0, 10.0)
+TEST_ULP (erfcf, 1.14)
+TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000)
+TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000)
+TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000)
+TEST_INTERVAL (erfcf, 10.0625, inf, 40000)
+TEST_INTERVAL (erfcf, -4.0, -inf, 40000)
diff --git a/pl/math/erff_2u.c b/math/aarch64/experimental/erff_2u.c
index f43e647072f8..9487f60dd1e3 100644
--- a/pl/math/erff_2u.c
+++ b/math/aarch64/experimental/erff_2u.c
@@ -1,13 +1,13 @@
/*
* Single-precision erf(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
#define Shift 0x1p16f
@@ -37,7 +37,7 @@
erff(0x1.c373e6p-9) got 0x1.fd686cp-9
want 0x1.fd6868p-9. */
float
-erff (float x)
+arm_math_erff (float x)
{
/* Get absolute value and sign. */
uint32_t ix = asuint (x);
@@ -56,8 +56,8 @@ erff (float x)
float z = a + Shift;
uint32_t i = asuint (z) - asuint (Shift);
float r = z - Shift;
- float erfr = __erff_data.tab[i].erf;
- float scale = __erff_data.tab[i].scale;
+ float erfr = __v_erff_data.tab[i].erf;
+ float scale = __v_erff_data.tab[i].scale;
/* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
float d = a - r;
@@ -75,8 +75,7 @@ erff (float x)
return asfloat (sign | asuint (1.0f));
}
-PL_SIG (S, F, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (erff, 1.43)
-PL_TEST_SYM_INTERVAL (erff, 0, 3.9375, 40000)
-PL_TEST_SYM_INTERVAL (erff, 3.9375, inf, 40000)
-PL_TEST_SYM_INTERVAL (erff, 0, inf, 40000)
+TEST_ULP (arm_math_erff, 1.43)
+TEST_SYM_INTERVAL (arm_math_erff, 0, 3.9375, 40000)
+TEST_SYM_INTERVAL (arm_math_erff, 3.9375, inf, 40000)
+TEST_SYM_INTERVAL (arm_math_erff, 0, inf, 40000)
diff --git a/pl/math/erfinv_24u5.c b/math/aarch64/experimental/erfinv_24u5.c
index 20e1e361befc..753f38a79f66 100644
--- a/pl/math/erfinv_24u5.c
+++ b/math/aarch64/experimental/erfinv_24u5.c
@@ -1,14 +1,13 @@
/*
* Double-precision inverse error function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "poly_scalar_f64.h"
-#include "pl_sig.h"
-#define IGNORE_SCALAR_FENV
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
const static struct
{
@@ -75,7 +74,12 @@ erfinv (double x)
/ (copysign (t, x) * horner_9_f64 (t, data.Q_57));
}
-PL_SIG (S, D, 1, erfinv, -0.99, 0.99)
-PL_TEST_ULP (erfinv, 24.0)
-PL_TEST_INTERVAL (erfinv, 0, 1, 40000)
-PL_TEST_INTERVAL (erfinv, -0x1p-1022, -1, 40000)
+#if USE_MPFR
+# warning Not generating tests for erfinv, as MPFR has no suitable reference
+#else
+TEST_DISABLE_FENV (erfinv)
+TEST_SIG (S, D, 1, erfinv, -0.99, 0.99)
+TEST_ULP (erfinv, 24.0)
+TEST_INTERVAL (erfinv, 0, 1, 40000)
+TEST_INTERVAL (erfinv, -0x1p-1022, -1, 40000)
+#endif
diff --git a/pl/math/erfinvf_4u7.c b/math/aarch64/experimental/erfinvf_4u7.c
index 40736da08be8..152994f6336a 100644
--- a/pl/math/erfinvf_4u7.c
+++ b/math/aarch64/experimental/erfinvf_4u7.c
@@ -1,13 +1,13 @@
/*
* Single-precision inverse error function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
const static struct
{
@@ -69,6 +69,10 @@ erfinvf (float x)
/ (copysignf (t, x) * horner_2_f32 (t, data.Q_50));
}
-PL_SIG (S, F, 1, erfinv, -0.99, 0.99)
-PL_TEST_ULP (erfinvf, 4.09)
-PL_TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000)
+#if USE_MPFR
+# warning Not generating tests for erfinvf, as MPFR has no suitable reference
+#else
+TEST_SIG (S, F, 1, erfinv, -0.99, 0.99)
+TEST_ULP (erfinvf, 4.09)
+TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000)
+#endif
diff --git a/pl/math/erfinvl.c b/math/aarch64/experimental/erfinvl.c
index ea4aadfccd00..4d91410f1a5c 100644
--- a/pl/math/erfinvl.c
+++ b/math/aarch64/experimental/erfinvl.c
@@ -1,7 +1,7 @@
/*
* Extended precision inverse error function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define _GNU_SOURCE
diff --git a/pl/math/exp.c b/math/aarch64/experimental/exp_inline.h
index 90253b68875d..1a327c1e67d3 100644
--- a/pl/math/exp.c
+++ b/math/aarch64/experimental/exp_inline.h
@@ -1,10 +1,13 @@
/*
* Double-precision e^x function.
*
- * Copyright (c) 2018-2023, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#ifndef PL_MATH_EXP_INLINE_H
+#define PL_MATH_EXP_INLINE_H
+
#include <float.h>
#include <math.h>
#include <stdint.h>
@@ -30,7 +33,7 @@
adjustment of scale, positive k here means the result may overflow and
negative k means the result may underflow. */
static inline double
-specialcase (double_t tmp, uint64_t sbits, uint64_t ki)
+exp_inline_special_case (double_t tmp, uint64_t sbits, uint64_t ki)
{
double_t scale, y;
@@ -77,7 +80,7 @@ top12 (double x)
/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
If hastail is 0 then xtail is assumed to be 0 too. */
static inline double
-exp_inline (double x, double xtail, int hastail)
+exp_inline (double x, double xtail)
{
uint32_t abstop;
uint64_t ki, idx, top, sbits;
@@ -125,7 +128,7 @@ exp_inline (double x, double xtail, int hastail)
#endif
r = x + kd * NegLn2hiN + kd * NegLn2loN;
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- if (hastail)
+ if (!__builtin_constant_p (xtail) || xtail != 0.0)
r += xtail;
/* 2^(k/N) ~= scale * (1 + tail). */
idx = 2 * (ki % N);
@@ -146,18 +149,11 @@ exp_inline (double x, double xtail, int hastail)
tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6);
#endif
if (unlikely (abstop == 0))
- return specialcase (tmp, sbits, ki);
+ return exp_inline_special_case (tmp, sbits, ki);
scale = asdouble (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
is no spurious underflow here even without fma. */
return eval_as_double (scale + scale * tmp);
}
-/* May be useful for implementing pow where more than double
- precision input is needed. */
-double
-__exp_dd (double x, double xtail)
-{
- return exp_inline (x, xtail, 1);
-}
-
+#endif
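
With this change the old hastail flag is gone: the guard !__builtin_constant_p (xtail) || xtail != 0.0 keeps r += xtail whenever the tail might be nonzero, but when a call site passes a literal 0 the whole test folds to false at compile time and the add disappears, so one inline function serves both former entry points. Illustrative callers, assuming exp_inline.h is on the include path as in the hunk above:

#include "exp_inline.h"

/* Tail-free use: xtail is the constant 0, so the compiler proves the
   guard false and drops the r += xtail add entirely. */
static double
exp_notail (double x)
{
  return exp_inline (x, 0);
}

/* Extra-precision use, the old __exp_dd (x, xtail): xtail is not a
   compile-time constant, so the guard holds and the tail is added. */
static double
exp_withtail (double x, double xtail)
{
  return exp_inline (x, xtail);
}
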
diff --git a/pl/math/expf_data.c b/math/aarch64/experimental/expf_data.c
index 474ad57a29a0..958f705cc676 100644
--- a/pl/math/expf_data.c
+++ b/math/aarch64/experimental/expf_data.c
@@ -2,7 +2,7 @@
* Coeffs and table entries for single-precision exp. Copied from
* math/exp2f_data.c, with EXP2F_TABLE_BITS == 32.
*
- * Copyright (c) 2017-2023, Arm Limited.
+ * Copyright (c) 2017-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -13,7 +13,7 @@
const struct expf_data __expf_data = {
/* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
used for computing 2^(k/N) for an int |k| < 150 N as
- double(tab[k%N] + (k << 52-BITS)) */
+ double(tab[k%N] + (k << 52-BITS)). */
.tab = {
0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
diff --git a/pl/math/expm1_2u5.c b/math/aarch64/experimental/expm1_2u5.c
index f7d431198614..a4805e832af3 100644
--- a/pl/math/expm1_2u5.c
+++ b/math/aarch64/experimental/expm1_2u5.c
@@ -1,14 +1,14 @@
/*
* Double-precision e^x - 1 function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f64.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define InvLn2 0x1.71547652b82fep0
#define Ln2hi 0x1.62e42fefa39efp-1
@@ -76,10 +76,10 @@ expm1 (double x)
return 2 * fma (p, t, t - 0.5);
}
-PL_SIG (S, D, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (expm1, 1.68)
-PL_TEST_SYM_INTERVAL (expm1, 0, 0x1p-51, 1000)
-PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000)
-PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000)
-PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100)
-PL_TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100)
+TEST_SIG (S, D, 1, expm1, -9.9, 9.9)
+TEST_ULP (expm1, 1.68)
+TEST_SYM_INTERVAL (expm1, 0, 0x1p-51, 1000)
+TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000)
+TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000)
+TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100)
+TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100)
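
A plausible reading of the reconstruction 2 * fma (p, t, t - 0.5) in the hunk above, assuming the usual reduction x = k*ln2 + r with p ~= expm1(r) and t = 2^(k-1) (those definitions are inferred, not shown in this hunk):

  expm1(x) = 2^k * e^r - 1
           = 2^k * (1 + p) - 1
           = 2 * (t * p + t) - 1
           = 2 * (p * t + (t - 0.5)),

so the final step is a single fma, and working with 2^(k-1) rather than 2^k keeps the scale factor representable one binade longer at the overflow end.
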
diff --git a/math/aarch64/experimental/expm1_data.c b/math/aarch64/experimental/expm1_data.c
new file mode 100644
index 000000000000..955895056924
--- /dev/null
+++ b/math/aarch64/experimental/expm1_data.c
@@ -0,0 +1,21 @@
+/*
+ * Coefficients for double-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Generated using fpminimax, see tools/expm1.sollya for details. */
+const double __expm1_poly[] = { 0x1p-1,
+ 0x1.5555555555559p-3,
+ 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7,
+ 0x1.6c16c16c1b5f3p-10,
+ 0x1.a01a01affa35dp-13,
+ 0x1.a01a018b4ecbbp-16,
+ 0x1.71ddf82db5bb4p-19,
+ 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26,
+ 0x1.1f143d060a28ap-29 };
diff --git a/pl/math/expm1f_1u6.c b/math/aarch64/experimental/expm1f_1u6.c
index e12c9ba9a8a2..03d1e9dc31ef 100644
--- a/pl/math/expm1f_1u6.c
+++ b/math/aarch64/experimental/expm1f_1u6.c
@@ -1,23 +1,23 @@
/*
* Single-precision e^x - 1 function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Shift (0x1.8p23f)
#define InvLn2 (0x1.715476p+0f)
#define Ln2hi (0x1.62e4p-1f)
#define Ln2lo (0x1.7f7d1cp-20f)
#define AbsMask (0x7fffffff)
-#define InfLimit \
+#define InfLimit \
(0x1.644716p6) /* Smallest value of x for which expm1(x) overflows. */
-#define NegLimit \
+#define NegLimit \
(-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to -1. */
/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
@@ -70,10 +70,10 @@ expm1f (float x)
return 2 * fmaf (p, t, t - 0.5f);
}
-PL_SIG (S, F, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (expm1f, 1.02)
-PL_TEST_SYM_INTERVAL (expm1f, 0, 0x1p-23, 1000)
-PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000)
-PL_TEST_INTERVAL (expm1f, 0x1.644716p6, inf, 1000)
-PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000)
-PL_TEST_INTERVAL (expm1f, -0x1.9bbabcp+6, -inf, 1000)
+TEST_SIG (S, F, 1, expm1, -9.9, 9.9)
+TEST_ULP (expm1f, 1.02)
+TEST_SYM_INTERVAL (expm1f, 0, 0x1p-23, 1000)
+TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000)
+TEST_INTERVAL (expm1f, 0x1.644716p6, inf, 1000)
+TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000)
+TEST_INTERVAL (expm1f, -0x1.9bbabcp+6, -inf, 1000)
diff --git a/pl/math/expm1f_data.c b/math/aarch64/experimental/expm1f_data.c
index 9d02dc448ebb..92d9189ff503 100644
--- a/pl/math/expm1f_data.c
+++ b/math/aarch64/experimental/expm1f_data.c
@@ -1,12 +1,12 @@
/*
* Coefficients for single-precision e^x - 1 function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
/* Generated using fpminimax, see tools/expm1f.sollya for details. */
-const float __expm1f_poly[] = {0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5,
- 0x1.12287cp-7, 0x1.6b55a2p-10};
+const float __expm1f_poly[] = { 0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5,
+ 0x1.12287cp-7, 0x1.6b55a2p-10 };
diff --git a/pl/math/log10_2u.c b/math/aarch64/experimental/log10_2u.c
index 74828ea9ef3c..84ee1544fe1a 100644
--- a/pl/math/log10_2u.c
+++ b/math/aarch64/experimental/log10_2u.c
@@ -1,13 +1,13 @@
/*
* Double-precision log10(x) function.
*
- * Copyright (c) 2020-2023, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
/* Polynomial coefficients and lookup tables. */
#define T __log10_data.tab
@@ -32,11 +32,11 @@ top16 (double x)
/* Fast and low accuracy implementation of log10.
The implementation is similar to that of math/log, except that:
- Polynomials are computed for log10(1+r) with r on same intervals as log.
- - Lookup parameters are scaled (at runtime) to switch from base e to base 10.
- Many errors above 1.59 ulp are observed across the whole range of doubles.
- The greatest observed error is 1.61 ulp, at around 0.965:
- log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6
- want -0x1.fee26884905a8p-6. */
+ - Lookup parameters are scaled (at runtime) to switch from base e to
+ base 10. Many errors above 1.59 ulp are observed across the whole range of
+ doubles. The greatest observed error is 1.61 ulp, at around 0.965:
+ log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6
+ want -0x1.fee26884905a8p-6. */
double
log10 (double x)
{
@@ -61,8 +61,8 @@ log10 (double x)
y = r3
* (B[1] + r * B[2] + r2 * B[3]
+ r3
- * (B[4] + r * B[5] + r2 * B[6]
- + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10])));
+ * (B[4] + r * B[5] + r2 * B[6]
+ + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10])));
/* Worst-case error is around 0.507 ULP. */
w = r * 0x1p27;
double_t rhi = r + w - w;
@@ -123,7 +123,8 @@ log10 (double x)
r2 = r * r; /* rounding error: 0x1p-54/N^2. */
/* Scale by 1/ln(10). Polynomial already contains scaling. */
- y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi;
+ y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4]))
+ + hi;
y = y * InvLn10;
return eval_as_double (y);
@@ -143,8 +144,8 @@ log10l (long double x)
#endif
// clang-format on
-PL_SIG (S, D, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (log10, 1.11)
-PL_TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000)
-PL_TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000)
-PL_TEST_INTERVAL (log10, 0, inf, 40000)
+TEST_SIG (S, D, 1, log10, 0.01, 11.1)
+TEST_ULP (log10, 1.11)
+TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000)
+TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000)
+TEST_INTERVAL (log10, 0, inf, 40000)
diff --git a/pl/math/log10_data.c b/math/aarch64/experimental/log10_data.c
index 9976f19cd6df..20b5ef883ed8 100644
--- a/pl/math/log10_data.c
+++ b/math/aarch64/experimental/log10_data.c
@@ -1,7 +1,7 @@
/*
* Data for log10.
*
- * Copyright (c) 2020-2023, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -333,5 +333,5 @@ that logc + poly(z/c - 1) has small error, however near x == 1 when
{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54},
#endif
},
-#endif /* !HAVE_FAST_FMA */
+#endif /* !HAVE_FAST_FMA. */
};
diff --git a/pl/math/log1p_2u.c b/math/aarch64/experimental/log1p_2u.c
index f9491ce52b44..a1ff309ecb5f 100644
--- a/pl/math/log1p_2u.c
+++ b/math/aarch64/experimental/log1p_2u.c
@@ -1,19 +1,19 @@
/*
* Double-precision log(1+x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f64.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Ln2Hi 0x1.62e42fefa3800p-1
#define Ln2Lo 0x1.ef35793c76730p-45
#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */
-#define OneMHfRt2Top \
+#define OneMHfRt2Top \
0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */
#define OneTop12 0x3ff
#define BottomMask 0xffffffff
@@ -123,9 +123,9 @@ log1p (double x)
return y + fma (Ln2Hi, kd, p);
}
-PL_SIG (S, D, 1, log1p, -0.9, 10.0)
-PL_TEST_ULP (log1p, 1.26)
-PL_TEST_SYM_INTERVAL (log1p, 0.0, 0x1p-23, 50000)
-PL_TEST_SYM_INTERVAL (log1p, 0x1p-23, 0.001, 50000)
-PL_TEST_SYM_INTERVAL (log1p, 0.001, 1.0, 50000)
-PL_TEST_SYM_INTERVAL (log1p, 1.0, inf, 5000)
+TEST_SIG (S, D, 1, log1p, -0.9, 10.0)
+TEST_ULP (log1p, 1.26)
+TEST_SYM_INTERVAL (log1p, 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (log1p, 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (log1p, 0.001, 1.0, 50000)
+TEST_SYM_INTERVAL (log1p, 1.0, inf, 5000)
diff --git a/math/aarch64/experimental/log1p_data.c b/math/aarch64/experimental/log1p_data.c
new file mode 100644
index 000000000000..91a7196d795f
--- /dev/null
+++ b/math/aarch64/experimental/log1p_data.c
@@ -0,0 +1,20 @@
+/*
+ * Data used in double-precision log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Polynomial coefficients generated using Remez algorithm, see
+ log1p.sollya for details. */
+const struct log1p_data __log1p_data
+ = { .coeffs
+ = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
+ 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
+ -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
+ 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
+ -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
+ 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
+ -0x1.cfa7385bdb37ep-6 } };
diff --git a/pl/math/log1pf_2u1.c b/math/aarch64/experimental/log1pf_2u1.c
index e99174853720..fe4f93865220 100644
--- a/pl/math/log1pf_2u1.c
+++ b/math/aarch64/experimental/log1pf_2u1.c
@@ -1,14 +1,14 @@
/*
* Single-precision log(1+x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "poly_scalar_f32.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define Ln2 (0x1.62e43p-1f)
#define SignMask (0x80000000)
@@ -153,9 +153,9 @@ log1pf (float x)
return fmaf (scale_back, Ln2, p);
}
-PL_SIG (S, F, 1, log1p, -0.9, 10.0)
-PL_TEST_ULP (log1pf, 1.52)
-PL_TEST_SYM_INTERVAL (log1pf, 0.0, 0x1p-23, 50000)
-PL_TEST_SYM_INTERVAL (log1pf, 0x1p-23, 0.001, 50000)
-PL_TEST_SYM_INTERVAL (log1pf, 0.001, 1.0, 50000)
-PL_TEST_SYM_INTERVAL (log1pf, 1.0, inf, 5000)
+TEST_SIG (S, F, 1, log1p, -0.9, 10.0)
+TEST_ULP (log1pf, 1.52)
+TEST_SYM_INTERVAL (log1pf, 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (log1pf, 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (log1pf, 0.001, 1.0, 50000)
+TEST_SYM_INTERVAL (log1pf, 1.0, inf, 5000)
diff --git a/pl/math/log1pf_data.c b/math/aarch64/experimental/log1pf_data.c
index 8c92d5738fe8..e0ac269a1069 100644
--- a/pl/math/log1pf_data.c
+++ b/math/aarch64/experimental/log1pf_data.c
@@ -1,7 +1,7 @@
/*
* Data used in single-precision log1p(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
@@ -9,6 +9,6 @@
/* Polynomial coefficients generated using floating-point minimax
algorithm, see tools/log1pf.sollya for details. */
const struct log1pf_data __log1pf_data
- = {.coeffs = {-0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
- -0x1.6f0d5ep-5f}};
+ = { .coeffs = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+ -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
+ 0x1.abcb6p-4f, -0x1.6f0d5ep-5f } };
diff --git a/pl/math/sinh_3u.c b/math/aarch64/experimental/sinh_3u.c
index 1d86629ee2a3..39030d2750a9 100644
--- a/pl/math/sinh_3u.c
+++ b/math/aarch64/experimental/sinh_3u.c
@@ -1,22 +1,19 @@
/*
* Double-precision sinh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "exp_inline.h"
#define AbsMask 0x7fffffffffffffff
#define Half 0x3fe0000000000000
-#define OFlowBound \
- 0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results \
- in NaN. */
-
-double
-__exp_dd (double, double);
+/* 0x1.62e42fefa39fp+9, above which using expm1 results in NaN. */
+#define OFlowBound 0x40862e42fefa39f0
/* Approximation for double-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
@@ -44,7 +41,7 @@ sinh (double x)
either. We use the identity: exp(a) = (exp(a / 2)) ^ 2
to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0
~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. */
- double e = __exp_dd (ax / 2, 0);
+ double e = exp_inline (ax / 2, 0);
return (e * halfsign) * e;
}
@@ -56,8 +53,8 @@ sinh (double x)
return (t + t / (t + 1)) * halfsign;
}
-PL_SIG (S, D, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (sinh, 2.08)
-PL_TEST_SYM_INTERVAL (sinh, 0, 0x1p-51, 100)
-PL_TEST_SYM_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
-PL_TEST_SYM_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000)
+TEST_SIG (S, D, 1, sinh, -10.0, 10.0)
+TEST_ULP (sinh, 2.08)
+TEST_SYM_INTERVAL (sinh, 0, 0x1p-51, 100)
+TEST_SYM_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
+TEST_SYM_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000)
diff --git a/pl/math/sinhf_2u3.c b/math/aarch64/experimental/sinhf_2u3.c
index aa7aadcf67c5..860ddc0fc83c 100644
--- a/pl/math/sinhf_2u3.c
+++ b/math/aarch64/experimental/sinhf_2u3.c
@@ -1,25 +1,21 @@
/*
* Single-precision sinh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#include "mathlib.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define AbsMask 0x7fffffff
#define Half 0x3f000000
-#define Expm1OFlowLimit \
- 0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f \
- overflows. */
-#define OFlowLimit \
- 0x42b2d4fd /* 0x1.65a9fap+6, minimum positive value for which sinhf should \
- overflow. */
-
-float
-optr_aor_exp_f32 (float);
+/* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f overflows. */
+#define Expm1OFlowLimit 0x42b17218
+/* 0x1.65a9fap+6, minimum positive value for which sinhf should overflow. */
+#define OFlowLimit 0x42b2d4fd
/* Approximation for single-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
@@ -54,7 +50,7 @@ sinhf (float x)
~= (exp(|x| / 2)) ^ 2 / -2 for x < 0.
Greatest error in this region is 1.89 ULP:
sinhf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. */
- float e = optr_aor_exp_f32 (ax / 2);
+ float e = expf (ax / 2);
return (e * halfsign) * e;
}
@@ -66,8 +62,8 @@ sinhf (float x)
return (t + t / (t + 1)) * halfsign;
}
-PL_SIG (S, F, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (sinhf, 1.76)
-PL_TEST_SYM_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000)
-PL_TEST_SYM_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100)
-PL_TEST_SYM_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100)
+TEST_SIG (S, F, 1, sinh, -10.0, 10.0)
+TEST_ULP (sinhf, 1.76)
+TEST_SYM_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000)
+TEST_SYM_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100)
+TEST_SYM_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100)
diff --git a/math/aarch64/experimental/sve/erfinv_25u.c b/math/aarch64/experimental/sve/erfinv_25u.c
new file mode 100644
index 000000000000..4de6d08ab80f
--- /dev/null
+++ b/math/aarch64/experimental/sve/erfinv_25u.c
@@ -0,0 +1,156 @@
+/*
+ * Double-precision inverse error function (SVE variant).
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "sv_math.h"
+#include "test_defs.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "sv_poly_f64.h"
+#define SV_LOG_INLINE_POLY_ORDER 4
+#include "sv_log_inline.h"
+
+const static struct data
+{
+ /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
+ coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
+ of the denominator. P is interleaved P_17 and P_37, similar for Q. */
+ double P[7][2], Q[7][2];
+ double P_57[9], Q_57[9], tailshift, P37_0;
+ struct sv_log_inline_data log_tbl;
+} data = {
+ .P37_0 = -0x1.f3596123109edp-7,
+ .tailshift = -0.87890625,
+ .P = { { 0x1.007ce8f01b2e8p+4, 0x1.60b8fe375999ep-2 },
+ { -0x1.6b23cc5c6c6d7p+6, -0x1.779bb9bef7c0fp+1 },
+ { 0x1.74e5f6ceb3548p+7, 0x1.786ea384470a2p+3 },
+ { -0x1.5200bb15cc6bbp+7, -0x1.6a7c1453c85d3p+4 },
+ { 0x1.05d193233a849p+6, 0x1.31f0fc5613142p+4 },
+ { -0x1.148c5474ee5e1p+3, -0x1.5ea6c007d4dbbp+2 },
+ { 0x1.689181bbafd0cp-3, 0x1.e66f265ce9e5p-3 } },
+ .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 },
+ { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 },
+ { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 },
+ { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 },
+ { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 },
+ { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 },
+ { 0x1p+0, -0x1.4075c56404eecp+3 } },
+ .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2,
+ 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3,
+ 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 },
+ .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2,
+ 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3,
+ 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2 },
+ .log_tbl = SV_LOG_CONSTANTS
+};
+
+static inline svfloat64_t
+special (svbool_t pg, svfloat64_t x, const struct data *d)
+{
+ /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf.
+ By using log here, instead of log1p, we return finite values for both
+ these inputs, and values outside [-1, 1]. This is non-compliant, but is an
+ acceptable optimisation at Ofast. To get correct behaviour for all finite
+ values use the log1p_inline helper on -abs(x) - note that erfinv(inf)
+ will still be finite. */
+ svfloat64_t ax = svabs_x (pg, x);
+ svfloat64_t t
+ = svneg_x (pg, sv_log_inline (pg, svsubr_x (pg, ax, 1), &d->log_tbl));
+ t = svdivr_x (pg, svsqrt_x (pg, t), 1);
+ svuint64_t sign
+ = sveor_x (pg, svreinterpret_u64 (ax), svreinterpret_u64 (x));
+ svfloat64_t ts
+ = svreinterpret_f64 (svorr_x (pg, sign, svreinterpret_u64 (t)));
+
+ svfloat64_t q = svadd_x (pg, t, d->Q_57[8]);
+ for (int i = 7; i >= 0; i--)
+ q = svmad_x (pg, q, t, d->Q_57[i]);
+
+ return svdiv_x (pg, sv_horner_8_f64_x (pg, t, d->P_57), svmul_x (pg, ts, q));
+}
+
+static inline svfloat64_t
+lookup (const double *c, svuint64_t idx)
+{
+ svfloat64_t x = svld1rq_f64 (svptrue_b64 (), c);
+ return svtbl (x, idx);
+}
+
+static inline svfloat64_t
+notails (svbool_t pg, svfloat64_t x, const struct data *d)
+{
+ svfloat64_t t = svmad_x (pg, x, x, -0.5625);
+ svfloat64_t p = svmla_x (pg, sv_f64 (d->P[5][0]), t, d->P[6][0]);
+ svfloat64_t q = svadd_x (pg, t, d->Q[5][0]);
+ for (int i = 4; i >= 0; i--)
+ {
+ p = svmad_x (pg, t, p, d->P[i][0]);
+ q = svmad_x (pg, t, q, d->Q[i][0]);
+ }
+ p = svmul_x (pg, p, x);
+ return svdiv_x (pg, p, q);
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+ error function in double precision. Largest observed error is 24.75 ULP:
+ _ZGVsMxv_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0
+ want 0x1.ea0547268660cp+0. */
+svfloat64_t SV_NAME_D1 (erfinv) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* Calculate inverse error using algorithm described in
+ J. M. Blair, C. A. Edwards, and J. H. Johnson,
+ "Rational Chebyshev approximations for the inverse of the error function",
+ Math. Comp. 30, pp. 827--830 (1976).
+ https://doi.org/10.1090/S0025-5718-1976-0421040-7.
+
+ Algorithm has 3 intervals:
+ - 'Normal' region [-0.75, 0.75]
+ - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
+ - Extreme tail [-1, -0.9375] U [0.9375, 1]
+ Normal and tail are both rational approximation of similar order on
+ shifted input - these are typically performed in parallel using gather
+ loads to obtain correct coefficients depending on interval. */
+
+ svbool_t no_tail = svacle (pg, x, 0.75);
+ if (unlikely (!svptest_any (pg, svnot_z (pg, no_tail))))
+ return notails (pg, x, d);
+
+ svbool_t is_tail = svnot_z (pg, no_tail);
+ svbool_t extreme_tail = svacgt (pg, x, 0.9375);
+ svuint64_t idx = svdup_n_u64_z (is_tail, 1);
+
+ svfloat64_t t = svsel_f64 (is_tail, sv_f64 (d->tailshift), sv_f64 (-0.5625));
+ t = svmla_x (pg, t, x, x);
+
+ svfloat64_t p = lookup (&d->P[6][0], idx);
+ svfloat64_t q
+ = svmla_x (pg, lookup (&d->Q[6][0], idx), svdup_n_f64_z (is_tail, 1), t);
+ for (int i = 5; i >= 0; i--)
+ {
+ p = svmla_x (pg, lookup (&d->P[i][0], idx), p, t);
+ q = svmla_x (pg, lookup (&d->Q[i][0], idx), q, t);
+ }
+ p = svmad_m (is_tail, p, t, d->P37_0);
+ p = svmul_x (pg, p, x);
+
+ if (likely (svptest_any (pg, extreme_tail)))
+ return svsel (extreme_tail, special (pg, x, d), svdiv_x (pg, p, q));
+ return svdiv_x (pg, p, q);
+}
+
+#if USE_MPFR
+# warning Not generating tests for _ZGVsMxv_erfinv, as MPFR has no suitable reference
+#else
+TEST_SIG (SV, D, 1, erfinv, -0.99, 0.99)
+TEST_ULP (SV_NAME_D1 (erfinv), 24.5)
+TEST_DISABLE_FENV (SV_NAME_D1 (erfinv))
+/* Test with control lane in each interval. */
+TEST_SYM_INTERVAL (SV_NAME_D1 (erfinv), 0, 1, 100000)
+TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.95)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/experimental/sve/erfinvf_5u.c b/math/aarch64/experimental/sve/erfinvf_5u.c
new file mode 100644
index 000000000000..2c81c4e0b9a2
--- /dev/null
+++ b/math/aarch64/experimental/sve/erfinvf_5u.c
@@ -0,0 +1,156 @@
+/*
+ * Single-precision inverse error function (SVE variant).
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
+#include "sv_logf_inline.h"
+
+const static struct data
+{
+ /* We use P_N and Q_N to refer to arrays of coefficients, where P_N
+ is the coeffs of the numerator in table N of Blair et al, and
+ Q_N is the coeffs of the denominator. Coefficients stored in
+ interleaved format to support lookup scheme. */
+ float P10_2, P29_3, Q10_2, Q29_2;
+ float P10_0, P29_1, P10_1, P29_2;
+ float Q10_0, Q29_0, Q10_1, Q29_1;
+ float P29_0, P_50[6], Q_50[2], tailshift;
+ struct sv_logf_data logf_tbl;
+} data = { .P10_0 = -0x1.a31268p+3,
+ .P10_1 = 0x1.ac9048p+4,
+ .P10_2 = -0x1.293ff6p+3,
+ .P29_0 = -0x1.fc0252p-4,
+ .P29_1 = 0x1.119d44p+0,
+ .P29_2 = -0x1.f59ee2p+0,
+ .P29_3 = 0x1.b13626p-2,
+ .Q10_0 = -0x1.8265eep+3,
+ .Q10_1 = 0x1.ef5eaep+4,
+ .Q10_2 = -0x1.12665p+4,
+ .Q29_0 = -0x1.69952p-4,
+ .Q29_1 = 0x1.c7b7d2p-1,
+ .Q29_2 = -0x1.167d7p+1,
+ .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1,
+ -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 },
+ .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0 },
+ .tailshift = -0.87890625,
+ .logf_tbl = SV_LOGF_CONSTANTS };
+
+static inline svfloat32_t
+special (svbool_t pg, svfloat32_t x, const struct data *d)
+{
+ svfloat32_t ax = svabs_x (pg, x);
+ svfloat32_t t = svdivr_x (
+ pg,
+ svsqrt_x (pg, svneg_x (pg, sv_logf_inline (pg, svsubr_x (pg, ax, 1),
+ &d->logf_tbl))),
+ 1);
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (ax), svreinterpret_u32 (x));
+ svfloat32_t ts
+ = svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (t)));
+ svfloat32_t q
+ = svmla_x (pg, sv_f32 (d->Q_50[0]), svadd_x (pg, t, d->Q_50[1]), t);
+ return svdiv_x (pg, sv_horner_5_f32_x (pg, t, d->P_50), svmul_x (pg, ts, q));
+}
+
+static inline svfloat32_t
+notails (svbool_t pg, svfloat32_t x, const struct data *d)
+{
+ /* Shortcut when no input is in a tail region - no need to gather shift or
+ coefficients. */
+ svfloat32_t t = svmad_x (pg, x, x, -0.5625);
+ svfloat32_t q = svadd_x (pg, t, d->Q10_2);
+ q = svmad_x (pg, t, q, d->Q10_1);
+ q = svmad_x (pg, t, q, d->Q10_0);
+
+ svfloat32_t p = svmla_x (pg, sv_f32 (d->P10_1), t, d->P10_2);
+ p = svmad_x (pg, p, t, d->P10_0);
+
+ return svdiv_x (pg, svmul_x (pg, x, p), q);
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+ error function in single-precision. Worst-case error is 4.71 ULP, in the
+ tail region:
+ _ZGVsMxv_erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0
+ want 0x1.b83274p+0. */
+svfloat32_t SV_NAME_F1 (erfinv) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Calculate inverse error using algorithm described in
+ J. M. Blair, C. A. Edwards, and J. H. Johnson,
+ "Rational Chebyshev approximations for the inverse of the error function",
+ Math. Comp. 30, pp. 827--830 (1976).
+ https://doi.org/10.1090/S0025-5718-1976-0421040-7. */
+
+ /* Algorithm has 3 intervals:
+ - 'Normal' region [-0.75, 0.75]
+ - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
+ - Extreme tail [-1, -0.9375] U [0.9375, 1]
+ Normal and tail are both rational approximation of similar order on
+ shifted input - these are typically performed in parallel using gather
+ loads to obtain correct coefficients depending on interval. */
+ svbool_t is_tail = svacge (pg, x, 0.75);
+ svbool_t extreme_tail = svacge (pg, x, 0.9375);
+
+ if (likely (!svptest_any (pg, is_tail)))
+ return notails (pg, x, d);
+
+ /* Select requisite shift depending on interval: polynomial is evaluated on
+ x * x - shift.
+ Normal shift = 0.5625
+ Tail shift = 0.87890625. */
+ svfloat32_t t = svmla_x (
+ pg, svsel (is_tail, sv_f32 (d->tailshift), sv_f32 (-0.5625)), x, x);
+
+ svuint32_t idx = svdup_u32_z (is_tail, 1);
+ svuint32_t idxhi = svadd_x (pg, idx, 2);
+
+ /* Load coeffs in quadwords and select them according to interval. */
+ svfloat32_t pqhi = svld1rq (svptrue_b32 (), &d->P10_2);
+ svfloat32_t plo = svld1rq (svptrue_b32 (), &d->P10_0);
+ svfloat32_t qlo = svld1rq (svptrue_b32 (), &d->Q10_0);
+
+ svfloat32_t p2 = svtbl (pqhi, idx);
+ svfloat32_t p1 = svtbl (plo, idxhi);
+ svfloat32_t p0 = svtbl (plo, idx);
+ svfloat32_t q0 = svtbl (qlo, idx);
+ svfloat32_t q1 = svtbl (qlo, idxhi);
+ svfloat32_t q2 = svtbl (pqhi, idxhi);
+
+ svfloat32_t p = svmla_x (pg, p1, p2, t);
+ p = svmla_x (pg, p0, p, t);
+ /* Tail polynomial has higher order - merge with normal lanes. */
+ p = svmad_m (is_tail, p, t, d->P29_0);
+ svfloat32_t y = svmul_x (pg, x, p);
+
+ /* Least significant term of both Q polynomials is 1, so no need to generate
+ it. */
+ svfloat32_t q = svadd_x (pg, t, q2);
+ q = svmla_x (pg, q1, q, t);
+ q = svmla_x (pg, q0, q, t);
+
+ if (unlikely (svptest_any (pg, extreme_tail)))
+ return svsel (extreme_tail, special (extreme_tail, x, d),
+ svdiv_x (pg, y, q));
+ return svdiv_x (pg, y, q);
+}
+
+#if USE_MPFR
+# warning Not generating tests for _ZGVsMxv_erfinvf, as MPFR has no suitable reference
+#else
+TEST_SIG (SV, F, 1, erfinv, -0.99, 0.99)
+TEST_ULP (SV_NAME_F1 (erfinv), 4.09)
+TEST_DISABLE_FENV (SV_NAME_F1 (erfinv))
+TEST_SYM_INTERVAL (SV_NAME_F1 (erfinv), 0, 1, 40000)
+TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.95)
+#endif
+CLOSE_SVE_ATTR
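
The "load coeffs in quadwords" scheme above keeps the normal- and tail-interval variants of each coefficient adjacent, so one broadcast load plus a per-lane table lookup replaces a gather. A sketch of the idiom with a hypothetical helper name, assuming sv_math.h as used in the file above:

#include "sv_math.h"

/* quad[] holds up to four interleaved f32 coefficients.  svld1rq
   broadcasts the 128-bit quadword to every vector segment; svtbl then
   picks element idx per lane, so normal and tail lanes read different
   coefficients from the same load.  Illustration only. */
static inline svfloat32_t
select_coeff (const float *quad, svuint32_t idx)
{
  svfloat32_t q = svld1rq_f32 (svptrue_b32 (), quad);
  return svtbl (q, idx);
}
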
diff --git a/pl/math/sv_powi.c b/math/aarch64/experimental/sve/powi.c
index e53bf2195533..62dd1b114970 100644
--- a/pl/math/sv_powi.c
+++ b/math/aarch64/experimental/sve/powi.c
@@ -1,7 +1,7 @@
/*
* Double-precision SVE powi(x, n) function.
*
- * Copyright (c) 2020-2023, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -46,3 +46,4 @@ _ZGVsMxvv_powk (svfloat64_t as, svint64_t ns, svbool_t p)
return acc;
}
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_powif.c b/math/aarch64/experimental/sve/powif.c
index 7e032fd86a20..fd74acf12df7 100644
--- a/pl/math/sv_powif.c
+++ b/math/aarch64/experimental/sve/powif.c
@@ -1,7 +1,7 @@
/*
* Single-precision SVE powi(x, n) function.
*
- * Copyright (c) 2020-2023, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -46,3 +46,4 @@ _ZGVsMxvv_powi (svfloat32_t as, svint32_t ns, svbool_t p)
return acc;
}
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/experimental/sve/sv_logf_inline.h b/math/aarch64/experimental/sve/sv_logf_inline.h
new file mode 100644
index 000000000000..c317a23f6fc3
--- /dev/null
+++ b/math/aarch64/experimental/sve/sv_logf_inline.h
@@ -0,0 +1,51 @@
+/*
+ * Single-precision vector log function - inline version.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+
+struct sv_logf_data
+{
+ float p1, p3, p5, p6, p0, p2, p4;
+ float ln2;
+ uint32_t off, mantissa_mask;
+};
+
+#define SV_LOGF_CONSTANTS \
+ { \
+ .p0 = -0x1.ffffc8p-2f, .p1 = 0x1.555d7cp-2f, .p2 = -0x1.00187cp-2f, \
+ .p3 = 0x1.961348p-3f, .p4 = -0x1.4f9934p-3f, .p5 = 0x1.5a9aa2p-3f, \
+ .p6 = -0x1.3e737cp-3f, .ln2 = 0x1.62e43p-1f, .off = 0x3f2aaaab, \
+ .mantissa_mask = 0x007fffff \
+ }
+
+static inline svfloat32_t
+sv_logf_inline (svbool_t pg, svfloat32_t x, const struct sv_logf_data *d)
+{
+ svuint32_t u = svreinterpret_u32 (x);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u = svsub_x (pg, u, d->off);
+ svfloat32_t n = svcvt_f32_s32_x (
pg, svasr_x (pg, svreinterpret_s32_u32 (u), 23)); /* sign-extend. */
+ u = svand_x (pg, u, d->mantissa_mask);
+ u = svadd_x (pg, u, d->off);
+ svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
+
+ /* y = log(1+r) + n*ln2. */
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+ svfloat32_t p1356 = svld1rq_f32 (svptrue_b32 (), &d->p1);
+ svfloat32_t p = svmla_lane (sv_f32 (d->p4), r, p1356, 2);
+ svfloat32_t q = svmla_lane (sv_f32 (d->p2), r, p1356, 1);
+ svfloat32_t y = svmla_lane (sv_f32 (d->p0), r, p1356, 0);
+ p = svmla_lane (p, r2, p1356, 3);
+ q = svmla_x (pg, q, p, r2);
+ y = svmla_x (pg, y, q, r2);
+ p = svmla_x (pg, r, n, d->ln2);
+
+ return svmla_x (pg, p, y, r2);
+}
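
The core of sv_logf_inline is the bit-level reduction x = 2^n * (1+r) with 1+r in roughly (2/3, 4/3), so |r| < 1/3 and the short polynomial for log(1+r) suffices. A scalar sketch of that reduction step, for illustration (off = 0x3f2aaaab is approximately asuint(2/3)):

#include <stdint.h>
#include <string.h>

/* Split x into n and r with x = 2^n * (1 + r), 1 + r in (2/3, 4/3):
   subtracting off re-centres the binade boundary at 2/3 so the top
   bits give n and the re-biased mantissa gives 1 + r.  Sketch only;
   assumes x is positive, finite and normal. */
static float
logf_reduce (float x, int32_t *n)
{
  const uint32_t off = 0x3f2aaaab;
  uint32_t u;
  memcpy (&u, &x, sizeof (u));
  u -= off;
  *n = (int32_t) u >> 23;     /* arithmetic shift sign-extends n. */
  u = (u & 0x007fffff) + off; /* keep mantissa bits, re-bias. */
  float onepr;
  memcpy (&onepr, &u, sizeof (onepr));
  return onepr - 1.0f;        /* r. */
}
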
diff --git a/pl/math/tanf_3u3.c b/math/aarch64/experimental/tanf_3u3.c
index 30c86fa89730..c26e92db588f 100644
--- a/pl/math/tanf_3u3.c
+++ b/math/aarch64/experimental/tanf_3u3.c
@@ -1,12 +1,12 @@
/*
* Single-precision scalar tan(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#include "poly_scalar_f32.h"
/* Useful constants. */
@@ -52,14 +52,6 @@ reduce (float x, int32_t *in)
return r;
}
-/* Table with 4/PI to 192 bit precision. To avoid unaligned accesses
- only 8 new bits are added per entry, making the table 4 times larger. */
-static const uint32_t __inv_pio4[24]
- = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44,
- 0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1,
- 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62,
- 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041};
-
/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
@@ -130,11 +122,11 @@ tanf (float x)
return fmaf (x2, x * y, x);
}
/* Similar to other trigonometric routines, fast inaccurate reduction is
- performed for values of x from pi/4 up to RangeVal. In order to keep errors
- below 3.5ulps, we set the value of RangeVal to 2^17. This might differ for
- other trigonometric routines. Above this value more advanced but slower
- reduction techniques need to be implemented to reach a similar accuracy.
- */
+ performed for values of x from pi/4 up to RangeVal. In order to keep
+ errors below 3.5ulps, we set the value of RangeVal to 2^17. This might
+ differ for other trigonometric routines. Above this value more advanced
+ but slower reduction techniques need to be implemented to reach a similar
+ accuracy. */
else if (ia12 < top12 (RangeVal))
{
/* Fast inaccurate reduction. */
@@ -182,12 +174,12 @@ tanf (float x)
return fmaf (scale, p, offset);
}
-PL_SIG (S, F, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (tanf, 2.80)
-PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000)
-PL_TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000)
-PL_TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000)
-PL_TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000)
-PL_TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000)
-PL_TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000)
-PL_TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000)
+TEST_SIG (S, F, 1, tan, -3.1, 3.1)
+TEST_ULP (tanf, 2.80)
+TEST_INTERVAL (tanf, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000)
+TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000)
+TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000)
+TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000)
+TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000)
+TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000)
diff --git a/pl/math/tanf_data.c b/math/aarch64/experimental/tanf_data.c
index a6b9d512eed2..f310cd77d4ec 100644
--- a/pl/math/tanf_data.c
+++ b/math/aarch64/experimental/tanf_data.c
@@ -1,7 +1,7 @@
/*
* Data used in single-precision tan(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/tanh_3u.c b/math/aarch64/experimental/tanh_3u.c
index 86f2904afc32..838b6c4f12c1 100644
--- a/pl/math/tanh_3u.c
+++ b/math/aarch64/experimental/tanh_3u.c
@@ -1,13 +1,13 @@
/*
* Double-precision tanh(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include "poly_scalar_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define AbsMask 0x7fffffffffffffff
#define InvLn2 0x1.71547652b82fep0
@@ -15,8 +15,10 @@
#define Ln2lo 0x1.abc9e3b39803fp-56
#define Shift 0x1.8p52
-#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */
-#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */
+/* asuint64 (0x1.241bf835f9d5fp+4). */
+#define BoringBound 0x403241bf835f9d5f
+/* asuint64 (0x1p-27). */
+#define TinyBound 0x3e40000000000000
#define One 0x3ff0000000000000
static inline double
@@ -71,8 +73,8 @@ tanh (double x)
return q / (q + 2);
}
-PL_SIG (S, D, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (tanh, 2.27)
-PL_TEST_SYM_INTERVAL (tanh, 0, TinyBound, 1000)
-PL_TEST_SYM_INTERVAL (tanh, TinyBound, BoringBound, 100000)
-PL_TEST_SYM_INTERVAL (tanh, BoringBound, inf, 1000)
+TEST_SIG (S, D, 1, tanh, -10.0, 10.0)
+TEST_ULP (tanh, 2.27)
+TEST_SYM_INTERVAL (tanh, 0, TinyBound, 1000)
+TEST_SYM_INTERVAL (tanh, TinyBound, BoringBound, 100000)
+TEST_SYM_INTERVAL (tanh, BoringBound, inf, 1000)
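
The q / (q + 2) expression returned by tanh above comes from rewriting tanh in terms of expm1; a minimal scalar sketch of the identity (for exposition only, not part of this patch):

#include <math.h>

/* tanh(x) = (e^(2x) - 1) / (e^(2x) + 1) = expm1(2x) / (expm1(2x) + 2).  */
static double
tanh_sketch (double x)
{
  double q = expm1 (2 * x);
  return q / (q + 2);
}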
diff --git a/pl/math/tanhf_2u6.c b/math/aarch64/experimental/tanhf_2u6.c
index 93ea3cf5d865..d9adae5c3a76 100644
--- a/pl/math/tanhf_2u6.c
+++ b/math/aarch64/experimental/tanhf_2u6.c
@@ -1,16 +1,15 @@
/*
* Single-precision tanh(x) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
-#define BoringBound \
- 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \
- negative). */
+/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+#define BoringBound 0x41102cb3
#define AbsMask 0x7fffffff
#define One 0x3f800000
@@ -26,8 +25,8 @@ expm1f_inline (float x)
{
/* Helper routine for calculating exp(x) - 1.
Copied from expm1f_1u6.c, with several simplifications:
- - No special-case handling for tiny or special values, instead return early
- from the main routine.
+ - No special-case handling for tiny or special values, instead return
+ early from the main routine.
- No special handling for large values:
- No early return for infinity.
- Simpler combination of p and t in final stage of algorithm.
@@ -81,8 +80,8 @@ tanhf (float x)
return q / (q + 2);
}
-PL_SIG (S, F, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (tanhf, 2.09)
-PL_TEST_SYM_INTERVAL (tanhf, 0, 0x1p-23, 1000)
-PL_TEST_SYM_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000)
-PL_TEST_SYM_INTERVAL (tanhf, 0x1.205966p+3, inf, 100)
+TEST_SIG (S, F, 1, tanh, -10.0, 10.0)
+TEST_ULP (tanhf, 2.09)
+TEST_SYM_INTERVAL (tanhf, 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000)
+TEST_SYM_INTERVAL (tanhf, 0x1.205966p+3, inf, 100)
diff --git a/math/aarch64/sincospi_4u.c b/math/aarch64/sincospi_4u.c
new file mode 100644
index 000000000000..2a944bed23e1
--- /dev/null
+++ b/math/aarch64/sincospi_4u.c
@@ -0,0 +1,158 @@
+/*
+ * Double-precision scalar sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f64.h"
+
+/* Taylor series coefficients for sin(pi * x).
+ The C2 coefficient (originally ~=5.16771278) has been split into two parts:
+ C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278)
+ This change in magnitude reduces floating point rounding errors.
+ C2_hi is then reintroduced after the polynomial approximation. */
+const static struct sincospi_data
+{
+ double poly[10];
+} sincospi_data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
+ -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+ 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16,
+ 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 },
+};
+
+/* Top 12 bits of a double, with the sign bit cleared. */
+static inline uint64_t
+abstop12 (double x)
+{
+ return (asuint64 (x) >> 52) & 0x7ff;
+}
+
+/* Triages special cases into 4 categories:
+ -1 or +1 if iy represents half an integer.
+ -1 if round(y) is odd.
+ +1 if round(y) is even.
+ -2 or +2 if iy represents an integer.
+ -2 if iy is odd.
+ +2 if iy is even.
+ The argument is the bit representation of a positive non-zero
+ finite floating-point value which is either a half or an integer. */
+static inline int
+checkint (uint64_t iy)
+{
+ int e = iy >> 52;
+ if (e > 0x3ff + 52)
+ return 2;
+ if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
+ {
+ if ((iy - 1) & 2)
+ return -1;
+ else
+ return 1;
+ }
+ if (iy & (1ULL << (0x3ff + 52 - e))) /* 1ULL: shift can exceed 31. */
+ return -2;
+ return 2;
+}
+
+/* Approximation for scalar double-precision sincospi(x).
+ Maximum error for sin: 3.46 ULP:
+ sincospi_sin(0x1.3d8a067cd8961p+14) got 0x1.ffe609a279008p-1 want
+ 0x1.ffe609a27900cp-1.
+ Maximum error for cos: 3.66 ULP:
+ sincospi_cos(0x1.a0ec6997557eep-24) got 0x1.ffffffffffe59p-1 want
+ 0x1.ffffffffffe5dp-1. */
+void
+arm_math_sincospi (double x, double *out_sin, double *out_cos)
+{
+ const struct sincospi_data *d = ptr_barrier (&sincospi_data);
+ uint64_t sign = asuint64 (x) & 0x8000000000000000;
+
+ if (likely (abstop12 (x) < abstop12 (0x1p51)))
+ {
+ /* ar_s = x - n (range reduction into -1/2 .. 1/2). */
+ double ar_s = x - rint (x);
+
+ /* We know that cospi(x) = sinpi(0.5 - x),
+ so range reduction and offset into the sinpi range -1/2 .. 1/2 gives
+ ar_c = 0.5 - |x - rint(x)|. */
+ double ar_c = 0.5 - fabs (ar_s);
+
+ /* ss = sin(pi * ar_s). */
+ double ar2_s = ar_s * ar_s;
+ double ar2_c = ar_c * ar_c;
+ double ar4_s = ar2_s * ar2_s;
+ double ar4_c = ar2_c * ar2_c;
+
+ uint64_t cc_sign = ((uint64_t) llrint (x)) << 63;
+ uint64_t ss_sign = cc_sign;
+ if (ar_s == 0)
+ ss_sign = sign;
+
+ double ss = pw_horner_9_f64 (ar2_s, ar4_s, d->poly);
+ double cc = pw_horner_9_f64 (ar2_c, ar4_c, d->poly);
+
+ /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will
+ always be positive; therefore the sign must be introduced based
+ upon whether x rounds to odd or even. For sin(x) the sign is
+ copied from x. */
+ *out_sin
+ = asdouble (asuint64 (fma (-4 * ar2_s, ar_s, ss * ar_s)) ^ ss_sign);
+ *out_cos
+ = asdouble (asuint64 (fma (-4 * ar2_c, ar_c, cc * ar_c)) ^ cc_sign);
+ }
+ else
+ {
+ /* When abs(x) > 0x1p51, x will be one of:
+ - Half integer (relevant if abs(x) in [0x1p51, 0x1p52])
+ - Odd integer (relevant if abs(x) in [0x1p52, 0x1p53])
+ - Even integer (relevant if abs(x) in [0x1p53, inf])
+ - Inf or NaN. */
+ if (abstop12 (x) >= 0x7ff)
+ {
+ double inv_result = __math_invalid (x);
+ *out_sin = inv_result;
+ *out_cos = inv_result;
+ return;
+ }
+ else
+ {
+ uint64_t ax = asuint64 (x) & 0x7fffffffffffffff;
+ int m = checkint (ax);
+ /* The case where ax is half integer. */
+ if (m & 1)
+ {
+ *out_sin = sign ? -m : m;
+ *out_cos = 0;
+ return;
+ }
+ /* The case where ax is integer. */
+ else
+ {
+ *out_sin = asdouble (sign);
+ *out_cos = m >> 1;
+ return;
+ }
+ }
+ }
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (arm_math_sincospi_sin)
+TEST_DISABLE_FENV (arm_math_sincospi_cos)
+TEST_ULP (arm_math_sincospi_sin, 2.96)
+TEST_ULP (arm_math_sincospi_cos, 3.16)
+# define SINCOS_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (arm_math_sincospi_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (arm_math_sincospi_cos, lo, hi, n)
+SINCOS_INTERVAL (0, 0x1p-63, 10000)
+SINCOS_INTERVAL (0x1p-63, 0.5, 50000)
+SINCOS_INTERVAL (0.5, 0x1p51, 50000)
+SINCOS_INTERVAL (0x1p51, inf, 10000)
+#endif
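
A minimal caller of the new scalar entry point (for exposition only, not part of this patch):

#include <stdio.h>

void arm_math_sincospi (double x, double *out_sin, double *out_cos);

int
main (void)
{
  double s, c;
  arm_math_sincospi (0.25, &s, &c); /* Both results are ~sqrt(2)/2.  */
  printf ("sinpi(0.25) = %a, cospi(0.25) = %a\n", s, c);
  arm_math_sincospi (-2.0, &s, &c); /* Integer input: s = -0.0, c = 1.  */
  printf ("sinpi(-2.0) = %g, cospi(-2.0) = %g\n", s, c);
  return 0;
}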
diff --git a/math/aarch64/sincospif_3u2.c b/math/aarch64/sincospif_3u2.c
new file mode 100644
index 000000000000..b79694d2ac65
--- /dev/null
+++ b/math/aarch64/sincospif_3u2.c
@@ -0,0 +1,145 @@
+/*
+ * Single-precision scalar sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f32.h"
+
+/* Taylor series coefficients for sin(pi * x). */
+const static struct sincospif_data
+{
+ float poly[6];
+} sincospif_data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+ 0x1.50783p-4f, -0x1.e30750p-8f },
+};
+
+/* Top 12 bits of the float representation with the sign bit cleared. */
+static inline uint32_t
+abstop12 (float x)
+{
+ return (asuint (x) >> 20) & 0x7ff;
+}
+
+/* Triages special cases into 4 categories:
+ -1 or +1 if iy represents half an integer.
+ -1 if round(y) is odd.
+ +1 if round(y) is even.
+ -2 or +2 if iy represents an integer.
+ -2 if iy is odd.
+ +2 if iy is even.
+ The argument is the bit representation of a positive non-zero
+ finite floating-point value which is either a half or an integer. */
+static inline int
+checkint (uint32_t iy)
+{
+ int e = iy >> 23;
+ if (e > 0x7f + 23)
+ return 2;
+ if (iy & ((1 << (0x7f + 23 - e)) - 1))
+ {
+ if ((iy - 1) & 2)
+ return -1;
+ else
+ return 1;
+ }
+ if (iy & (1 << (0x7f + 23 - e)))
+ return -2;
+ return 2;
+}
+
+/* Approximation for scalar single-precision sincospif(x).
+ Maximum error for sin: 3.04 ULP:
+ sincospif_sin(0x1.c597ccp-2) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ Maximum error for cos: 3.18 ULP:
+ sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. */
+void
+arm_math_sincospif (float x, float *out_sin, float *out_cos)
+{
+
+ const struct sincospif_data *d = ptr_barrier (&sincospif_data);
+ uint32_t sign = asuint (x) & 0x80000000;
+
+ /* abs(x) in [0, 0x1p22]. */
+ if (likely (abstop12 (x) < abstop12 (0x1p22)))
+ {
+ /* ar_s = x - n (range reduction into -1/2 .. 1/2). */
+ float ar_s = x - rintf (x);
+ /* We know that cospi(x) = sinpi(0.5 - x),
+ so range reduction and offset into the sinpi range -1/2 .. 1/2 gives
+ ar_c = 0.5 - |x - n|. */
+ float ar_c = 0.5f - fabsf (ar_s);
+
+ float ar2_s = ar_s * ar_s;
+ float ar2_c = ar_c * ar_c;
+ float ar4_s = ar2_s * ar2_s;
+ float ar4_c = ar2_c * ar2_c;
+
+ uint32_t cc_sign = (uint32_t) lrintf (x) << 31;
+ uint32_t ss_sign = cc_sign;
+ if (ar_s == 0)
+ ss_sign = sign;
+
+ /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will
+ always be positive; therefore the sign must be introduced based
+ upon whether x rounds to odd or even. For sin(x) the sign is
+ copied from x. */
+ *out_sin = pw_horner_5_f32 (ar2_s, ar4_s, d->poly)
+ * asfloat (asuint (ar_s) ^ ss_sign);
+ *out_cos = pw_horner_5_f32 (ar2_c, ar4_c, d->poly)
+ * asfloat (asuint (ar_c) ^ cc_sign);
+ return;
+ }
+ else
+ {
+ /* When abs(x) > 0x1p22, x will be one of:
+ - Half integer (relevant if abs(x) in [0x1p22, 0x1p23])
+ - Odd integer (relevant if abs(x) in [0x1p23, 0x1p24])
+ - Even integer (relevant if abs(x) in [0x1p24, inf])
+ - Inf or NaN. */
+ if (abstop12 (x) >= 0x7f8)
+ {
+ float inv_result = __math_invalidf (x);
+ *out_sin = inv_result;
+ *out_cos = inv_result;
+ return;
+ }
+ else
+ {
+ uint32_t ax = asuint (x) & 0x7fffffff;
+ int m = checkint (ax);
+ if (m & 1)
+ {
+ *out_sin = sign ? -m : m;
+ *out_cos = 0;
+ return;
+ }
+ else
+ {
+ *out_sin = asfloat (sign);
+ *out_cos = m >> 1;
+ return;
+ }
+ }
+ }
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (arm_math_sincospif_sin)
+TEST_DISABLE_FENV (arm_math_sincospif_cos)
+TEST_ULP (arm_math_sincospif_sin, 2.54)
+TEST_ULP (arm_math_sincospif_cos, 2.68)
+# define SINCOSPIF_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (arm_math_sincospif_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (arm_math_sincospif_cos, lo, hi, n)
+SINCOSPIF_INTERVAL (0, 0x1p-31, 10000)
+SINCOSPIF_INTERVAL (0x1p-31, 1, 50000)
+SINCOSPIF_INTERVAL (1, 0x1p22f, 50000)
+SINCOSPIF_INTERVAL (0x1p22f, inf, 10000)
+#endif
diff --git a/pl/math/sinpi_3u.c b/math/aarch64/sinpi_3u5.c
index a04a352a62e6..f96d9a312b53 100644
--- a/pl/math/sinpi_3u.c
+++ b/math/aarch64/sinpi_3u5.c
@@ -1,7 +1,7 @@
/*
* Double-precision scalar sinpi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,8 +9,8 @@
#include <math.h>
#include "mathlib.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#include "poly_scalar_f64.h"
 /* Taylor series coefficients for sin(pi * x).
@@ -25,15 +25,17 @@ static const double poly[]
-0x1.012a9870eeb7dp-25 };
#define Shift 0x1.8p+52
+/* TODO Store constant in structure for more efficient load. */
+#define Pi 0x1.921fb54442d18p+1
/* Approximation for scalar double-precision sinpi(x).
Maximum error: 3.03 ULP:
sinpi(0x1.a90da2818f8b5p+7) got 0x1.fe358f255a4b3p-1
want 0x1.fe358f255a4b6p-1. */
double
-sinpi (double x)
+arm_math_sinpi (double x)
{
- if (isinf (x))
+ if (isinf (x) || isnan (x))
return __math_invalid (x);
double r = asdouble (asuint64 (x) & ~0x8000000000000000);
@@ -42,17 +44,17 @@ sinpi (double x)
 /* Edge cases for when sinpi should be exactly 0. (Integers)
 0x1p53 is the limit for double precision to store any decimal places. */
if (r >= 0x1p53)
- return 0;
+ return asdouble (sign);
/* If x is an integer, return 0. */
uint64_t m = (uint64_t) r;
if (r == m)
- return 0;
+ return asdouble (sign);
/* For very small inputs, squaring r causes underflow.
Values below this threshold can be approximated via sinpi(x) ≈ pi*x. */
if (r < 0x1p-63)
- return M_PI * x;
+ return Pi * x;
 /* Any non-integer values >= 0x1p51 will be int + 0.5.
These values should return exactly 1 or -1. */
@@ -82,9 +84,18 @@ sinpi (double x)
return asdouble (asuint64 (y) ^ sign);
}
-PL_SIG (S, D, 1, sinpi, -0.9, 0.9)
-PL_TEST_ULP (sinpi, 2.53)
-PL_TEST_SYM_INTERVAL (sinpi, 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (sinpi, 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (sinpi, 0.5, 0x1p51, 10000)
-PL_TEST_SYM_INTERVAL (sinpi, 0x1p51, inf, 10000)
+#if WANT_EXPERIMENTAL_MATH
+double
+sinpi (double x)
+{
+ return arm_math_sinpi (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_sinpi, 2.53)
+TEST_SYM_INTERVAL (arm_math_sinpi, 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (arm_math_sinpi, 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (arm_math_sinpi, 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (arm_math_sinpi, 0x1p51, inf, 10000)
+#endif
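
The change from return 0 to return asdouble (sign) makes integer inputs return a zero carrying the sign of x. A small scalar illustration of the bit trick (for exposition only; asdouble_sketch stands in for the library's asdouble helper):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double
asdouble_sketch (uint64_t u)
{
  double d;
  memcpy (&d, &u, sizeof d);
  return d;
}

int
main (void)
{
  uint64_t sign = 0x8000000000000000ULL; /* Sign bit of a negative input.  */
  /* asdouble (sign) is -0.0, so sinpi of a negative integer now returns
     -0.0 rather than +0.0.  */
  printf ("%g %g\n", asdouble_sketch (0), asdouble_sketch (sign));
  return 0;
}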
diff --git a/pl/math/sinpif_2u5.c b/math/aarch64/sinpif_2u5.c
index af9ca0573b37..b5d9cd914577 100644
--- a/pl/math/sinpif_2u5.c
+++ b/math/aarch64/sinpif_2u5.c
@@ -1,14 +1,14 @@
/*
* Single-precision scalar sinpi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "mathlib.h"
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
 /* Taylor series coefficients for sin(pi * x). */
#define C0 0x1.921fb6p1f
@@ -25,9 +25,9 @@
sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1
want 0x1.fa8c02p-1. */
float
-sinpif (float x)
+arm_math_sinpif (float x)
{
- if (isinf (x))
+ if (isinf (x) || isnan (x))
return __math_invalidf (x);
float r = asfloat (asuint (x) & ~0x80000000);
@@ -36,11 +36,11 @@ sinpif (float x)
/* Edge cases for when sinpif should be exactly 0. (Integers)
0x1p23 is the limit for single precision to store any decimal places. */
if (r >= 0x1p23f)
- return 0;
+ return asfloat (sign);
int32_t m = roundf (r);
if (m == r)
- return 0;
+ return asfloat (sign);
/* For very small inputs, squaring r causes underflow.
Values below this threshold can be approximated via sinpi(x) ~= pi*x. */
@@ -75,9 +75,18 @@ sinpif (float x)
return asfloat (asuint (y * r) ^ sign);
}
-PL_SIG (S, F, 1, sinpi, -0.9, 0.9)
-PL_TEST_ULP (sinpif, 1.99)
-PL_TEST_SYM_INTERVAL (sinpif, 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (sinpif, 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (sinpif, 0.5, 0x1p22f, 10000)
-PL_TEST_SYM_INTERVAL (sinpif, 0x1p22f, inf, 10000)
+#if WANT_EXPERIMENTAL_MATH
+float
+sinpif (float x)
+{
+ return arm_math_sinpif (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_sinpif, 1.99)
+TEST_SYM_INTERVAL (arm_math_sinpif, 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (arm_math_sinpif, 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (arm_math_sinpif, 0.5, 0x1p22f, 10000)
+TEST_SYM_INTERVAL (arm_math_sinpif, 0x1p22f, inf, 10000)
+#endif
diff --git a/pl/math/sv_acos_2u.c b/math/aarch64/sve/acos.c
index e06db6cae6af..da633392aa3e 100644
--- a/pl/math/sv_acos_2u.c
+++ b/math/aarch64/sve/acos.c
@@ -1,14 +1,14 @@
/*
* Double-precision SVE acos(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "sv_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -82,10 +82,12 @@ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg)
return svmla_x (pg, add, mul, y);
}
-PL_SIG (SV, D, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (SV_NAME_D1 (acos), 1.02)
-PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0, 0.5, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (acos), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (SV_NAME_D1 (acos), -0, -inf, 20000)
+TEST_SIG (SV, D, 1, acos, -1.0, 1.0)
+TEST_ULP (SV_NAME_D1 (acos), 1.02)
+TEST_DISABLE_FENV (SV_NAME_D1 (acos))
+TEST_INTERVAL (SV_NAME_D1 (acos), 0, 0.5, 50000)
+TEST_INTERVAL (SV_NAME_D1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_D1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (SV_NAME_D1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (SV_NAME_D1 (acos), -0, -inf, 20000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_acosf_1u4.c b/math/aarch64/sve/acosf.c
index 7ac59ceedfbd..86b7822cefc3 100644
--- a/pl/math/sv_acosf_1u4.c
+++ b/math/aarch64/sve/acosf.c
@@ -1,14 +1,14 @@
/*
* Single-precision SVE acos(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "sv_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -75,10 +75,12 @@ svfloat32_t SV_NAME_F1 (acos) (svfloat32_t x, const svbool_t pg)
return svmla_x (pg, add, mul, y);
}
-PL_SIG (SV, F, 1, acos, -1.0, 1.0)
-PL_TEST_ULP (SV_NAME_F1 (acos), 0.82)
-PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0, 0.5, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (acos), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (SV_NAME_F1 (acos), -0, -inf, 20000)
+TEST_SIG (SV, F, 1, acos, -1.0, 1.0)
+TEST_ULP (SV_NAME_F1 (acos), 0.82)
+TEST_DISABLE_FENV (SV_NAME_F1 (acos))
+TEST_INTERVAL (SV_NAME_F1 (acos), 0, 0.5, 50000)
+TEST_INTERVAL (SV_NAME_F1 (acos), 0.5, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_F1 (acos), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (SV_NAME_F1 (acos), 0x1p11, inf, 20000)
+TEST_INTERVAL (SV_NAME_F1 (acos), -0, -inf, 20000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/acosh.c b/math/aarch64/sve/acosh.c
new file mode 100644
index 000000000000..d54c21922e1b
--- /dev/null
+++ b/math/aarch64/sve/acosh.c
@@ -0,0 +1,51 @@
+/*
+ * Double-precision SVE acosh(x) function.
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 1
+#include "sv_log1p_inline.h"
+
+#define One (0x3ff0000000000000)
+#define Thres (0x1ff0000000000000) /* asuint64 (0x1p511) - One. */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (acosh, x, y, special);
+}
+
+/* SVE approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.19 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
+ want 0x1.ed23399f51373p-2. */
+svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
+{
+ /* (ix - One) >= (BigBound - One). */
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat64_t xm1 = svsub_x (pg, x, 1.0);
+ svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0));
+ svfloat64_t y = svadd_x (pg, xm1, svsqrt_x (pg, u));
+
+ /* Fall back to scalar routine for special lanes. */
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, sv_log1p_inline (y, pg), special);
+ return sv_log1p_inline (y, pg);
+}
+
+TEST_SIG (SV, D, 1, acosh, 1.0, 10.0)
+TEST_ULP (SV_NAME_D1 (acosh), 2.69)
+TEST_DISABLE_FENV (SV_NAME_D1 (acosh))
+TEST_INTERVAL (SV_NAME_D1 (acosh), 1, 0x1p511, 90000)
+TEST_INTERVAL (SV_NAME_D1 (acosh), 0x1p511, inf, 10000)
+TEST_INTERVAL (SV_NAME_D1 (acosh), 0, 1, 1000)
+TEST_INTERVAL (SV_NAME_D1 (acosh), -0, -inf, 10000)
+CLOSE_SVE_ATTR
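
The xm1/u/y sequence above is the standard log1p-friendly rewriting of acosh, shown here as a scalar sketch (for exposition only, not part of this patch):

#include <math.h>

/* acosh(x) = log (x + sqrt (x^2 - 1))
            = log1p ((x - 1) + sqrt ((x - 1) * (x + 1))),
   which keeps the argument of the logarithm accurate for x close to 1.  */
static double
acosh_sketch (double x)
{
  double xm1 = x - 1.0;
  double u = xm1 * (x + 1.0);
  return log1p (xm1 + sqrt (u));
}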
diff --git a/math/aarch64/sve/acoshf.c b/math/aarch64/sve/acoshf.c
new file mode 100644
index 000000000000..f48ef724e8eb
--- /dev/null
+++ b/math/aarch64/sve/acoshf.c
@@ -0,0 +1,51 @@
+/*
+ * Single-precision SVE acosh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define One 0x3f800000
+#define Thres 0x20000000 /* asuint(0x1p64) - One. */
+
+#include "sv_log1pf_inline.h"
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special)
+{
+ svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f);
+ svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ());
+ return sv_call_f32 (acoshf, x, y, special);
+}
+
+/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
+ vector acoshf and log1p.
+
+ Maximum error is 2.47 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4
+ want 0x1.e435a2p-4. */
+svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
+{
+ svuint32_t ix = svreinterpret_u32 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
+ svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
+ svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u));
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (xm1, tmp, special);
+ return sv_log1pf_inline (tmp, pg);
+}
+
+TEST_SIG (SV, F, 1, acosh, 1.0, 10.0)
+TEST_ULP (SV_NAME_F1 (acosh), 1.97)
+TEST_DISABLE_FENV (SV_NAME_F1 (acosh))
+TEST_INTERVAL (SV_NAME_F1 (acosh), 0, 1, 500)
+TEST_INTERVAL (SV_NAME_F1 (acosh), 1, 0x1p64, 100000)
+TEST_INTERVAL (SV_NAME_F1 (acosh), 0x1p64, inf, 1000)
+TEST_INTERVAL (SV_NAME_F1 (acosh), -0, -inf, 1000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_asin_3u.c b/math/aarch64/sve/asin.c
index c3dd37b145ae..cac629afae15 100644
--- a/pl/math/sv_asin_3u.c
+++ b/math/aarch64/sve/asin.c
@@ -1,14 +1,14 @@
/*
* Double-precision SVE asin(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "sv_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -42,8 +42,8 @@ static const struct data
asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
The largest observed error in this region is 2.69 ulps,
- _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
- want 0x1.110d7e85fdd53p-1. */
+ _ZGVsMxv_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
+ want 0x1.1111dd54ddf99p-1. */
svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -75,10 +75,12 @@ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg)
return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
}
-PL_SIG (SV, D, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (SV_NAME_D1 (asin), 2.19)
-PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0, 0.5, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (asin), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (SV_NAME_D1 (asin), -0, -inf, 20000)
+TEST_SIG (SV, D, 1, asin, -1.0, 1.0)
+TEST_ULP (SV_NAME_D1 (asin), 2.20)
+TEST_DISABLE_FENV (SV_NAME_D1 (asin))
+TEST_INTERVAL (SV_NAME_D1 (asin), 0, 0.5, 50000)
+TEST_INTERVAL (SV_NAME_D1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_D1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (SV_NAME_D1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (SV_NAME_D1 (asin), -0, -inf, 20000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_asinf_2u5.c b/math/aarch64/sve/asinf.c
index 8e9edc2439f5..fe94feba7a42 100644
--- a/pl/math/sv_asinf_2u5.c
+++ b/math/aarch64/sve/asinf.c
@@ -1,14 +1,14 @@
/*
* Single-precision SVE asin(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "sv_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -67,10 +67,12 @@ svfloat32_t SV_NAME_F1 (asin) (svfloat32_t x, const svbool_t pg)
return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
}
-PL_SIG (SV, F, 1, asin, -1.0, 1.0)
-PL_TEST_ULP (SV_NAME_F1 (asin), 1.91)
-PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0, 0.5, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0.5, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (asin), 1.0, 0x1p11, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0x1p11, inf, 20000)
-PL_TEST_INTERVAL (SV_NAME_F1 (asin), -0, -inf, 20000) \ No newline at end of file
+TEST_SIG (SV, F, 1, asin, -1.0, 1.0)
+TEST_ULP (SV_NAME_F1 (asin), 1.91)
+TEST_DISABLE_FENV (SV_NAME_F1 (asin))
+TEST_INTERVAL (SV_NAME_F1 (asin), 0, 0.5, 50000)
+TEST_INTERVAL (SV_NAME_F1 (asin), 0.5, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_F1 (asin), 1.0, 0x1p11, 50000)
+TEST_INTERVAL (SV_NAME_F1 (asin), 0x1p11, inf, 20000)
+TEST_INTERVAL (SV_NAME_F1 (asin), -0, -inf, 20000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/asinh.c b/math/aarch64/sve/asinh.c
new file mode 100644
index 000000000000..5574116de1e1
--- /dev/null
+++ b/math/aarch64/sve/asinh.c
@@ -0,0 +1,197 @@
+/*
+ * Double-precision SVE asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define SignMask (0x8000000000000000)
+#define One (0x3ff0000000000000)
+#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */
+#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1)
+
+static const struct data
+{
+ double even_coeffs[9];
+ double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17;
+ uint64_t off, mask;
+
+} data = {
+ /* Polynomial generated using Remez on [2^-26, 1]. */
+ .even_coeffs ={
+ -0x1.55555555554a7p-3,
+ -0x1.6db6db68332e6p-5,
+ -0x1.6e8b8b654a621p-6,
+ -0x1.c9871d10885afp-7,
+ -0x1.3ddca533e9f54p-7,
+ -0x1.b90c7099dd397p-8,
+ -0x1.d217026a669ecp-9,
+ -0x1.e0f37daef9127p-11,
+ -0x1.021a48685e287p-14, },
+
+ .c1 = 0x1.3333333326c7p-4,
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c9 = 0x1.0becef748dafcp-7,
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c17 = 0x1.93d4ba83d34dap-18,
+
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .p0 = -0x1.ffffffffffff7p-2,
+ .p1 = 0x1.55555555170d4p-2,
+ .p2 = -0x1.0000000399c27p-2,
+ .p3 = 0x1.999b2e90e94cap-3,
+ .p4 = -0x1.554e550bd501ep-3,
+ .off = 0x3fe6900900000000,
+ .mask = 0xfffULL << 52,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (asinh, x, y, special);
+}
+
+static inline svfloat64_t
+__sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
+{
+ /* Double-precision SVE log, copied from SVE log implementation with some
+ cosmetic modification and special-cases removed. See that file for details
+ of the algorithm used. */
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t i_off = svsub_x (pg, ix, d->off);
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask));
+ svfloat64_t z = svreinterpret_f64 (iz);
+
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
+
+ svfloat64_t ln2_p3 = svld1rq (svptrue_b64 (), &d->ln2);
+ svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
+
+ svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
+ svfloat64_t kd
+ = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52));
+
+ svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
+ svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
+
+ y = svmla_lane (y, r2, p1_p4, 1);
+ y = svmla_x (pg, p, r2, y);
+ y = svmla_x (pg, hi, r2, y);
+ return y;
+}
+
+/* Double-precision implementation of SVE asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+ shared with the scalar routine. The greatest observed error is 2.51 ULP,
+ in |x| >= 1:
+ _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1
+ want 0x1.e3181c43b0f39p-1. */
+svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t iax = svbic_x (pg, ix, SignMask);
+ svuint64_t sign = svand_x (pg, ix, SignMask);
+ svfloat64_t ax = svreinterpret_f64 (iax);
+ svbool_t ge1 = svcmpge (pg, iax, One);
+ svbool_t special = svcmpge (pg, iax, Thres);
+
+ /* Option 1: |x| >= 1.
+ Compute asinh(x) as log(x + sqrt(x^2 + 1)). */
+ svfloat64_t option_1 = sv_f64 (0);
+ if (likely (svptest_any (pg, ge1)))
+ {
+ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
+ option_1 = __sv_log_inline (
+ svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ The largest observed error in this region is 1.51 ULPs:
+ _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
+ want 0x1.c1e649ee2681dp-1. */
+
+ svfloat64_t option_2 = sv_f64 (0);
+ if (likely (svptest_any (pg, svnot_z (pg, ge1))))
+ {
+ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
+ svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1);
+ svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0);
+ svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1);
+ svfloat64_t p1213
+ = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0);
+ svfloat64_t p1415
+ = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1);
+ svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17);
+
+ svfloat64_t p = svmla_x (pg, p1415, x4, p1617);
+ p = svmla_x (pg, p1213, x4, p);
+ p = svmla_x (pg, p1011, x4, p);
+ p = svmla_x (pg, p89, x4, p);
+
+ p = svmla_x (pg, p67, x4, p);
+ p = svmla_x (pg, p45, x4, p);
+
+ p = svmla_x (pg, p23, x4, p);
+
+ p = svmla_x (pg, p01, x4, p);
+
+ option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax));
+ }
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (
+ x,
+ svreinterpret_f64 (sveor_x (
+ pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)),
+ special);
+
+ /* Choose the right option for each lane. */
+ svfloat64_t y = svsel (ge1, option_1, option_2);
+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+}
+
+TEST_SIG (SV, D, 1, asinh, -10.0, 10.0)
+TEST_ULP (SV_NAME_D1 (asinh), 2.52)
+TEST_DISABLE_FENV (SV_NAME_D1 (asinh))
+TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0, 0x1p-26, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0x1p-26, 1, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 1, 0x1p511, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0x1p511, inf, 40000)
+/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
+ Ensures the svsel is choosing the right option in all cases. */
+TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 0.5)
+TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 2)
+TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 0x1p600)
+CLOSE_SVE_ATTR
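
The p01 .. p1617 ladder above is an order-17 pairwise Horner evaluation in x2 and x4. Stripped of the lane-indexed loads, it reduces to the following scalar scheme (for exposition only; c[] stands for the 18 logical coefficients, which the file stores split between even_coeffs and c1..c17 for efficient lane-indexed loads):

/* Pairwise Horner: pair adjacent coefficients with one multiply by z,
   then run Horner in z2 = z * z, halving the dependency chain.  In the
   SVE routine z is x2 and z2 is x4.  */
static double
pairwise_horner_17 (double z, const double c[18])
{
  double z2 = z * z;
  double p01 = c[0] + z * c[1];
  double p23 = c[2] + z * c[3];
  double p45 = c[4] + z * c[5];
  double p67 = c[6] + z * c[7];
  double p89 = c[8] + z * c[9];
  double p1011 = c[10] + z * c[11];
  double p1213 = c[12] + z * c[13];
  double p1415 = c[14] + z * c[15];
  double p1617 = c[16] + z * c[17];
  double p = p1415 + z2 * p1617;
  p = p1213 + z2 * p;
  p = p1011 + z2 * p;
  p = p89 + z2 * p;
  p = p67 + z2 * p;
  p = p45 + z2 * p;
  p = p23 + z2 * p;
  return p01 + z2 * p;
}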
diff --git a/pl/math/sv_asinhf_2u5.c b/math/aarch64/sve/asinhf.c
index 1f1f6e5c846f..32aedbfd3a6d 100644
--- a/pl/math/sv_asinhf_2u5.c
+++ b/math/aarch64/sve/asinhf.c
@@ -1,31 +1,33 @@
/*
* Single-precision SVE asinh(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "include/mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#include "sv_log1pf_inline.h"
-#define BigBound (0x5f800000) /* asuint(0x1p64). */
+#define BigBound 0x5f800000 /* asuint(0x1p64). */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svreinterpret_f32 (
+ svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y)));
return sv_call_f32 (asinhf, x, y, special);
}
/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
vector asinhf and log1p.
- Maximum error is 2.48 ULPs:
- SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
- want 0x1.ffbbb8p-4. */
+ Maximum error is 1.92 ULPs:
+ SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2
+ want -0x1.fd0bc8p-2. */
svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -41,15 +43,15 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
= sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
if (unlikely (svptest_any (pg, special)))
- return special_case (
- x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
- special);
+ return special_case (iax, sign, y, special);
return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
}
-PL_SIG (SV, F, 1, asinh, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_F1 (asinh), 1.98)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0, 0x1p-12, 4000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p-12, 1.0, 20000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 1.0, 0x1p64, 20000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p64, inf, 4000)
+TEST_SIG (SV, F, 1, asinh, -10.0, 10.0)
+TEST_ULP (SV_NAME_F1 (asinh), 1.43)
+TEST_DISABLE_FENV (SV_NAME_F1 (asinh))
+TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0, 0x1p-12, 4000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p-12, 1.0, 20000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 1.0, 0x1p64, 20000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p64, inf, 4000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_atan_2u5.c b/math/aarch64/sve/atan.c
index 7ab486a4c9d2..73fc29a94f23 100644
--- a/pl/math/sv_atan_2u5.c
+++ b/math/aarch64/sve/atan.c
@@ -1,14 +1,14 @@
/*
* Double-precision vector atan(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f64.h"
static const struct data
{
@@ -79,9 +79,11 @@ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg)
return y;
}
-PL_SIG (SV, D, 1, atan, -3.1, 3.1)
-PL_TEST_ULP (SV_NAME_D1 (atan), 1.78)
-PL_TEST_INTERVAL (SV_NAME_D1 (atan), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_D1 (atan), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_D1 (atan), 100, inf, 40000)
-PL_TEST_INTERVAL (SV_NAME_D1 (atan), -0, -inf, 40000)
+TEST_SIG (SV, D, 1, atan, -3.1, 3.1)
+TEST_ULP (SV_NAME_D1 (atan), 1.78)
+TEST_DISABLE_FENV (SV_NAME_D1 (atan))
+TEST_INTERVAL (SV_NAME_D1 (atan), 0.0, 1.0, 40000)
+TEST_INTERVAL (SV_NAME_D1 (atan), 1.0, 100.0, 40000)
+TEST_INTERVAL (SV_NAME_D1 (atan), 100, inf, 40000)
+TEST_INTERVAL (SV_NAME_D1 (atan), -0, -inf, 40000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_atan2_2u5.c b/math/aarch64/sve/atan2.c
index 00530a324a76..1e1d00678b1d 100644
--- a/pl/math/sv_atan2_2u5.c
+++ b/math/aarch64/sve/atan2.c
@@ -1,14 +1,14 @@
/*
* Double-precision vector atan2(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f64.h"
static const struct data
{
@@ -27,9 +27,6 @@ static const struct data
.pi_over_2 = 0x1.921fb54442d18p+0,
};
-/* Useful constants. */
-#define SignMask sv_u64 (0x8000000000000000)
-
/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
static svfloat64_t NOINLINE
special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
@@ -51,7 +48,8 @@ zeroinfnan (svuint64_t i, const svbool_t pg)
x are reasonably close together. The greatest observed error is 2.28 ULP:
_ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732)
got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */
-svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
+svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x,
+ const svbool_t pg)
{
const struct data *data_ptr = ptr_barrier (&data);
@@ -62,14 +60,15 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
svbool_t cmp_y = zeroinfnan (iy, pg);
svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
- svuint64_t sign_x = svand_x (pg, ix, SignMask);
- svuint64_t sign_y = svand_x (pg, iy, SignMask);
- svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
-
svfloat64_t ax = svabs_x (pg, x);
svfloat64_t ay = svabs_x (pg, y);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t iay = svreinterpret_u64 (ay);
+
+ svuint64_t sign_x = sveor_x (pg, ix, iax);
+ svuint64_t sign_y = sveor_x (pg, iy, iay);
+ svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
- svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
/* Set up z for call to atan. */
@@ -78,8 +77,9 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
svfloat64_t z = svdiv_x (pg, n, d);
/* Work out the correct shift. */
- svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
- shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+ svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1));
+ shift = svsel (pred_aygtax, sv_f64 (1.0), shift);
+ shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift)));
shift = svmul_x (pg, shift, data_ptr->pi_over_2);
/* Use split Estrin scheme for P(z^2) with deg(P)=19. */
@@ -99,18 +99,20 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
ret = svadd_m (pg, ret, shift);
/* Account for the sign of x and y. */
- ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
-
if (unlikely (svptest_any (pg, cmp_xy)))
- return special_case (y, x, ret, cmp_xy);
-
- return ret;
+ return special_case (
+ y, x,
+ svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)),
+ cmp_xy);
+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
}
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
-PL_SIG (SV, D, 2, atan2)
-PL_TEST_ULP (SV_NAME_D2 (atan2), 1.78)
-PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 100, inf, 40000)
-PL_TEST_INTERVAL (SV_NAME_D2 (atan2), -0, -inf, 40000)
+TEST_SIG (SV, D, 2, atan2)
+TEST_ULP (SV_NAME_D2 (atan2), 1.78)
+TEST_DISABLE_FENV (SV_NAME_D2 (atan2))
+TEST_INTERVAL (SV_NAME_D2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (SV_NAME_D2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (SV_NAME_D2 (atan2), 100, inf, 40000)
+TEST_INTERVAL (SV_NAME_D2 (atan2), -0, -inf, 40000)
+CLOSE_SVE_ATTR
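
The new shift computation replaces the compare-and-select on x < 0 with bit arithmetic: the sign bit of x shifted right by one is exactly the bit pattern of 2.0. A scalar sketch of the trick (for exposition only; the helper names are made up):

#include <stdint.h>
#include <string.h>

static uint64_t
asu64 (double d)
{
  uint64_t u;
  memcpy (&u, &d, sizeof u);
  return u;
}

static double
asf64 (uint64_t u)
{
  double d;
  memcpy (&d, &u, sizeof d);
  return d;
}

/* sign_x is 0x8000000000000000 when x < 0 and 0 otherwise.  Shifted right
   by one it is the bit pattern of 2.0 (or of 0.0); OR-ing the sign back in
   yields -2/-1 for negative x and 0/1 otherwise, exactly the multiples of
   pi/2 the old compare-and-select produced.  */
static double
quadrant_shift (uint64_t sign_x, int ay_gt_ax)
{
  double shift = asf64 (sign_x >> 1);
  if (ay_gt_ax)
    shift = 1.0;
  return asf64 (asu64 (shift) | sign_x);
}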
diff --git a/pl/math/sv_atan2f_3u.c b/math/aarch64/sve/atan2f.c
index 9ff73ecb74ba..563b708cfcbb 100644
--- a/pl/math/sv_atan2f_3u.c
+++ b/math/aarch64/sve/atan2f.c
@@ -1,14 +1,14 @@
/*
* Single-precision vector atan2f(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
static const struct data
{
@@ -22,10 +22,8 @@ static const struct data
.pi_over_2 = 0x1.921fb6p+0f,
};
-#define SignMask sv_u32 (0x80000000)
-
/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
-static inline svfloat32_t
+static svfloat32_t NOINLINE
special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
const svbool_t cmp)
{
@@ -46,7 +44,8 @@ zeroinfnan (svuint32_t i, const svbool_t pg)
observed error is 2.95 ULP:
_ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
want 0x1.967f00p-1. */
-svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
+svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x,
+ const svbool_t pg)
{
const struct data *data_ptr = ptr_barrier (&data);
@@ -57,14 +56,15 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
svbool_t cmp_y = zeroinfnan (iy, pg);
svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
- svuint32_t sign_x = svand_x (pg, ix, SignMask);
- svuint32_t sign_y = svand_x (pg, iy, SignMask);
- svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
-
svfloat32_t ax = svabs_x (pg, x);
svfloat32_t ay = svabs_x (pg, y);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t iay = svreinterpret_u32 (ay);
+
+ svuint32_t sign_x = sveor_x (pg, ix, iax);
+ svuint32_t sign_y = sveor_x (pg, iy, iay);
+ svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
- svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
/* Set up z for call to atan. */
@@ -73,11 +73,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
svfloat32_t z = svdiv_x (pg, n, d);
/* Work out the correct shift. */
- svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
- shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+ svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1));
+ shift = svsel (pred_aygtax, sv_f32 (1.0), shift);
+ shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift)));
shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
- /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
+ /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */
svfloat32_t z2 = svmul_x (pg, z, z);
svfloat32_t z4 = svmul_x (pg, z2, z2);
svfloat32_t z8 = svmul_x (pg, z4, z4);
@@ -91,18 +92,22 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
ret = svadd_m (pg, ret, shift);
/* Account for the sign of x and y. */
- ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
if (unlikely (svptest_any (pg, cmp_xy)))
- return special_case (y, x, ret, cmp_xy);
+ return special_case (
+ y, x,
+ svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)),
+ cmp_xy);
- return ret;
+ return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
}
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
-PL_SIG (SV, F, 2, atan2)
-PL_TEST_ULP (SV_NAME_F2 (atan2), 2.45)
-PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 100, inf, 40000)
-PL_TEST_INTERVAL (SV_NAME_F2 (atan2), -0, -inf, 40000)
+TEST_SIG (SV, F, 2, atan2)
+TEST_ULP (SV_NAME_F2 (atan2), 2.45)
+TEST_DISABLE_FENV (SV_NAME_F2 (atan2))
+TEST_INTERVAL (SV_NAME_F2 (atan2), 0.0, 1.0, 40000)
+TEST_INTERVAL (SV_NAME_F2 (atan2), 1.0, 100.0, 40000)
+TEST_INTERVAL (SV_NAME_F2 (atan2), 100, inf, 40000)
+TEST_INTERVAL (SV_NAME_F2 (atan2), -0, -inf, 40000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_atanf_2u9.c b/math/aarch64/sve/atanf.c
index 4defb356e7f9..a2cd37b12744 100644
--- a/pl/math/sv_atanf_2u9.c
+++ b/math/aarch64/sve/atanf.c
@@ -1,14 +1,14 @@
/*
* Single-precision vector atan(x) function.
*
- * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
static const struct data
{
@@ -68,9 +68,11 @@ svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg)
return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
}
-PL_SIG (SV, F, 1, atan, -3.1, 3.1)
-PL_TEST_ULP (SV_NAME_F1 (atan), 2.9)
-PL_TEST_INTERVAL (SV_NAME_F1 (atan), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (atan), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (atan), 100, inf, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (atan), -0, -inf, 40000)
+TEST_SIG (SV, F, 1, atan, -3.1, 3.1)
+TEST_ULP (SV_NAME_F1 (atan), 2.9)
+TEST_DISABLE_FENV (SV_NAME_F1 (atan))
+TEST_INTERVAL (SV_NAME_F1 (atan), 0.0, 1.0, 40000)
+TEST_INTERVAL (SV_NAME_F1 (atan), 1.0, 100.0, 40000)
+TEST_INTERVAL (SV_NAME_F1 (atan), 100, inf, 40000)
+TEST_INTERVAL (SV_NAME_F1 (atan), -0, -inf, 40000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_atanh_3u3.c b/math/aarch64/sve/atanh.c
index dcc9350b4962..b404df56fd75 100644
--- a/pl/math/sv_atanh_3u3.c
+++ b/math/aarch64/sve/atanh.c
@@ -1,13 +1,13 @@
/*
* Double-precision SVE atanh(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define WANT_SV_LOG1P_K0_SHORTCUT 0
#include "sv_log1p_inline.h"
@@ -34,7 +34,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
/* It is special if iax >= 1. */
-// svbool_t special = svcmpge (pg, iax, One);
svbool_t special = svacge (pg, x, 1.0);
/* Computation is performed based on the following sequence of equality:
@@ -50,11 +49,14 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
return svmul_x (pg, halfsign, y);
}
-PL_SIG (SV, D, 1, atanh, -1.0, 1.0)
-PL_TEST_ULP (SV_NAME_D1 (atanh), 3.32)
+TEST_SIG (SV, D, 1, atanh, -1.0, 1.0)
+TEST_ULP (SV_NAME_D1 (atanh), 3.32)
+TEST_DISABLE_FENV (SV_NAME_D1 (atanh))
+TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 0, 0x1p-23, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 0x1p-23, 1, 90000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 1, inf, 100)
/* atanh is asymptotic at 1, which is the default control value - have to set
- -c 0 specially to ensure fp exceptions are triggered correctly (choice of
- control lane is irrelevant if fp exceptions are disabled). */
-PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
-PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
-PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 1, inf, 100, 0)
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+TEST_CONTROL_VALUE (SV_NAME_D1 (atanh), 0)
+CLOSE_SVE_ATTR
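
The "sequence of equality" the routine refers to is the usual log1p formulation of atanh, sketched here in scalar C (for exposition only, not part of this patch):

#include <math.h>

/* atanh(x) = 0.5 * log ((1 + x) / (1 - x))
            = sign(x) * 0.5 * log1p (2*|x| / (1 - |x|)).  */
static double
atanh_sketch (double x)
{
  double ax = fabs (x);
  double y = 0.5 * log1p (2 * ax / (1 - ax));
  return copysign (y, x);
}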
diff --git a/pl/math/sv_atanhf_2u8.c b/math/aarch64/sve/atanhf.c
index 413c60ce05da..2e10a8cd22f7 100644
--- a/pl/math/sv_atanhf_2u8.c
+++ b/math/aarch64/sve/atanhf.c
@@ -1,14 +1,13 @@
/*
* Single-precision vector atanh(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#include "sv_log1pf_inline.h"
@@ -16,15 +15,18 @@
#define Half (0x3f000000)
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign,
+ svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svmul_x (svptrue_b32 (), halfsign, y);
return sv_call_f32 (atanhf, x, y, special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 2.28 ULP:
- _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
- want 0x1.ffbbb6p-5. */
+ The maximum error is 1.99 ULP:
+ _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5
+ want 0x1.f1f4f6p-5. */
svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -41,16 +43,19 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
y = sv_log1pf_inline (y, pg);
if (unlikely (svptest_any (pg, special)))
- return special_case (x, svmul_x (pg, halfsign, y), special);
+ return special_case (iax, sign, halfsign, y, special);
return svmul_x (pg, halfsign, y);
}
-PL_SIG (SV, F, 1, atanh, -1.0, 1.0)
-PL_TEST_ULP (SV_NAME_F1 (atanh), 2.59)
+TEST_SIG (SV, F, 1, atanh, -1.0, 1.0)
+TEST_ULP (SV_NAME_F1 (atanh), 1.50)
+TEST_DISABLE_FENV (SV_NAME_F1 (atanh))
+TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 0, 0x1p-12, 1000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 0x1p-12, 1, 20000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 1, inf, 1000)
/* atanh is asymptotic at 1, which is the default control value - have to set
-c 0 specially to ensure fp exceptions are triggered correctly (choice of
control lane is irrelevant if fp exceptions are disabled). */
-PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0, 0x1p-12, 1000, 0)
-PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0x1p-12, 1, 20000, 0)
-PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 1, inf, 1000, 0)
+TEST_CONTROL_VALUE (SV_NAME_F1 (atanh), 0)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_cbrt_2u.c b/math/aarch64/sve/cbrt.c
index 192f1cd80d59..3e6a972463f0 100644
--- a/pl/math/sv_cbrt_2u.c
+++ b/math/aarch64/sve/cbrt.c
@@ -1,14 +1,14 @@
/*
* Double-precision SVE cbrt(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f64.h"
const static struct data
{
@@ -48,10 +48,16 @@ shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i)
}
/* Approximation for double-precision vector cbrt(x), using low-order
- polynomial and two Newton iterations. Greatest observed error is 1.79 ULP.
- Errors repeat according to the exponent, for instance an error observed for
- double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i
- is an integer.
+ polynomial and two Newton iterations.
+
+ The vector version of frexp does not handle subnormals
+ correctly. As a result these need to be handled by the scalar
+ fallback, where accuracy may be worse than that of the vector code
+ path.
+
+ Greatest observed error in the normal range is 1.79 ULP. Errors repeat
+ according to the exponent, for instance an error observed for double value m
+ * 2^e will be observed for any input m * 2^(e + 3*i), where i is an integer.
_ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342
want 0x1.965f53b0e5d95p-342. */
svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg)
@@ -117,6 +123,13 @@ svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg)
return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
}
-PL_SIG (SV, D, 1, cbrt, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_D1 (cbrt), 1.30)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cbrt), 0, inf, 1000000)
+/* Worst-case ULP error assumes that the scalar fallback is GLIBC 2.40 cbrt,
+ which
+ has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error
+ in the vector path is 1.79 ULP.
+ [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
+ Functions in Single, Double, Double Extended, and Quadruple Precision. */
+TEST_SIG (SV, D, 1, cbrt, -10.0, 10.0)
+TEST_ULP (SV_NAME_D1 (cbrt), 3.17)
+TEST_DISABLE_FENV (SV_NAME_D1 (cbrt))
+TEST_SYM_INTERVAL (SV_NAME_D1 (cbrt), 0, inf, 1000000)
+CLOSE_SVE_ATTR
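
The "low-order polynomial and two Newton iterations" scheme reads as follows when stripped to scalar C (for exposition only; the initial guess and the 2^(e/3) scaling are simplified relative to the table-driven vector code, and only normal, nonzero inputs are handled):

#include <math.h>

static double
cbrt_sketch (double x)
{
  int e;
  double m = frexp (fabs (x), &e); /* |x| = m * 2^e, m in [0.5, 1).  */
  double y = 0.4 + 0.7 * m;        /* Crude initial guess.  */
  for (int i = 0; i < 2; i++)      /* Newton iterations for y^3 = m.  */
    y = (2.0 * y + m / (y * y)) / 3.0;
  int q = e / 3, r = e % 3;
  if (r < 0) /* Make the remainder non-negative.  */
    {
      r += 3;
      q--;
    }
  /* cbrt(|x|) = cbrt(m) * 2^(r/3) * 2^q; restore the sign of x.  */
  return copysign (ldexp (y * exp2 (r / 3.0), q), x);
}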
diff --git a/pl/math/sv_cbrtf_1u7.c b/math/aarch64/sve/cbrtf.c
index 5b625f308827..afdace7865f1 100644
--- a/pl/math/sv_cbrtf_1u7.c
+++ b/math/aarch64/sve/cbrtf.c
@@ -1,14 +1,14 @@
/*
* Single-precision SVE cbrt(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
const static struct data
{
@@ -111,6 +111,8 @@ svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg)
return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
}
-PL_SIG (SV, F, 1, cbrt, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_F1 (cbrt), 1.15)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cbrt), 0, inf, 1000000)
+TEST_SIG (SV, F, 1, cbrt, -10.0, 10.0)
+TEST_ULP (SV_NAME_F1 (cbrt), 1.15)
+TEST_DISABLE_FENV (SV_NAME_F1 (cbrt))
+TEST_SYM_INTERVAL (SV_NAME_F1 (cbrt), 0, inf, 1000000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_cexpi_3u5.c b/math/aarch64/sve/cexpi.c
index 920acfea5da0..0ccd110484c8 100644
--- a/pl/math/sv_cexpi_3u5.c
+++ b/math/aarch64/sve/cexpi.c
@@ -1,13 +1,13 @@
/*
* Double-precision vector cexpi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "sv_sincos_common.h"
#include "sv_math.h"
-#include "pl_test.h"
+#include "sv_sincos_common.h"
+#include "test_defs.h"
static svfloat64x2_t NOINLINE
special_case (svfloat64_t x, svbool_t special, svfloat64x2_t y)
@@ -34,12 +34,15 @@ _ZGVsMxv_cexpi (svfloat64_t x, svbool_t pg)
return sc;
}
-PL_TEST_ULP (_ZGVsMxv_cexpi_sin, 2.73)
-PL_TEST_ULP (_ZGVsMxv_cexpi_cos, 2.73)
+TEST_DISABLE_FENV (_ZGVsMxv_cexpi_sin)
+TEST_DISABLE_FENV (_ZGVsMxv_cexpi_cos)
+TEST_ULP (_ZGVsMxv_cexpi_sin, 2.73)
+TEST_ULP (_ZGVsMxv_cexpi_cos, 2.73)
#define SV_CEXPI_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVsMxv_cexpi_sin, lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVsMxv_cexpi_cos, lo, hi, n)
+ TEST_INTERVAL (_ZGVsMxv_cexpi_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVsMxv_cexpi_cos, lo, hi, n)
SV_CEXPI_INTERVAL (0, 0x1p23, 500000)
SV_CEXPI_INTERVAL (-0, -0x1p23, 500000)
SV_CEXPI_INTERVAL (0x1p23, inf, 10000)
SV_CEXPI_INTERVAL (-0x1p23, -inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_cexpif_1u8.c b/math/aarch64/sve/cexpif.c
index 93f2f998cb38..fd07ce553cd8 100644
--- a/pl/math/sv_cexpif_1u8.c
+++ b/math/aarch64/sve/cexpif.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector cexpi function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "sv_sincosf_common.h"
#include "sv_math.h"
-#include "pl_test.h"
+#include "sv_sincosf_common.h"
+#include "test_defs.h"
static svfloat32x2_t NOINLINE
special_case (svfloat32_t x, svbool_t special, svfloat32x2_t y)
@@ -36,12 +36,15 @@ _ZGVsMxv_cexpif (svfloat32_t x, svbool_t pg)
return sc;
}
-PL_TEST_ULP (_ZGVsMxv_cexpif_sin, 1.17)
-PL_TEST_ULP (_ZGVsMxv_cexpif_cos, 1.31)
+TEST_DISABLE_FENV (_ZGVsMxv_cexpif_sin)
+TEST_DISABLE_FENV (_ZGVsMxv_cexpif_cos)
+TEST_ULP (_ZGVsMxv_cexpif_sin, 1.17)
+TEST_ULP (_ZGVsMxv_cexpif_cos, 1.31)
#define SV_CEXPIF_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVsMxv_cexpif_sin, lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVsMxv_cexpif_cos, lo, hi, n)
+ TEST_INTERVAL (_ZGVsMxv_cexpif_sin, lo, hi, n) \
+ TEST_INTERVAL (_ZGVsMxv_cexpif_cos, lo, hi, n)
SV_CEXPIF_INTERVAL (0, 0x1p20, 500000)
SV_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
SV_CEXPIF_INTERVAL (0x1p20, inf, 10000)
SV_CEXPIF_INTERVAL (-0x1p20, -inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_cos_2u5.c b/math/aarch64/sve/cos.c
index 76af3459b3f2..93e93674a98a 100644
--- a/pl/math/sv_cos_2u5.c
+++ b/math/aarch64/sve/cos.c
@@ -1,13 +1,13 @@
/*
* Double-precision SVE cos(x) function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -80,7 +80,9 @@ svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg)
return svmul_x (pg, f, y);
}
-PL_SIG (SV, D, 1, cos, -3.1, 3.1)
-PL_TEST_ULP (SV_NAME_D1 (cos), 1.61)
-PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0x1p-4, 0x1p4, 500000)
+TEST_SIG (SV, D, 1, cos, -3.1, 3.1)
+TEST_ULP (SV_NAME_D1 (cos), 1.61)
+TEST_DISABLE_FENV (SV_NAME_D1 (cos))
+TEST_INTERVAL (SV_NAME_D1 (cos), 0, 0xffff0000, 10000)
+TEST_INTERVAL (SV_NAME_D1 (cos), 0x1p-4, 0x1p4, 500000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_cosf_2u1.c b/math/aarch64/sve/cosf.c
index 4bdb0dd146bb..7d18f8c2ad21 100644
--- a/pl/math/sv_cosf_2u1.c
+++ b/math/aarch64/sve/cosf.c
@@ -1,13 +1,13 @@
/*
* Single-precision SVE cos(x) function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -74,7 +74,9 @@ svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg)
return svmul_x (pg, f, y);
}
-PL_SIG (SV, F, 1, cos, -3.1, 3.1)
-PL_TEST_ULP (SV_NAME_F1 (cos), 1.57)
-PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0x1p-4, 0x1p4, 500000)
+TEST_SIG (SV, F, 1, cos, -3.1, 3.1)
+TEST_ULP (SV_NAME_F1 (cos), 1.57)
+TEST_DISABLE_FENV (SV_NAME_F1 (cos))
+TEST_INTERVAL (SV_NAME_F1 (cos), 0, 0xffff0000, 10000)
+TEST_INTERVAL (SV_NAME_F1 (cos), 0x1p-4, 0x1p4, 500000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_cosh_2u.c b/math/aarch64/sve/cosh.c
index a6d743fb9b96..775854cfbe5a 100644
--- a/pl/math/sv_cosh_2u.c
+++ b/math/aarch64/sve/cosh.c
@@ -1,19 +1,19 @@
/*
* Double-precision SVE cosh(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2025, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
float64_t poly[3];
float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
- uint64_t index_mask, special_bound;
+ uint64_t special_bound;
} data = {
.poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
0x1.5555576a59599p-5, },
@@ -25,14 +25,16 @@ static const struct data
.shift = 0x1.8p+52,
.thres = 704.0,
- .index_mask = 0xff,
/* 0x1.6p9, above which exp overflows. */
.special_bound = 0x4086000000000000,
};
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
{
+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+ svfloat64_t y = svadd_x (pg, half_t, half_over_t);
return sv_call_f64 (cosh, x, y, special);
}
@@ -50,12 +52,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
svuint64_t u = svreinterpret_u64 (z);
svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- svuint64_t i = svand_x (pg, u, d->index_mask);
+ svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
y = svmla_x (pg, sv_f64 (1.0), r, y);
- y = svmul_x (pg, r, y);
+ y = svmul_x (svptrue_b64 (), r, y);
/* s = 2^(n/N). */
u = svld1_gather_index (pg, __v_exp_tail_data, i);
@@ -84,17 +86,19 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
/* Up to the point that exp overflows, we can use it to calculate cosh by
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
svfloat64_t t = exp_inline (ax, pg, d);
- svfloat64_t half_t = svmul_x (pg, t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
/* Fall back to scalar for any special cases. */
if (unlikely (svptest_any (pg, special)))
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+ return special_case (x, pg, t, special);
+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
return svadd_x (pg, half_t, half_over_t);
}
-PL_SIG (SV, D, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_D1 (cosh), 1.43)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
+TEST_SIG (SV, D, 1, cosh, -10.0, 10.0)
+TEST_ULP (SV_NAME_D1 (cosh), 1.43)
+TEST_DISABLE_FENV (SV_NAME_D1 (cosh))
+TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
+CLOSE_SVE_ATTR
\ No newline at end of file
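Moving the t/2 + 0.5/t assembly into special_case changes only where the identity is evaluated, not the identity itself: one exponential serves both halves, and svdivr_x (pg, t, 0.5) computes the reversed division 0.5 / t. A minimal scalar model, leaving out exp_inline's table lookup and the overflow threshold:

#include <math.h>

/* cosh(x) = exp(|x|)/2 + 1/(2*exp(|x|)); valid until exp overflows
   (|x| >= 0x1.6p9 = 704 in the kernel above). */
static double
cosh_sketch (double x)
{
  double t = exp (fabs (x));
  return 0.5 * t + 0.5 / t;
}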
diff --git a/math/aarch64/sve/coshf.c b/math/aarch64/sve/coshf.c
new file mode 100644
index 000000000000..b79fed2374b5
--- /dev/null
+++ b/math/aarch64/sve/coshf.c
@@ -0,0 +1,62 @@
+/*
+ * Single-precision SVE cosh(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_expf_inline.h"
+
+static const struct data
+{
+ struct sv_expf_data expf_consts;
+ float special_bound;
+} data = {
+ .expf_consts = SV_EXPF_DATA,
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .special_bound = 0x1.5a92d8p+6,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
+ svbool_t pg)
+{
+ return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
+ pg);
+}
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 2.77 ULP:
+ _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
+ want 0x1.e4594cp+2. */
+svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svbool_t special = svacge (pg, x, d->special_bound);
+
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
+ Note that x is passed to exp here, rather than |x|. This avoids using a
+ destructive unary ABS, which gives better register usage. However, it means the
+ routine is not exactly symmetrical, as the exp helper is slightly less
+ accurate in the negative range. */
+ svfloat32_t e = expf_inline (x, pg, &d->expf_consts);
+ svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
+ svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, half_e, half_over_e, special);
+
+ return svadd_x (svptrue_b32 (), half_e, half_over_e);
+}
+
+TEST_SIG (SV, F, 1, cosh, -10.0, 10.0)
+TEST_ULP (SV_NAME_F1 (cosh), 2.28)
+TEST_DISABLE_FENV (SV_NAME_F1 (cosh))
+TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1p-63, 100)
+TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_cospi_3u2.c b/math/aarch64/sve/cospi.c
index d80f899c41e4..9859dbe7a44c 100644
--- a/pl/math/sv_cospi_3u2.c
+++ b/math/aarch64/sve/cospi.c
@@ -1,15 +1,15 @@
/*
* Double-precision SVE cospi(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "mathlib.h"
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f64.h"
+#include "mathlib.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f64.h"
static const struct data
{
@@ -55,9 +55,12 @@ svfloat64_t SV_NAME_D1 (cospi) (svfloat64_t x, const svbool_t pg)
return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
}
-PL_SIG (SV, D, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (SV_NAME_D1 (cospi), 2.71)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p51, inf, 100000)
+#if WANT_TRIGPI_TESTS
+TEST_ULP (SV_NAME_D1 (cospi), 2.71)
+TEST_DISABLE_FENV (SV_NAME_D1 (cospi))
+TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p51, inf, 100000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_cospif_2u6.c b/math/aarch64/sve/cospif.c
index fb2922d0533a..d65a2b619023 100644
--- a/pl/math/sv_cospif_2u6.c
+++ b/math/aarch64/sve/cospif.c
@@ -1,15 +1,15 @@
/*
* Single-precision SVE cospi(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "mathlib.h"
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f32.h"
+#include "mathlib.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
static const struct data
{
@@ -51,9 +51,12 @@ svfloat32_t SV_NAME_F1 (cospi) (svfloat32_t x, const svbool_t pg)
return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
}
-PL_SIG (SV, F, 1, cospi, -0.9, 0.9)
-PL_TEST_ULP (SV_NAME_F1 (cospi), 2.08)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0.5, 0x1p31f, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p31f, inf, 10000)
+#if WANT_TRIGPI_TESTS
+TEST_ULP (SV_NAME_F1 (cospi), 2.08)
+TEST_DISABLE_FENV (SV_NAME_F1 (cospi))
+TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0.5, 0x1p31f, 10000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p31f, inf, 10000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_erf_2u5.c b/math/aarch64/sve/erf.c
index cbf9718e5bb0..ccade93e1033 100644
--- a/pl/math/sv_erf_2u5.c
+++ b/math/aarch64/sve/erf.c
@@ -1,13 +1,13 @@
/*
* Double-precision vector erf(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -57,14 +57,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
svfloat64_t a = svabs_x (pg, x);
svfloat64_t shift = sv_f64 (dat->shift);
svfloat64_t z = svadd_x (pg, a, shift);
- svuint64_t i
- = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift));
+ svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff);
+ i = svadd_x (pg, i, i);
/* Lookup without shortcut for small values but with predicate to avoid
segfault for large values and NaNs. */
svfloat64_t r = svsub_x (pg, z, shift);
- svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i);
- svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i);
+ svfloat64_t erfr
+ = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i);
+ svfloat64_t scale
+ = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i);
/* erf(x) ~ erf(r) + scale * d * poly (r, d). */
svfloat64_t d = svsub_x (pg, a, r);
@@ -104,8 +106,10 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
return svreinterpret_f64 (svorr_x (pg, sign, iy));
}
-PL_SIG (SV, D, 1, erf, -6.0, 6.0)
-PL_TEST_ULP (SV_NAME_D1 (erf), 1.79)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, 5.9921875, 40000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 5.9921875, inf, 40000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, inf, 4000)
+TEST_SIG (SV, D, 1, erf, -6.0, 6.0)
+TEST_ULP (SV_NAME_D1 (erf), 1.79)
+TEST_DISABLE_FENV (SV_NAME_D1 (erf))
+TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, 5.9921875, 40000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 5.9921875, inf, 40000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, inf, 4000)
+CLOSE_SVE_ATTR
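The reworked index computation recovers round(|x| * 128) straight from the mantissa of z = |x| + shift: mask with 0xfff, then double for the interleaved { erf, scale } pairs of __v_erf_data. In scalar terms the lookup-and-correct step looks like the sketch below. The 1/128 table spacing is inferred from the 5.9921875 bound above; the separate table pointers and the truncation of the polynomial to its leading term are assumptions.

#include <math.h>
#include <stdint.h>

/* erf(|x|) ~ erf(r) + scale(r) * d * P(r, d), with r the nearest multiple
   of 1/128 and d = |x| - r. P is cut to its leading term here. */
static double
erf_sketch (double x, const double *erf_tab, const double *scale_tab)
{
  double a = fabs (x);
  if (a >= 6.0)
    return copysign (1.0, x); /* saturated; stands in for the predicated lookup. */
  double r = round (a * 128.0) / 128.0;
  uint64_t i = (uint64_t) round (a * 128.0); /* index into 1/128-spaced table. */
  double d = a - r;
  return copysign (erf_tab[i] + scale_tab[i] * d, x);
}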
diff --git a/pl/math/sv_erfc_1u8.c b/math/aarch64/sve/erfc.c
index a91bef96f2e7..a85cacb1ae62 100644
--- a/pl/math/sv_erfc_1u8.c
+++ b/math/aarch64/sve/erfc.c
@@ -1,13 +1,13 @@
/*
* Double-precision vector erfc(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -93,7 +93,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
i = svadd_x (pg, i, i);
- const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr;
+ const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr;
svfloat64_t erfcr = svld1_gather_index (pg, p, i);
svfloat64_t scale = svld1_gather_index (pg, p + 1, i);
@@ -155,10 +155,12 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
return svmla_x (pg, off, fac, y);
}
-PL_SIG (SV, D, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (SV_NAME_D1 (erfc), 1.21)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erfc), 0.0, 0x1p-26, 40000)
-PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 28.0, inf, 40000)
-PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 6.0, -inf, 40000)
+TEST_SIG (SV, D, 1, erfc, -6.0, 28.0)
+TEST_ULP (SV_NAME_D1 (erfc), 1.21)
+TEST_DISABLE_FENV (SV_NAME_D1 (erfc))
+TEST_SYM_INTERVAL (SV_NAME_D1 (erfc), 0.0, 0x1p-26, 40000)
+TEST_INTERVAL (SV_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
+TEST_INTERVAL (SV_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
+TEST_INTERVAL (SV_NAME_D1 (erfc), 28.0, inf, 40000)
+TEST_INTERVAL (SV_NAME_D1 (erfc), 6.0, -inf, 40000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_erfcf_1u7.c b/math/aarch64/sve/erfcf.c
index cda8f0b3752e..936881332291 100644
--- a/pl/math/sv_erfcf_1u7.c
+++ b/math/aarch64/sve/erfcf.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector erfc(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -66,23 +66,23 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
- i = svmul_x (pg, i, 2);
- const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr;
+ i = svlsl_x (svptrue_b32 (), i, 1);
+ const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
svfloat32_t erfcr = svld1_gather_index (pg, p, i);
svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
/* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
svfloat32_t r = svsub_x (pg, z, shift);
svfloat32_t d = svsub_x (pg, a, r);
- svfloat32_t d2 = svmul_x (pg, d, d);
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
- svfloat32_t third = svdup_lane (coeffs, 0);
svfloat32_t p1 = r;
- svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
- svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+ svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1);
+ svfloat32_t p3
+ = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
@@ -102,10 +102,12 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
return svmla_x (pg, off, fac, y);
}
-PL_SIG (SV, F, 1, erfc, -4.0, 10.0)
-PL_TEST_ULP (SV_NAME_F1 (erfc), 1.14)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erfc), 0.0, 0x1p-26, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 10.0625, inf, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -4.0, -inf, 40000)
+TEST_SIG (SV, F, 1, erfc, -4.0, 10.0)
+TEST_ULP (SV_NAME_F1 (erfc), 1.14)
+TEST_DISABLE_FENV (SV_NAME_F1 (erfc))
+TEST_SYM_INTERVAL (SV_NAME_F1 (erfc), 0.0, 0x1p-26, 40000)
+TEST_INTERVAL (SV_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
+TEST_INTERVAL (SV_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
+TEST_INTERVAL (SV_NAME_F1 (erfc), 10.0625, inf, 40000)
+TEST_INTERVAL (SV_NAME_F1 (erfc), -4.0, -inf, 40000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_erff_2u.c b/math/aarch64/sve/erff.c
index adeee798ee2e..c8c87499a63f 100644
--- a/pl/math/sv_erff_2u.c
+++ b/math/aarch64/sve/erff.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector erf(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -52,18 +52,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)
svfloat32_t shift = sv_f32 (dat->shift);
svfloat32_t z = svadd_x (pg, a, shift);
- svuint32_t i
- = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift));
-
- /* Saturate lookup index. */
- i = svsel (a_ge_max, sv_u32 (512), i);
+ svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff);
+ i = svadd_x (pg, i, i);
/* r and erf(r) set to 0 for |x| below min. */
svfloat32_t r = svsub_z (a_gt_min, z, shift);
- svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i);
+ svfloat32_t erfr
+ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i);
/* scale set to 2/sqrt(pi) for |x| below min. */
- svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i);
+ svfloat32_t scale
+ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i);
scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));
/* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */
@@ -82,9 +81,11 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)
return svreinterpret_f32 (svorr_x (pg, sign, iy));
}
-PL_SIG (SV, F, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (SV_NAME_F1 (erf), 1.43)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, 0x1.cp-7, 40000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0x1.cp-7, 3.9375, 40000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 3.9375, inf, 40000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, inf, 4000)
+TEST_SIG (SV, F, 1, erf, -4.0, 4.0)
+TEST_ULP (SV_NAME_F1 (erf), 1.43)
+TEST_DISABLE_FENV (SV_NAME_F1 (erf))
+TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, 0x1.cp-7, 40000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0x1.cp-7, 3.9375, 40000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 3.9375, inf, 40000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, inf, 4000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_exp_1u5.c b/math/aarch64/sve/exp.c
index c187def9e625..b021e64ffedf 100644
--- a/pl/math/sv_exp_1u5.c
+++ b/math/aarch64/sve/exp.c
@@ -1,22 +1,25 @@
/*
* Double-precision vector e^x function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2025, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
- double poly[4];
+ double c0, c2;
+ double c1, c3;
double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+
} data = {
- .poly = { /* ulp error: 0.53. */
- 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
- 0x1.1111266d28935p-7 },
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
.ln2_hi = 0x1.62e42fefa3800p-1,
.ln2_lo = 0x1.ef35793c76730p-45,
/* 1/ln2. */
@@ -26,7 +29,6 @@ static const struct data
.thres = 704.0,
};
-#define C(i) sv_f64 (d->poly[i])
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
@@ -46,20 +48,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
svuint64_t b
= svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
- /* Set s1 to generate overflow depending on sign of exponent n. */
- svfloat64_t s1 = svreinterpret_f64 (
- svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */
- /* Offset s to avoid overflow in final result if n is below threshold. */
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ i.e. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ i.e. s2 = as_u64 (s) - 0x3010...0 + b. */
svfloat64_t s2 = svreinterpret_f64 (
- svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
- b)); /* as_u64 (s) - 0x3010...0 + b. */
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, 1280.0);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -93,16 +95,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
svuint64_t u = svreinterpret_u64 (z);
svfloat64_t n = svsub_x (pg, z, d->shift);
-
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
/* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
svfloat64_t r = svmls_lane (x, n, ln2, 0);
r = svmls_lane (r, n, ln2, 1);
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
- svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
svfloat64_t y = svmla_x (pg, r, p04, r2);
@@ -129,9 +131,11 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
return svmla_x (pg, s, s, y);
}
-PL_SIG (SV, D, 1, exp, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_D1 (exp), 1.46)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0, 0x1p-23, 40000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p-23, 1, 50000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 1, 0x1p23, 50000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p23, inf, 50000)
+TEST_SIG (SV, D, 1, exp, -9.9, 9.9)
+TEST_ULP (SV_NAME_D1 (exp), 1.46)
+TEST_DISABLE_FENV (SV_NAME_D1 (exp))
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0, 0x1p-23, 40000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p-23, 1, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 1, 0x1p23, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p23, inf, 50000)
+CLOSE_SVE_ATTR
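The coefficient re-layout (c1/c3 in one quadword for svmla_lane) does not change the math: it is still n = round(x / ln2), r = x - n*ln2 with a hi/lo split, and a polynomial in r scaled by 2^n. A scalar model with ldexp in place of the kernel's bit-level scale construction, and the Taylor series standing in for the fitted coefficients:

#include <math.h>

/* exp(x) = 2^n * exp(r): n = round(x / ln2), r = x - n*ln2 (hi/lo split).
   The polynomial is the Taylor expansion of exp(r) - 1, truncated one
   term short of the kernel's degree-5 scheme. */
static double
exp_sketch (double x)
{
  const double inv_ln2 = 0x1.71547652b82fep+0;
  const double ln2_hi = 0x1.62e42fefa3800p-1;
  const double ln2_lo = 0x1.ef35793c76730p-45;
  double n = round (x * inv_ln2);
  double r = x - n * ln2_hi;
  r -= n * ln2_lo;
  double r2 = r * r;
  double y = r + r2 * (0.5 + r * (1.0 / 6 + r * (1.0 / 24)));
  return ldexp (1.0 + y, (int) n);
}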
diff --git a/pl/math/sv_exp10_1u5.c b/math/aarch64/sve/exp10.c
index 519693afcab0..3d6af334e155 100644
--- a/pl/math/sv_exp10_1u5.c
+++ b/math/aarch64/sve/exp10.c
@@ -1,28 +1,30 @@
/*
* Double-precision SVE 10^x function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2025, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define SpecialBound 307.0 /* floor (log10 (2^1023)). */
static const struct data
{
- double poly[5];
+ double c1, c3, c2, c4, c0;
double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
} data = {
/* Coefficients generated using Remez algorithm.
rel error: 0x1.9fcb9b3p-60
abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
max ulp err 0.52 +0.5. */
- .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1,
- 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 },
+ .c0 = 0x1.26bb1bbb55516p1,
+ .c1 = 0x1.53524c73cd32ap1,
+ .c2 = 0x1.0470591daeafbp1,
+ .c3 = 0x1.2bd77b1361ef6p0,
+ .c4 = 0x1.142b5d54e9621p-1,
/* 1.5*2^46+1023. This value is further explained below. */
.shift = 0x1.800000000ffc0p+46,
.log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */
@@ -60,9 +62,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, d->scale_thres);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -93,11 +95,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
comes at significant performance cost. */
svuint64_t u = svreinterpret_u64 (z);
svfloat64_t scale = svexpa (u);
-
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
/* Approximate exp10(r) using polynomial. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
- sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ svfloat64_t p14 = svmla_x (pg, p12, p34, r2);
+
+ svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14);
/* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound
multiplication may overflow, so use special case routine. */
@@ -116,7 +121,11 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
return svmla_x (pg, scale, scale, y);
}
-PL_SIG (SV, D, 1, exp10, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_D1 (exp10), 0.52)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 0, 307, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 307, inf, 1000)
+#if WANT_EXP10_TESTS
+TEST_SIG (SV, D, 1, exp10, -9.9, 9.9)
+TEST_ULP (SV_NAME_D1 (exp10), 0.52)
+TEST_DISABLE_FENV (SV_NAME_D1 (exp10))
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 0, SpecialBound, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), SpecialBound, inf, 1000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/exp10f.c b/math/aarch64/sve/exp10f.c
new file mode 100644
index 000000000000..8679df87702f
--- /dev/null
+++ b/math/aarch64/sve/exp10f.c
@@ -0,0 +1,101 @@
+/*
+ * Single-precision SVE 10^x function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
+
+/* For x < -Thres, the result is subnormal and not handled correctly by
+ FEXPA. */
+#define Thres 37.9
+
+static const struct data
+{
+ float log2_10_lo, c0, c2, c4;
+ float c1, c3, log10_2;
+ float shift, log2_10_hi, thres;
+} data = {
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error.
+ rel error: 0x1.89dafa3p-24
+ abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+ maxerr: 0.52 +0.5 ulp. */
+ .c0 = 0x1.26bb16p+1f,
+ .c1 = 0x1.5350d2p+1f,
+ .c2 = 0x1.04744ap+1f,
+ .c3 = 0x1.2d8176p+0f,
+ .c4 = 0x1.12b41ap-1f,
+ /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
+ .shift = 0x1.803f8p17f,
+ .log10_2 = 0x1.a934fp+1,
+ .log2_10_hi = 0x1.344136p-2,
+ .log2_10_lo = -0x1.ec10cp-27,
+ .thres = Thres,
+};
+
+static inline svfloat32_t
+sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
+{
+ /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
+ with poly(r) in [1/sqrt(2), sqrt(2)] and
+ x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
+
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
+
+ /* n = round(x/(log10(2)/N)). */
+ svfloat32_t shift = sv_f32 (d->shift);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
+
+ /* r = x - n*log10(2)/N. */
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);
+
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
+
+ /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+ as AdvSIMD exp10f.
+ Worst case error is 1.02 ULPs.
+ _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+ want 0x1.ba5f9cp-1. */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp10f_inline (x, pg, d);
+}
+
+#if WANT_EXP10_TESTS
+TEST_SIG (SV, F, 1, exp10, -9.9, 9.9)
+TEST_ULP (SV_NAME_F1 (exp10), 0.52)
+TEST_DISABLE_FENV (SV_NAME_F1 (exp10))
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), 0, Thres, 50000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), Thres, inf, 50000)
+#endif
+CLOSE_SVE_ATTR
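sv_exp10f_inline leans on FEXPA: z = x * log2(10) + shift makes svexpa produce the scale 2^round(x * log2(10)) directly, and the polynomial handles the residual 10^r. (Note the field names in the data struct: log10_2 holds log2(10), while log2_10_hi/lo hold the hi/lo split of log10(2).) A scalar sketch with roundf and ldexpf standing in for the shift trick and FEXPA, coefficients copied from the data struct above:

#include <math.h>

/* exp10(x) = 2^n * exp10(r), n = round(x * log2(10)), r = x - n * log10(2). */
static float
exp10f_sketch (float x)
{
  const float log2_10 = 0x1.a934fp+1f;
  const float log10_2_hi = 0x1.344136p-2f;
  const float log10_2_lo = -0x1.ec10cp-27f;
  float n = roundf (x * log2_10);
  float r = x - n * log10_2_hi;
  r -= n * log10_2_lo;
  /* poly(r) ~ exp10(r) - 1 on [-log10(2)/2, log10(2)/2]. */
  float p = r * (0x1.26bb16p+1f
                 + r * (0x1.5350d2p+1f
                        + r * (0x1.04744ap+1f
                               + r * (0x1.2d8176p+0f + r * 0x1.12b41ap-1f))));
  return ldexpf (1.0f + p, (int) n);
}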
diff --git a/pl/math/sv_exp2_2u.c b/math/aarch64/sve/exp2.c
index dcbca8adddd1..adbe40c648ac 100644
--- a/pl/math/sv_exp2_2u.c
+++ b/math/aarch64/sve/exp2.c
@@ -1,14 +1,13 @@
/*
* Double-precision SVE 2^x function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2025, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define N (1 << V_EXP_TABLE_BITS)
@@ -17,15 +16,15 @@
static const struct data
{
- double poly[4];
+ double c0, c2;
+ double c1, c3;
double shift, big_bound, uoflow_bound;
} data = {
/* Coefficients are computed using Remez algorithm with
minimisation of the absolute error. */
- .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
- 0x1.3b2abf5571ad8p-7 },
- .shift = 0x1.8p52 / N,
- .uoflow_bound = UOFlowBound,
+ .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
+ .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
+ .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
.big_bound = BigBound,
};
@@ -57,9 +56,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -89,19 +88,24 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
/* Approximate exp2(r) using polynomial. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
- svfloat64_t y = svmul_x (pg, r, p);
-
+ /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p = svmla_x (pg, p01, p23, r2);
+ svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
/* Assemble exp2(x) = exp2(r) * scale. */
if (unlikely (svptest_any (pg, special)))
return special_case (pg, scale, y, kd, d);
return svmla_x (pg, scale, scale, y);
}
-PL_SIG (SV, D, 1, exp2, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_D1 (exp2), 1.15)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000)
+TEST_SIG (SV, D, 1, exp2, -9.9, 9.9)
+TEST_ULP (SV_NAME_D1 (exp2), 1.15)
+TEST_DISABLE_FENV (SV_NAME_D1 (exp2))
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/exp2f.c b/math/aarch64/sve/exp2f.c
new file mode 100644
index 000000000000..f4c1d0ae607e
--- /dev/null
+++ b/math/aarch64/sve/exp2f.c
@@ -0,0 +1,83 @@
+/*
+ * Single-precision SVE 2^x function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define Thres 0x1.5d5e2ap+6f
+
+static const struct data
+{
+ float c0, c2, c4, c1, c3;
+ float shift, thres;
+} data = {
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */
+ .c0 = 0x1.62e422p-1f,
+ .c1 = 0x1.ebf9bcp-3f,
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = 0x1.3ce9e4p-7f,
+ .c4 = 0x1.59977ap-10f,
+ /* 1.5*2^17 + 127. */
+ .shift = 0x1.803f8p17f,
+ /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+ .thres = Thres,
+};
+
+static inline svfloat32_t
+sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
+{
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift);
+ svfloat32_t r = svsub_x (svptrue_b32 (), x, n);
+
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
+
+ /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
+ Evaluate the polynomial using a hybrid scheme - offset ESTRIN by 1 for
+ coefficients 1 to 4, and apply the most significant coefficient directly. */
+ svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
+ svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
+ svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+ as AdvSIMD exp2f.
+ Worst case error is 1.04 ULPs.
+ _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
+ want 0x1.ba6a64p-1. */
+svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp2f_inline (x, pg, d);
+}
+
+TEST_SIG (SV, F, 1, exp2, -9.9, 9.9)
+TEST_ULP (SV_NAME_F1 (exp2), 0.54)
+TEST_DISABLE_FENV (SV_NAME_F1 (exp2))
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 50000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp2), Thres, inf, 50000)
+CLOSE_SVE_ATTR
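Both FEXPA kernels rely on the same shift idiom: under round-to-nearest, adding and then subtracting a large constant rounds x to the nearest integer, while the sum's low mantissa bits hold n + 127 for FEXPA to consume. The rounding half of the idiom in scalar form - S here is the classic 1.5 * 2^23 for floats, whereas the kernels use 1.5 * 2^17 + 127 so the bits land where FEXPA reads them:

/* (x + S) - S rounds x to the nearest integer when S = 1.5 * 2^23 and
   |x| < 2^22: adding S pushes all fraction bits out of the float mantissa.
   volatile guards against reassociation under fast-math flags. */
static float
round_by_shift (float x)
{
  volatile float z = x + 0x1.8p23f;
  return z - 0x1.8p23f;
}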
diff --git a/math/aarch64/sve/expf.c b/math/aarch64/sve/expf.c
new file mode 100644
index 000000000000..11528abdbbaf
--- /dev/null
+++ b/math/aarch64/sve/expf.c
@@ -0,0 +1,50 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_expf_inline.h"
+
+/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.5d5e2ap+6f
+
+static const struct data
+{
+ struct sv_expf_data d;
+ float thres;
+} data = {
+ .d = SV_EXPF_DATA,
+ .thres = Thres,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
+{
+ return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special);
+}
+
+/* Optimised single-precision SVE exp function.
+ Worst-case error is 1.04 ulp:
+ SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
+ want 0x1.ba74bap+4. */
+svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t is_special_case = svacgt (pg, x, d->thres);
+ if (unlikely (svptest_any (pg, is_special_case)))
+ return special_case (x, is_special_case, &d->d);
+ return expf_inline (x, pg, &d->d);
+}
+
+TEST_SIG (SV, F, 1, exp, -9.9, 9.9)
+TEST_ULP (SV_NAME_F1 (exp), 0.55)
+TEST_DISABLE_FENV (SV_NAME_F1 (exp))
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0, Thres, 50000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp), Thres, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_expm1_2u5.c b/math/aarch64/sve/expm1.c
index 82a31f6d9c0e..f4fb8cb982f0 100644
--- a/pl/math/sv_expm1_2u5.c
+++ b/math/aarch64/sve/expm1.c
@@ -1,14 +1,14 @@
/*
* Double-precision vector exp(x) - 1 function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "sv_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define SpecialBound 0x1.62b7d369a5aa9p+9
#define ExponentBias 0x3ff0000000000000
@@ -88,8 +88,10 @@ svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
return y;
}
-PL_SIG (SV, D, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_D1 (expm1), 1.68)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0, 0x1p-23, 1000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0x1p-23, SpecialBound, 200000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), SpecialBound, inf, 1000)
+TEST_SIG (SV, D, 1, expm1, -9.9, 9.9)
+TEST_ULP (SV_NAME_D1 (expm1), 1.68)
+TEST_DISABLE_FENV (SV_NAME_D1 (expm1))
+TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0x1p-23, SpecialBound, 200000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), SpecialBound, inf, 1000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_expm1f_1u6.c b/math/aarch64/sve/expm1f.c
index 0ec7c00f5300..95f7c09a403d 100644
--- a/pl/math/sv_expm1f_1u6.c
+++ b/math/aarch64/sve/expm1f.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector exp(x) - 1 function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
/* Largest value of x for which expm1(x) should round to -1. */
#define SpecialBound 0x1.5ebc4p+6f
@@ -17,20 +17,17 @@ static const struct data
/* These 4 are grouped together so they can be loaded as one quadword, then
used with _lane forms of svmla/svmls. */
float c2, c4, ln2_hi, ln2_lo;
- float c0, c1, c3, inv_ln2, special_bound, shift;
+ float c0, inv_ln2, c1, c3, special_bound;
} data = {
/* Generated using fpminimax. */
.c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
.c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
- .c4 = 0x1.6b55a2p-10,
+ .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f,
+ .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f,
+ .ln2_hi = 0x1.62e4p-1f,
- .special_bound = SpecialBound, .shift = 0x1.8p23f,
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
};
-#define C(i) sv_f32 (d->c##i)
-
static svfloat32_t NOINLINE
special_case (svfloat32_t x, svbool_t pg)
{
@@ -60,9 +57,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
- j = svsub_x (pg, j, d->shift);
- svint32_t i = svcvt_s32_x (pg, j);
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
+ j = svrinta_x (pg, j);
svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
f = svmls_lane (f, j, lane_constants, 3);
@@ -72,22 +68,24 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
- svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
svfloat32_t p = svmla_x (pg, p12, f2, p34);
- p = svmla_x (pg, C (0), f, p);
+
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
p = svmla_x (pg, f, f2, p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- svfloat32_t t = svreinterpret_f32 (
- svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}
-PL_SIG (SV, F, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_F1 (expm1), 1.02)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), 0, SpecialBound, 100000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), SpecialBound, inf, 1000)
+TEST_SIG (SV, F, 1, expm1, -9.9, 9.9)
+TEST_ULP (SV_NAME_F1 (expm1), 1.02)
+TEST_DISABLE_FENV (SV_NAME_F1 (expm1))
+TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), 0, SpecialBound, 100000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), SpecialBound, inf, 1000)
+CLOSE_SVE_ATTR
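The rework drops the shift-based rounding in favour of svrinta and builds t = 2^i with svscale rather than adding into the exponent field; assembling the result as t*p + (t - 1) keeps the final subtraction exact. A scalar model using rintf for svrinta and ldexpf for svscale, with the polynomial truncated to the first three coefficients of the data struct:

#include <math.h>

/* j = rint(x / ln2), f = x - j*ln2 (hi/lo),
   expm1(x) = 2^j * (expm1(f) + 1) - 1 = t*p + (t - 1). */
static float
expm1f_sketch (float x)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;
  float j = rintf (x * inv_ln2);
  float f = x - j * ln2_hi;
  f -= j * ln2_lo;
  /* p ~ expm1(f) on [-ln2/2, ln2/2]: f + f^2 * P(f), P truncated here. */
  float p = f + f * f
                    * (0x1.fffffep-2f
                       + f * (0x1.5554aep-3f + f * 0x1.555736p-5f));
  float t = ldexpf (1.0f, (int) j); /* t = 2^j, exact. */
  return t * p + (t - 1.0f);
}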
diff --git a/pl/math/sv_hypot_1u5.c b/math/aarch64/sve/hypot.c
index cf1590e4b9ab..2ed298623acc 100644
--- a/pl/math/sv_hypot_1u5.c
+++ b/math/aarch64/sve/hypot.c
@@ -1,13 +1,13 @@
/*
* Double-precision SVE hypot(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -43,9 +43,11 @@ svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg)
return svsqrt_x (pg, sqsum);
}
-PL_SIG (SV, D, 2, hypot, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_D2 (hypot), 0.71)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
+TEST_SIG (SV, D, 2, hypot, -10.0, 10.0)
+TEST_ULP (SV_NAME_D2 (hypot), 0.71)
+TEST_DISABLE_FENV (SV_NAME_D2 (hypot))
+TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_hypotf_1u5.c b/math/aarch64/sve/hypotf.c
index f428832b3dbc..b977b998986b 100644
--- a/pl/math/sv_hypotf_1u5.c
+++ b/math/aarch64/sve/hypotf.c
@@ -1,13 +1,13 @@
/*
* Single-precision SVE hypot(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
#define TinyBound 0x0c800000 /* asuint (0x1p-102). */
#define Thres 0x73000000 /* 0x70000000 - TinyBound. */
@@ -37,9 +37,11 @@ svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y,
return svsqrt_x (pg, sqsum);
}
-PL_SIG (SV, F, 2, hypot, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_F2 (hypot), 0.71)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
+TEST_SIG (SV, F, 2, hypot, -10.0, 10.0)
+TEST_ULP (SV_NAME_F2 (hypot), 0.71)
+TEST_DISABLE_FENV (SV_NAME_F2 (hypot))
+TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/log.c b/math/aarch64/sve/log.c
new file mode 100644
index 000000000000..c612df48c1fd
--- /dev/null
+++ b/math/aarch64/sve/log.c
@@ -0,0 +1,97 @@
+/*
+ * Double-precision SVE log(x) function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define Max (0x7ff0000000000000)
+#define Min (0x0010000000000000)
+#define Thresh (0x7fe0000000000000) /* Max - Min. */
+
+static const struct data
+{
+ double c0, c2;
+ double c1, c3;
+ double ln2, c4;
+ uint64_t off;
+} data = {
+ .c0 = -0x1.ffffffffffff7p-2,
+ .c1 = 0x1.55555555170d4p-2,
+ .c2 = -0x1.0000000399c27p-2,
+ .c3 = 0x1.999b2e90e94cap-3,
+ .c4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .off = 0x3fe6900900000000,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
+ svbool_t special, const struct data *d)
+{
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
+ return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
+}
+
+/* Double-precision SVE log routine.
+ Maximum measured error is 2.64 ulp:
+ SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6
+ want 0x1.fffffffe88cafp+6. */
+svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
+ /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
+ The actual value of i is double this due to table layout. */
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svfloat64_t z = svreinterpret_f64 (iz);
+ /* Lookup in 2 global lists (length N). */
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
+ /* hi = r + log(c) + k*Ln2. */
+ svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2);
+ svfloat64_t r = svmad_x (pg, invc, z, -1);
+ svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0);
+ hi = svadd_x (pg, r, hi);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+ y = svmla_lane_f64 (y, r2, ln2_and_c4, 1);
+ y = svmla_x (pg, p, r2, y);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (hi, tmp, y, r2, special, d);
+ return svmla_x (pg, hi, r2, y);
+}
+
+TEST_SIG (SV, D, 1, log, 0.01, 11.1)
+TEST_ULP (SV_NAME_D1 (log), 2.15)
+TEST_DISABLE_FENV (SV_NAME_D1 (log))
+TEST_INTERVAL (SV_NAME_D1 (log), -0.0, -inf, 1000)
+TEST_INTERVAL (SV_NAME_D1 (log), 0, 0x1p-149, 1000)
+TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log), 100, inf, 50000)
+CLOSE_SVE_ATTR
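The new log's bit manipulation is compact enough to model in scalar C: tmp = ix - off splits off a signed exponent k (arithmetic shift by 52) and mantissa bits that both index the table and rebuild z; the kernel doubles the index only because __v_log_data interleaves invc and logc. A sketch assuming V_LOG_TABLE_BITS == 7, with hypothetical separate table pointers and the correction polynomial truncated short of the kernel's degree-5 scheme:

#include <stdint.h>
#include <string.h>

#define LOG_TABLE_BITS 7 /* V_LOG_TABLE_BITS assumed to be 7. */
#define OFF 0x3fe6900900000000ULL

static double
log_sketch (double x, const double *invc_tab, const double *logc_tab)
{
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint64_t tmp = ix - OFF;
  int64_t k = (int64_t) tmp >> 52; /* arithmetic shift: signed exponent. */
  uint64_t i = (tmp >> (52 - LOG_TABLE_BITS)) & ((1 << LOG_TABLE_BITS) - 1);
  uint64_t iz = ix - (tmp & (0xfffULL << 52)); /* z = x * 2^-k. */
  double z;
  memcpy (&z, &iz, sizeof z);
  double r = z * invc_tab[i] - 1.0; /* z/c - 1, small. */
  double hi = r + logc_tab[i] + (double) k * 0x1.62e42fefa39efp-1;
  return hi + r * r * (-0.5 + r * (1.0 / 3)); /* truncated correction. */
}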
diff --git a/math/aarch64/sve/log10.c b/math/aarch64/sve/log10.c
new file mode 100644
index 000000000000..5af142d79f55
--- /dev/null
+++ b/math/aarch64/sve/log10.c
@@ -0,0 +1,101 @@
+/*
+ * Double-precision SVE log10(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define Min 0x0010000000000000
+#define Max 0x7ff0000000000000
+#define Thres 0x7fe0000000000000 /* Max - Min. */
+#define N (1 << V_LOG10_TABLE_BITS)
+
+static const struct data
+{
+ double c0, c2;
+ double c1, c3;
+ double invln10, log10_2;
+ double c4;
+ uint64_t off;
+} data = {
+ .c0 = -0x1.bcb7b1526e506p-3,
+ .c1 = 0x1.287a7636be1d1p-3,
+ .c2 = -0x1.bcb7b158af938p-4,
+ .c3 = 0x1.63c78734e6d07p-4,
+ .c4 = -0x1.287461742fee4p-4,
+ .invln10 = 0x1.bcb7b1526e50ep-2,
+ .log10_2 = 0x1.34413509f79ffp-2,
+ .off = 0x3fe6900900000000,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
+ svbool_t special, const struct data *d)
+{
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
+ return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
+}
+
+/* Double-precision SVE log10 routine.
+ Maximum measured error is 2.46 ulps.
+ SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
+ want 0x1.fffbdf6eaa667p-6. */
+svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
+ svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS);
+ i = svand_x (pg, i, (N - 1) << 1);
+ svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
+ svfloat64_t z = svreinterpret_f64 (
+ svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)));
+
+ /* log(x) = k*log(2) + log(c) + log(z/c). */
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log10_data.table[0].invc, i);
+ svfloat64_t logc
+ = svld1_gather_index (pg, &__v_log10_data.table[0].log10c, i);
+
+ /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1):
+ r = z/c - 1 (we look up precomputed 1/c)
+ log(z/c) ~= P(r). */
+ svfloat64_t r = svmad_x (pg, invc, z, -1.0);
+
+ /* hi = log(c) + k*log(2). */
+ svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10);
+ svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0);
+ svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+ y = svmla_x (pg, y, r2, d->c4);
+ y = svmla_x (pg, p, r2, y);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (hi, tmp, y, r2, special, d);
+ return svmla_x (pg, hi, r2, y);
+}
+
+TEST_SIG (SV, D, 1, log10, 0.01, 11.1)
+TEST_ULP (SV_NAME_D1 (log10), 1.97)
+TEST_DISABLE_FENV (SV_NAME_D1 (log10))
+TEST_INTERVAL (SV_NAME_D1 (log10), -0.0, -0x1p126, 100)
+TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log10), 100, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_log10f_3u5.c b/math/aarch64/sve/log10f.c
index a685b23e5de5..6c3add451761 100644
--- a/pl/math/sv_log10f_3u5.c
+++ b/math/aarch64/sve/log10f.c
@@ -1,19 +1,20 @@
/*
* Single-precision SVE log10 function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
float poly_0246[4];
float poly_1357[4];
float ln2, inv_ln10;
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -25,18 +26,23 @@ static const struct data
-0x1.0fc92cp-4f },
.ln2 = 0x1.62e43p-1f,
.inv_ln10 = 0x1.bcb7b2p-2f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after the offset has been
+ subtracted, so the lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min 0x00800000
-#define Max 0x7f800000
-#define Thres 0x7f000000 /* Max - Min. */
-#define Offset 0x3f2aaaab /* 0.666667. */
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
#define MantissaMask 0x007fffff
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log10f, x, y, special);
+ return sv_call_f32 (
+ log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE log10f using the same algorithm and
@@ -47,23 +53,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t ix = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- ix = svsub_x (pg, ix, Offset);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */
- ix = svand_x (pg, ix, MantissaMask);
- ix = svadd_x (pg, ix, Offset);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */
+ svuint32_t ix = svand_x (pg, u_off, MantissaMask);
+ ix = svadd_x (pg, ix, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);
/* y = log10(1+r) + n*log10(2)
log10(1+r) ~ r * InvLn(10) + P(r)
where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t r4 = svmul_x (pg, r2, r2);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
@@ -78,16 +86,17 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
hi = svmul_x (pg, hi, d->inv_ln10);
if (unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
- special);
- return svmla_x (pg, hi, r2, y);
+ return special_case (u_off, hi, r2, y, special);
+ return svmla_x (svptrue_b32 (), hi, r2, y);
}
-PL_SIG (SV, F, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (SV_NAME_F1 (log10), 2.82)
-PL_TEST_INTERVAL (SV_NAME_F1 (log10), -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log10), 1.0, 100, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log10), 100, inf, 50000)
+TEST_SIG (SV, F, 1, log10, 0.01, 11.1)
+TEST_ULP (SV_NAME_F1 (log10), 2.82)
+TEST_DISABLE_FENV (SV_NAME_F1 (log10))
+TEST_INTERVAL (SV_NAME_F1 (log10), -0.0, -0x1p126, 100)
+TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log10), 100, inf, 50000)
+CLOSE_SVE_ATTR
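
The .lower field above replaces the previous two-constant check. A scalar sketch (hypothetical names) of the single-compare window it implements:

#include <stdint.h>
#include <string.h>

#define OFF    0x3f2aaaabU           /* asuint32 (0.666667) */
#define LOWER  (0x00800000U - OFF)   /* smallest normal, pre-offset (wraps) */
#define THRESH 0x7f000000U           /* asuint32 (inf) - 0x00800000 */

/* One unsigned compare rejects zero, subnormals, negatives, inf and NaN:
   after the offset subtraction, all of them fall outside the window of
   positive normal inputs. */
static int
needs_special_case (float x)
{
  uint32_t u_off;
  memcpy (&u_off, &x, sizeof u_off);
  u_off -= OFF;
  return u_off - LOWER >= THRESH;
}
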
diff --git a/pl/math/sv_log1p_2u5.c b/math/aarch64/sve/log1p.c
index f178ab16238a..e6b895b52908 100644
--- a/pl/math/sv_log1p_2u5.c
+++ b/math/aarch64/sve/log1p.c
@@ -1,14 +1,14 @@
/*
* Double-precision SVE log(1+x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "sv_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -107,10 +107,12 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
return y;
}
-PL_SIG (SV, D, 1, log1p, -0.9, 10.0)
-PL_TEST_ULP (SV_NAME_D1 (log1p), 1.97)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.001, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log1p), 1, inf, 10000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log1p), -1, -inf, 10)
+TEST_SIG (SV, D, 1, log1p, -0.9, 10.0)
+TEST_ULP (SV_NAME_D1 (log1p), 1.97)
+TEST_DISABLE_FENV (SV_NAME_D1 (log1p))
+TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.001, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log1p), 1, inf, 10000)
+TEST_INTERVAL (SV_NAME_D1 (log1p), -1, -inf, 10)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/log1pf.c b/math/aarch64/sve/log1pf.c
new file mode 100644
index 000000000000..77ae6218f931
--- /dev/null
+++ b/math/aarch64/sve/log1pf.c
@@ -0,0 +1,43 @@
+/*
+ * Single-precision vector log(x + 1) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_log1pf_inline.h"
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special)
+{
+ return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()),
+ special);
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.27 ULP, very close to 0.5:
+ _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2
+ want 0x1.9f323ep-2. */
+svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
+{
+ /* x < -1, Inf/Nan. */
+ svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
+ special = svorn_z (pg, special, svcmpge (pg, x, -1));
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, special);
+
+ return sv_log1pf_inline (x, pg);
+}
+
+TEST_SIG (SV, F, 1, log1p, -0.9, 10.0)
+TEST_ULP (SV_NAME_F1 (log1p), 0.77)
+TEST_DISABLE_FENV (SV_NAME_F1 (log1p))
+TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0, 0x1p-23, 5000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0x1p-23, 1, 5000)
+TEST_INTERVAL (SV_NAME_F1 (log1p), 1, inf, 10000)
+TEST_INTERVAL (SV_NAME_F1 (log1p), -1, -inf, 10)
+CLOSE_SVE_ATTR
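
For reference, the special-case predicate above (svcmpeq followed by svorn_z) reduces to this scalar test, shown here as a sketch:

#include <stdint.h>
#include <string.h>

/* Special when the bits are exactly +inf, or when x >= -1 does not hold;
   the failed comparison also captures NaN. */
static int
log1pf_special (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u == 0x7f800000u || !(x >= -1.0f);
}
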
diff --git a/math/aarch64/sve/log2.c b/math/aarch64/sve/log2.c
new file mode 100644
index 000000000000..11c65c1b2963
--- /dev/null
+++ b/math/aarch64/sve/log2.c
@@ -0,0 +1,96 @@
+/*
+ * Double-precision SVE log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define N (1 << V_LOG2_TABLE_BITS)
+#define Max (0x7ff0000000000000)
+#define Min (0x0010000000000000)
+#define Thresh (0x7fe0000000000000) /* Max - Min. */
+
+static const struct data
+{
+ double c0, c2;
+ double c1, c3;
+ double invln2, c4;
+ uint64_t off;
+} data = {
+ .c0 = -0x1.71547652b83p-1,
+ .c1 = 0x1.ec709dc340953p-2,
+ .c2 = -0x1.71547651c8f35p-2,
+ .c3 = 0x1.2777ebe12dda5p-2,
+ .c4 = -0x1.ec738d616fe26p-3,
+ .invln2 = 0x1.71547652b82fep0,
+ .off = 0x3fe6900900000000,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
+ svbool_t special, const struct data *d)
+{
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
+ return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special);
+}
+
+/* Double-precision SVE log2 routine.
+ Implements the same algorithm as AdvSIMD log10, with coefficients and table
+ entries scaled in extended precision.
+ The maximum observed error is 2.58 ULP:
+ SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+ want 0x1.fffb34198d9ddp-5. */
+svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
+ svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
+ i = svand_x (pg, i, (N - 1) << 1);
+ svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
+ svfloat64_t z = svreinterpret_f64 (
+ svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)));
+
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i);
+ svfloat64_t log2c
+ = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i);
+
+ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
+
+ svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2);
+ svfloat64_t r = svmad_x (pg, invc, z, -1.0);
+ svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0);
+ w = svadd_x (pg, k, w);
+
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+ y = svmla_lane_f64 (y, r2, invln2_and_c4, 1);
+ y = svmla_x (pg, p, r2, y);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (w, tmp, y, r2, special, d);
+ return svmla_x (pg, w, r2, y);
+}
+
+TEST_SIG (SV, D, 1, log2, 0.01, 11.1)
+TEST_ULP (SV_NAME_D1 (log2), 2.09)
+TEST_DISABLE_FENV (SV_NAME_D1 (log2))
+TEST_INTERVAL (SV_NAME_D1 (log2), -0.0, -0x1p126, 1000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 0.0, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 100, inf, 50000)
+CLOSE_SVE_ATTR
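
The lane-indexed FMLAs above implement an even/odd coefficient split rather than plain Horner. A scalar sketch of the evaluation order (not the shipped code; c0..c4 are the coefficients from the data struct, and the caller then forms w + r2 * poly):

/* Pairing even and odd terms keeps the inner FMAs independent, shortening
   the dependency chain compared to a serial Horner evaluation. */
static double
log2_poly (double r, const double c[5])
{
  double r2 = r * r;
  double p01 = c[0] + r * c[1];
  double p23 = c[2] + r * c[3];
  double p234 = p23 + r2 * c[4];
  return p01 + r2 * p234;
}
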
diff --git a/pl/math/sv_log2f_2u5.c b/math/aarch64/sve/log2f.c
index 9e96c62bbcc6..312fd448226b 100644
--- a/pl/math/sv_log2f_2u5.c
+++ b/math/aarch64/sve/log2f.c
@@ -1,18 +1,19 @@
/*
* Single-precision vector/SVE log2 function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
float poly_02468[5];
float poly_1357[4];
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -22,18 +23,23 @@ static const struct data
},
.poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
0x1.9d8ecap-3f, 0x1.9e495p-3f },
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after the offset has been
+ subtracted, so the lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thres (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define MantissaMask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log2f, x, y, cmp);
+ return sv_call_f32 (
+ log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE log2f, using the same algorithm
@@ -45,19 +51,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, MantissaMask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+ svuint32_t u = svand_x (pg, u_off, MantissaMask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log2(1+r) + n. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* Evaluate polynomial using pairwise Horner scheme. */
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
@@ -71,16 +78,17 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
y = svmla_x (pg, q_01, r2, y);
if (unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
- return svmla_x (pg, n, r, y);
+ return special_case (u_off, n, r, y, special);
+ return svmla_x (svptrue_b32 (), n, r, y);
}
-PL_SIG (SV, F, 1, log2, 0.01, 11.1)
-PL_TEST_ULP (SV_NAME_F1 (log2), 1.99)
-PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_F1 (log2))
-PL_TEST_INTERVAL (SV_NAME_F1 (log2), -0.0, -0x1p126, 4000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0.0, 0x1p-126, 4000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log2), 1.0, 100, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log2), 100, inf, 50000)
+TEST_SIG (SV, F, 1, log2, 0.01, 11.1)
+TEST_ULP (SV_NAME_F1 (log2), 1.99)
+TEST_DISABLE_FENV (SV_NAME_F1 (log2))
+TEST_INTERVAL (SV_NAME_F1 (log2), -0.0, -0x1p126, 4000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 0.0, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 100, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_logf_3u4.c b/math/aarch64/sve/logf.c
index 967355247036..2898e36974d6 100644
--- a/pl/math/sv_logf_3u4.c
+++ b/math/aarch64/sve/logf.c
@@ -1,19 +1,20 @@
/*
* Single-precision vector log function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
float poly_0135[4];
float poly_246[3];
float ln2;
+ uint32_t off, lower;
} data = {
.poly_0135 = {
/* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
@@ -22,21 +23,24 @@ static const struct data
-0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
},
.poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
- .ln2 = 0x1.62e43p-1f
+ .ln2 = 0x1.62e43p-1f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use, subnormals are detected after the offset has been
+ subtracted, so the lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thresh (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */
-
-float optr_aor_log_f32 (float);
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (optr_aor_log_f32, x, y, cmp);
+ return sv_call_f32 (
+ logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE logf, using the same algorithm and
@@ -47,19 +51,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32 (x);
- svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, Mask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+
+ svuint32_t u = svand_x (pg, u_off, Mask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log(1+r) + n*ln2. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
@@ -72,15 +78,17 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
p = svmla_x (pg, r, n, d->ln2);
if (unlikely (svptest_any (pg, cmp)))
- return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+ return special_case (u_off, p, r2, y, cmp);
return svmla_x (pg, p, r2, y);
}
-PL_SIG (SV, F, 1, log, 0.01, 11.1)
-PL_TEST_ULP (SV_NAME_F1 (log), 2.85)
-PL_TEST_INTERVAL (SV_NAME_F1 (log), -0.0, -inf, 100)
-PL_TEST_INTERVAL (SV_NAME_F1 (log), 0, 0x1p-126, 100)
-PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log), 1.0, 100, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log), 100, inf, 50000)
+TEST_SIG (SV, F, 1, log, 0.01, 11.1)
+TEST_ULP (SV_NAME_F1 (log), 2.85)
+TEST_DISABLE_FENV (SV_NAME_F1 (log))
+TEST_INTERVAL (SV_NAME_F1 (log), -0.0, -inf, 100)
+TEST_INTERVAL (SV_NAME_F1 (log), 0, 0x1p-126, 100)
+TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log), 100, inf, 50000)
+CLOSE_SVE_ATTR
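
Each special_case above funnels flagged lanes through sv_call_f32. A sketch of that fallback pattern, modelled on the lane loop visible in sv_call_powf_sc further down; the helper name is hypothetical and the real implementation lives in sv_math.h:

#include <arm_sve.h>

/* Re-evaluate only the lanes selected by cmp with the scalar routine f,
   keeping the vector result y everywhere else. */
static svfloat32_t
call_lanewise (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp)
{
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      float elem = f (svclastb (p, 0.0f, x)); /* extract the active lane */
      y = svsel (p, svdup_f32 (elem), y);
      p = svpnext_b32 (cmp, p);
    }
  return y;
}
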
diff --git a/math/aarch64/sve/modf.c b/math/aarch64/sve/modf.c
new file mode 100644
index 000000000000..5944c7d37c4c
--- /dev/null
+++ b/math/aarch64/sve/modf.c
@@ -0,0 +1,36 @@
+/*
+ * Double-precision SVE modf(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modf algorithm. Produces exact values in all rounding modes. */
+svfloat64_t SV_NAME_D1_L1 (modf) (svfloat64_t x, double *out_int,
+ const svbool_t pg)
+{
+ /* Get integer component of x. */
+ svfloat64_t fint_comp = svrintz_x (pg, x);
+
+ svst1_f64 (pg, out_int, fint_comp);
+
+ /* Subtract integer component from input. */
+ svfloat64_t remaining = svsub_f64_x (svptrue_b64 (), x, fint_comp);
+
+ /* Return +0 for integer x. */
+ svbool_t is_integer = svcmpeq (pg, x, fint_comp);
+ return svsel (is_integer, sv_f64 (0), remaining);
+}
+
+TEST_ULP (_ZGVsMxvl8_modf_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVsMxvl8_modf_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_int, 1, inf, 20000)
+CLOSE_SVE_ATTR
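
A scalar sketch of the algorithm above; the final select is what makes the fractional part exact in every rounding mode, and +0 rather than a residue for integral (and infinite) x:

#include <math.h>

static double
modf_sketch (double x, double *out_int)
{
  double fint = trunc (x);               /* svrintz: round toward zero */
  *out_int = fint;
  return (x == fint) ? 0.0 : x - fint;   /* +0 when x is integral */
}
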
diff --git a/math/aarch64/sve/modff.c b/math/aarch64/sve/modff.c
new file mode 100644
index 000000000000..ad7ce4e2c88f
--- /dev/null
+++ b/math/aarch64/sve/modff.c
@@ -0,0 +1,36 @@
+/*
+ * Single-precision SVE modff(x, *y) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Modff algorithm. Produces exact values in all rounding modes. */
+svfloat32_t SV_NAME_F1_L1 (modf) (svfloat32_t x, float *out_int,
+ const svbool_t pg)
+{
+ /* Get integer component of x. */
+ svfloat32_t fint_comp = svrintz_x (pg, x);
+
+ svst1_f32 (pg, out_int, fint_comp);
+
+ /* Subtract integer component from input. */
+ svfloat32_t remaining = svsub_f32_x (svptrue_b32 (), x, fint_comp);
+
+ /* Return +0 for integer x. */
+ svbool_t is_integer = svcmpeq (pg, x, fint_comp);
+ return svsel (is_integer, sv_f32 (0), remaining);
+}
+
+TEST_ULP (_ZGVsMxvl4_modff_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVsMxvl4_modff_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_int, 1, inf, 20000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_pow_1u5.c b/math/aarch64/sve/pow.c
index 0838810206a1..12b2fb42b2cb 100644
--- a/pl/math/sv_pow_1u5.c
+++ b/math/aarch64/sve/pow.c
@@ -1,13 +1,13 @@
/*
* Double-precision SVE pow(x, y) function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2025, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
/* This version shares a similar algorithm to the AOR scalar pow.
@@ -23,8 +23,8 @@
The SVE algorithm drops the tail in the exp computation at the price of
a lower accuracy, slightly above 1ULP.
The SVE algorithm also drops the special treatment of small (< 2^-65) and
- large (> 2^63) finite values of |y|, as they only affect non-round to nearest
- modes.
+ large (> 2^63) finite values of |y|, as they only affect non-round to
+ nearest modes.
Maximum measured error is 1.04 ULPs:
SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
@@ -33,19 +33,18 @@
/* Data is defined in v_pow_log_data.c. */
#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
-#define A __v_pow_log_data.poly
#define Off 0x3fe6955500000000
/* Data is defined in v_pow_exp_data.c. */
#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
-#define C __v_pow_exp_data.poly
#define SmallExp 0x3c9 /* top12(0x1p-54). */
#define BigExp 0x408 /* top12(512.). */
#define ThresExp 0x03f /* BigExp - SmallExp. */
#define HugeExp 0x409 /* top12(1024.). */
/* Constants associated with pow. */
+#define SmallBoundX 0x1p-126
#define SmallPowX 0x001 /* top12(0x1p-126). */
#define BigPowX 0x7ff /* top12(INFINITY). */
#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
@@ -53,6 +52,31 @@
#define BigPowY 0x43e /* top12(0x1.749p62). */
#define ThresPowY 0x080 /* BigPowY - SmallPowY. */
+static const struct data
+{
+ double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
+ double log_c1, log_c3, log_c5, off;
+ double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
+ double exp_c0, exp_c1;
+} data = {
+ .log_c0 = -0x1p-1,
+ .log_c1 = -0x1.555555555556p-1,
+ .log_c2 = 0x1.0000000000006p-1,
+ .log_c3 = 0x1.999999959554ep-1,
+ .log_c4 = -0x1.555555529a47ap-1,
+ .log_c5 = -0x1.2495b9b4845e9p0,
+ .log_c6 = 0x1.0002b8b263fc3p0,
+ .off = Off,
+ .exp_c0 = 0x1.fffffffffffd4p-2,
+ .exp_c1 = 0x1.5555571d6ef9p-3,
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
+ .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
+ .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
+};
+
/* Check if x is an integer. */
static inline svbool_t
sv_isint (svbool_t pg, svfloat64_t x)
@@ -71,7 +95,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x)
static inline svbool_t
sv_isodd (svbool_t pg, svfloat64_t x)
{
- svfloat64_t y = svmul_x (pg, x, 0.5);
+ svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
return sv_isnotint (pg, y);
}
@@ -110,7 +134,7 @@ zeroinfnan (uint64_t i)
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint64_t i)
{
- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
+ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
2 * asuint64 (INFINITY) - 1);
}
@@ -163,23 +187,24 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
additional 15 bits precision. IX is the bit representation of x, but
normalized in the subnormal range using the sign bit for the exponent. */
static inline svfloat64_t
-sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
+sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
+ const struct data *d)
{
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
sv_u64 (N_LOG - 1));
svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
+ svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
svfloat64_t z = svreinterpret_f64 (iz);
svfloat64_t kd = svcvt_f64_x (pg, k);
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
/* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
- that uses array of structures. We also do the lookup earlier in the code to
- make sure it finishes as early as possible. */
+ that uses array of structures. We also do the lookup earlier in the code
+ to make sure it finishes as early as possible. */
svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
@@ -188,40 +213,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
svfloat64_t r = svmad_x (pg, z, invc, -1.0);
/* k*Ln2 + log(c) + r. */
- svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
+
+ svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
svfloat64_t t2 = svadd_x (pg, t1, r);
- svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
+ svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
/* Evaluation is optimized assuming superscalar pipelined execution. */
- svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */
- svfloat64_t ar2 = svmul_x (pg, r, ar);
- svfloat64_t ar3 = svmul_x (pg, r, ar2);
+
+ svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
+ svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
+ svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
+ svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
/* k*Ln2 + log(c) + r + A[0]*r*r. */
svfloat64_t hi = svadd_x (pg, t2, ar2);
- svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
+ svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
/* p = log1p(r) - r - A[0]*r*r. */
/* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
A[6])))). */
- svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
- svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
- svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
+
+ svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
+ svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
+ svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
+ svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
svfloat64_t p = svmla_x (pg, a34, ar2, a56);
p = svmla_x (pg, a12, ar2, p);
- p = svmul_x (pg, ar3, p);
+ p = svmul_x (svptrue_b64 (), ar3, p);
svfloat64_t lo = svadd_x (
- pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
+ pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
svfloat64_t y = svadd_x (pg, hi, lo);
*tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
return y;
}
+static inline svfloat64_t
+sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
+ svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
+ svuint64_t *ki, const struct data *d)
+{
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+ svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
+ svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ svfloat64_t kd = svrinta_x (pg, z);
+ *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));
+
+ svfloat64_t ln2_over_n_hilo
+ = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
+ svfloat64_t r = x;
+ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
+ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r = svadd_x (pg, r, xtail);
+ /* 2^(k/N) ~= scale. */
+ svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
+ svuint64_t top
+ = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
+ *sbits = svadd_x (pg, *sbits, top);
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
+ *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
+ *tmp = svmla_x (pg, r, r2, *tmp);
+ svfloat64_t scale = svreinterpret_f64 (*sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ z = svmla_x (pg, scale, scale, *tmp);
+ return z;
+}
+
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
static inline svfloat64_t
sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
- svuint64_t sign_bias)
+ svuint64_t sign_bias, const struct data *d)
{
/* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
and other cases of large values of x (scale * (1 + TMP) oflow). */
@@ -229,73 +299,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
/* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */
svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);
- /* Conditions special, uflow and oflow are all expressed as uoflow &&
- something, hence do not bother computing anything if no lane in uoflow is
- true. */
- svbool_t special = svpfalse_b ();
- svbool_t uflow = svpfalse_b ();
- svbool_t oflow = svpfalse_b ();
+ svfloat64_t tmp;
+ svuint64_t sbits, ki;
if (unlikely (svptest_any (pg, uoflow)))
{
+ svfloat64_t z
+ = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
+
/* |x| is tiny (|x| <= 0x1p-54). */
- uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
+ svbool_t uflow
+ = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
uflow = svand_z (pg, uoflow, uflow);
/* |x| is huge (|x| >= 1024). */
- oflow = svcmpge (pg, abstop, HugeExp);
+ svbool_t oflow = svcmpge (pg, abstop, HugeExp);
oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
+
/* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
- or underflow. */
- special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+ or underflow. */
+ svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+
+ /* Update result with special and large cases. */
+ z = sv_call_specialcase (tmp, sbits, ki, z, special);
+
+ /* Handle underflow and overflow. */
+ svbool_t x_is_neg = svcmplt (pg, x, 0);
+ svuint64_t sign_mask
+ = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
+ svfloat64_t res_uoflow
+ = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
+ res_uoflow = svreinterpret_f64 (
+ svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
+ /* Avoid spurious underflow for tiny x. */
+ svfloat64_t res_spurious_uflow
+ = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
+
+ z = svsel (oflow, res_uoflow, z);
+ z = svsel (uflow, res_spurious_uflow, z);
+ return z;
}
- /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
- /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
- svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
- /* z - kd is in [-1, 1] in non-nearest rounding modes. */
- svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
- svfloat64_t kd = svadd_x (pg, z, shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- kd = svsub_x (pg, kd, shift);
- svfloat64_t r = x;
- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
- /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- r = svadd_x (pg, r, xtail);
- /* 2^(k/N) ~= scale. */
- svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
- svuint64_t top
- = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
- /* This is only a valid scale when -1023*N < k < 1024*N. */
- svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
- sbits = svadd_x (pg, sbits, top);
- /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
- tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
- tmp = svmla_x (pg, r, r2, tmp);
- svfloat64_t scale = svreinterpret_f64 (sbits);
- /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
- is no spurious underflow here even without fma. */
- z = svmla_x (pg, scale, scale, tmp);
-
- /* Update result with special and large cases. */
- if (unlikely (svptest_any (pg, special)))
- z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
- /* Handle underflow and overflow. */
- svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
- svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
- svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
- svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
- res_uoflow = svreinterpret_f64 (
- svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
- z = svsel (oflow, res_uoflow, z);
- /* Avoid spurious underflow for tiny x. */
- svfloat64_t res_spurious_uflow
- = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
- z = svsel (uflow, res_spurious_uflow, z);
-
- return z;
+ return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
}
static inline double
@@ -323,56 +366,46 @@ pow_sc (double x, double y)
double_t x2 = x * x;
if (ix >> 63 && checkint (iy) == 1)
x2 = -x2;
- /* Without the barrier some versions of clang hoist the 1/x2 and
- thus division by zero exception can be signaled spuriously. */
- return (iy >> 63) ? opt_barrier_double (1 / x2) : x2;
+ return (iy >> 63) ? 1 / x2 : x2;
}
return x;
}
svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
/* This preamble handles special case conditions used in the final scalar
fallbacks. It also updates ix and sign_bias, which are used in the core
computation too, i.e., exp( y * log (x) ). */
svuint64_t vix0 = svreinterpret_u64 (x);
svuint64_t viy0 = svreinterpret_u64 (y);
- svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);
/* Negative x cases. */
- svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
- svbool_t xisneg = svcmpeq (pg, sign_bit, 1);
+ svbool_t xisneg = svcmplt (pg, x, 0);
/* Set sign_bias and ix depending on sign of x and nature of y. */
- svbool_t yisnotint_xisneg = svpfalse_b ();
+ svbool_t yint_or_xpos = pg;
svuint64_t sign_bias = sv_u64 (0);
svuint64_t vix = vix0;
- svuint64_t vtopx1 = vtopx0;
if (unlikely (svptest_any (pg, xisneg)))
{
/* Determine nature of y. */
- yisnotint_xisneg = sv_isnotint (xisneg, y);
- svbool_t yisint_xisneg = sv_isint (xisneg, y);
+ yint_or_xpos = sv_isint (xisneg, y);
svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
/* ix set to abs(ix) if y is integer. */
- vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
- vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
+ vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
/* Set to SignBias if x is negative and y is odd. */
sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
}
- /* Special cases of x or y: zero, inf and nan. */
- svbool_t xspecial = sv_zeroinfnan (pg, vix0);
- svbool_t yspecial = sv_zeroinfnan (pg, viy0);
- svbool_t special = svorr_z (pg, xspecial, yspecial);
-
/* Small cases of x: |x| < 0x1p-126. */
- svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
- svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
- if (unlikely (svptest_any (pg, xsmall)))
+ svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
+ if (unlikely (svptest_any (yint_or_xpos, xsmall)))
{
/* Normalize subnormal x so exponent becomes negative. */
- svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);
+ svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
+ svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);
svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
@@ -382,33 +415,38 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
/* y_hi = log(ix, &y_lo). */
svfloat64_t vlo;
- svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);
+ svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);
/* z = exp(y_hi, y_lo, sign_bias). */
- svfloat64_t vehi = svmul_x (pg, y, vhi);
- svfloat64_t velo = svmul_x (pg, y, vlo);
- svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
- velo = svsub_x (pg, velo, vemi);
- svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);
+ svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
+ svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
+ svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
+ svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);
/* Cases of finite y and finite negative x. */
- vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);
+ vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));
+
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
+ svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
+ svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);
/* Cases of zero/inf/nan x or y. */
- if (unlikely (svptest_any (pg, special)))
+ if (unlikely (svptest_any (svptrue_b64 (), special)))
vz = sv_call2_f64 (pow_sc, x, y, vz, special);
return vz;
}
-PL_SIG (SV, D, 2, pow)
-PL_TEST_ULP (SV_NAME_D2 (pow), 0.55)
+TEST_SIG (SV, D, 2, pow)
+TEST_ULP (SV_NAME_D2 (pow), 0.55)
+TEST_DISABLE_FENV (SV_NAME_D2 (pow))
/* Wide intervals spanning the whole domain but shared between x and y. */
-#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
- PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
+ TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
#define EXPAND(str) str##000000000
#define SHL52(str) EXPAND (str)
SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
@@ -426,10 +464,10 @@ SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer. */
-PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer. */
SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
@@ -438,7 +476,8 @@ SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y. */
SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y. */
-PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
-PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
+TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
+CLOSE_SVE_ATTR
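
sv_exp_core above is the classic table-driven 2^(k/N) * exp(r) split. A scalar sketch using the constants from the data struct (N = 256 is implied by the p-9 scaling of ln2_over_n_hi); the freshly computed table below is a stand-in for the tuned __v_pow_exp_data.sbits, not a copy of it:

#include <math.h>
#include <stdint.h>
#include <string.h>

#define EXP_TABLE_BITS 8            /* implied by the p-9 ln2_over_n_hi */
#define N_EXP (1 << EXP_TABLE_BITS)

static uint64_t sbits_sketch[N_EXP];

/* Fill a stand-in scale table: bits of 2^(i/N) with the index contribution
   pre-subtracted, so that adding ki << (52 - EXP_TABLE_BITS) below places k
   straight into the exponent field. */
static void
init_exp_table (void)
{
  for (uint64_t i = 0; i < N_EXP; i++)
    {
      double s = exp2 ((double) i / N_EXP);
      uint64_t bits;
      memcpy (&bits, &s, sizeof bits);
      sbits_sketch[i] = bits - (i << (52 - EXP_TABLE_BITS));
    }
}

/* exp(x) ~= 2^(k/N) * exp(r) with x = k*ln2/N + r.  Valid only away from
   the overflow/underflow bounds handled separately above. */
static double
exp_core_sketch (double x)
{
  double kd = round (x * (0x1.71547652b82fep0 * N_EXP)); /* x * N/ln2 */
  int64_t ki = (int64_t) kd;
  double r = x - kd * 0x1.62e42fefc0000p-9               /* ln2/N hi */
               - kd * -0x1.c610ca86c3899p-45;            /* ln2/N lo */
  uint64_t sbits = sbits_sketch[ki & (N_EXP - 1)]
                   + ((uint64_t) ki << (52 - EXP_TABLE_BITS));
  double scale;                                          /* 2^(k/N) */
  memcpy (&scale, &sbits, sizeof scale);
  double r2 = r * r;
  double tmp = r + r2 * (0x1.fffffffffffd4p-2            /* exp_c0 */
                         + r * 0x1.5555571d6ef9p-3       /* exp_c1 */
                         + r2 * 0x1.5555576a5adcep-5);   /* exp_c2 */
  return scale + scale * tmp;
}
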
diff --git a/pl/math/sv_powf_2u6.c b/math/aarch64/sve/powf.c
index 2db0636aea62..8457e83e7495 100644
--- a/pl/math/sv_powf_2u6.c
+++ b/math/aarch64/sve/powf.c
@@ -1,13 +1,13 @@
/*
* Single-precision SVE powf function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2025, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
/* The following data is used in the SVE pow core computation
and special case detection. */
@@ -15,7 +15,6 @@
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
-#define Shift 0x1.8p52
#define Norm 0x1p23f /* 0x4b000000. */
/* Overall ULP error bound for pow is 2.6 ulp
@@ -25,7 +24,7 @@ static const struct data
double log_poly[4];
double exp_poly[3];
float uflow_bound, oflow_bound, small_bound;
- uint32_t sign_bias, sign_mask, subnormal_bias, off;
+ uint32_t sign_bias, subnormal_bias, off;
} data = {
/* rel err: 1.5 * 2^-30. Each coefficient is multiplied by the value of
V_POWF_EXP2_N. */
@@ -42,7 +41,6 @@ static const struct data
.small_bound = 0x1p-126f,
.off = 0x3f35d000,
.sign_bias = SignBias,
- .sign_mask = 0x80000000,
.subnormal_bias = 0x0b800000, /* 23 << 23. */
};
@@ -75,7 +73,7 @@ svisodd (svbool_t pg, svfloat32_t x)
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint32_t i)
{
- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
2u * 0x7f800000 - 1);
}
@@ -104,7 +102,7 @@ zeroinfnan (uint32_t ix)
}
/* A scalar subroutine used to fix main power special cases. Similar to the
- preamble of finite_powf except that we do not update ix and sign_bias. This
+ preamble of scalar powf except that we do not update ix and sign_bias. This
is done in the preamble of the SVE powf. */
static inline float
powf_specialcase (float x, float y, float z)
@@ -139,9 +137,14 @@ powf_specialcase (float x, float y, float z)
}
/* Scalar fallback for special case routines with custom signature. */
-static inline svfloat32_t
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
{
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
+ svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
+ svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
+
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
@@ -171,30 +174,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
/* Polynomial to approximate log1p(r)/ln2. */
svfloat64_t logx = A (0);
- logx = svmla_x (pg, A (1), r, logx);
- logx = svmla_x (pg, A (2), r, logx);
- logx = svmla_x (pg, A (3), r, logx);
- logx = svmla_x (pg, y0, r, logx);
+ logx = svmad_x (pg, r, logx, A (1));
+ logx = svmad_x (pg, r, logx, A (2));
+ logx = svmad_x (pg, r, logx, A (3));
+ logx = svmad_x (pg, r, logx, y0);
*pylogx = svmul_x (pg, y, logx);
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
- svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- kd = svsub_x (pg, kd, Shift);
+ svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
+ svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
r = svsub_x (pg, *pylogx, kd);
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
- svuint64_t t
- = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
- svuint64_t ski = svadd_x (pg, ki, sign_bias);
- t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
+ svuint64_t t = svld1_gather_index (
+ svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
+ svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
+ t = svadd_x (svptrue_b64 (), t,
+ svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
svfloat64_t s = svreinterpret_f64 (t);
svfloat64_t p = C (0);
p = svmla_x (pg, C (1), p, r);
p = svmla_x (pg, C (2), p, r);
- p = svmla_x (pg, s, p, svmul_x (pg, s, r));
+ p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));
return p;
}
@@ -208,19 +211,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
{
const svbool_t ptrue = svptrue_b64 ();
- /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
- order to perform core computation in double precision. */
+ /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
+ in order to perform core computation in double precision. */
const svbool_t pg_lo = svunpklo (pg);
const svbool_t pg_hi = svunpkhi (pg);
- svfloat64_t y_lo = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
- svfloat64_t y_hi = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
- svfloat32_t z = svreinterpret_f32 (iz);
- svfloat64_t z_lo = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
- svfloat64_t z_hi = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
+ svfloat64_t y_lo
+ = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+ svfloat64_t y_hi
+ = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+ svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
+ svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
svuint64_t i_lo = svunpklo (i);
svuint64_t i_hi = svunpkhi (i);
svint64_t k_lo = svunpklo (k);
@@ -247,9 +247,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
/* Implementation of SVE powf.
Provides the same accuracy as AdvSIMD powf, since it relies on the same
algorithm. The theoretical maximum error is under 2.60 ULPs.
- Maximum measured error is 2.56 ULPs:
- SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
- want 0x1.fd4b06p+127. */
+ Maximum measured error is 2.57 ULPs:
+ SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
+ want 0x1.fff862p+127. */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -258,21 +258,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
svuint32_t viy0 = svreinterpret_u32 (y);
/* Negative x cases. */
- svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
- svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
+ svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));
/* Set sign_bias and ix depending on sign of x and nature of y. */
- svbool_t yisnotint_xisneg = svpfalse_b ();
+ svbool_t yint_or_xpos = pg;
svuint32_t sign_bias = sv_u32 (0);
svuint32_t vix = vix0;
if (unlikely (svptest_any (pg, xisneg)))
{
/* Determine nature of y. */
- yisnotint_xisneg = svisnotint (xisneg, y);
- svbool_t yisint_xisneg = svisint (xisneg, y);
+ yint_or_xpos = svisint (xisneg, y);
svbool_t yisodd_xisneg = svisodd (xisneg, y);
/* ix set to abs(ix) if y is integer. */
- vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
+ vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
/* Set to SignBias if x is negative and y is odd. */
sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
}
@@ -283,8 +281,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
svbool_t cmp = svorr_z (pg, xspecial, yspecial);
/* Small cases of x: |x| < 0x1p-126. */
- svbool_t xsmall = svaclt (pg, x, d->small_bound);
- if (unlikely (svptest_any (pg, xsmall)))
+ svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
+ if (unlikely (svptest_any (yint_or_xpos, xsmall)))
{
/* Normalize subnormal x so exponent becomes negative. */
svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
@@ -293,44 +291,48 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
vix = svsel (xsmall, vix_norm, vix);
}
/* Part of core computation carried in working precision. */
- svuint32_t tmp = svsub_x (pg, vix, d->off);
- svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
- V_POWF_LOG2_N - 1);
- svuint32_t top = svand_x (pg, tmp, 0xff800000);
- svuint32_t iz = svsub_x (pg, vix, top);
- svint32_t k
- = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
-
- /* Compute core in extended precision and return intermediate ylogx results to
- handle cases of underflow and underflow in exp. */
+ svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
+ svuint32_t i = svand_x (
+ yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+ V_POWF_LOG2_N - 1);
+ svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
+ svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
+ svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
+ (23 - V_POWF_EXP2_TABLE_BITS));
+
+ /* Compute core in extended precision and return intermediate ylogx results
+ to handle cases of underflow and overflow in exp. */
svfloat32_t ylogx;
- svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+ svfloat32_t ret
+ = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
/* Handle exp special cases of underflow and overflow. */
- svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+ svuint32_t sign
+ = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
svfloat32_t ret_oflow
- = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+ = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
svfloat32_t ret_uflow = svreinterpret_f32 (sign);
- ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
- ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+ ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
+ ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
/* Cases of finite y and finite negative x. */
- ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+ ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
- if (unlikely (svptest_any (pg, cmp)))
- return sv_call_powf_sc (x, y, ret, cmp);
+ if (unlikely (svptest_any (cmp, cmp)))
+ return sv_call_powf_sc (x, y, ret);
return ret;
}
-PL_SIG (SV, F, 2, pow)
-PL_TEST_ULP (SV_NAME_F2 (pow), 2.06)
+TEST_SIG (SV, F, 2, pow)
+TEST_ULP (SV_NAME_F2 (pow), 2.08)
+TEST_DISABLE_FENV (SV_NAME_F2 (pow))
/* Wide intervals spanning the whole domain but shared between x and y. */
-#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n) \
- PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n) \
+ TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n) \
+ TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
@@ -342,10 +344,10 @@ SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer. */
-PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer. */
SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
@@ -354,7 +356,8 @@ SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y. */
SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y. */
-PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
-PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
+CLOSE_SVE_ATTR
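
Both pow variants classify y the same way before the core computation. Scalar sketches of the three predicates (sv_isint, sv_isodd and sv_zeroinfnan, mirroring the library's own scalar helpers); the i + i change in the diff is just a cheaper doubling feeding the same wrapping compare:

#include <math.h>
#include <stdint.h>

/* y must be an integer for negative x to have a real result; its parity
   then sets the result sign via sign_bias. */
static int
is_integer (double y)
{
  return trunc (y) == y;    /* svrintz: round toward zero */
}

static int
is_odd_integer (double y)
{
  /* Halving an odd integer yields a non-integer; at 2^53 and above every
     representable value is an even integer, so this correctly returns 0. */
  return is_integer (y) && !is_integer (0.5 * y);
}

/* Doubling shifts the sign bit out, so one wrapping compare catches +-0,
   +-inf and NaN at once. */
static int
zeroinfnan (uint64_t i)
{
  return 2 * i - 1 >= 2 * (uint64_t) 0x7ff0000000000000 - 1;
}
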
diff --git a/pl/math/sv_sin_3u5.c b/math/aarch64/sve/sin.c
index a81f3fc80f3d..7e22515ceb79 100644
--- a/pl/math/sv_sin_3u5.c
+++ b/math/aarch64/sve/sin.c
@@ -1,13 +1,13 @@
/*
* Double-precision SVE sin(x) function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -90,7 +90,9 @@ svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg)
return svreinterpret_f64 (sveor_z (pg, svreinterpret_u64 (y), odd));
}
-PL_SIG (SV, D, 1, sin, -3.1, 3.1)
-PL_TEST_ULP (SV_NAME_D1 (sin), 2.73)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0, 0x1p23, 1000000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0x1p23, inf, 10000)
+TEST_SIG (SV, D, 1, sin, -3.1, 3.1)
+TEST_ULP (SV_NAME_D1 (sin), 2.73)
+TEST_DISABLE_FENV (SV_NAME_D1 (sin))
+TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0, 0x1p23, 1000000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0x1p23, inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_sincos_3u5.c b/math/aarch64/sve/sincos.c
index f73550082d5b..26b8bb3c6a5a 100644
--- a/pl/math/sv_sincos_3u5.c
+++ b/math/aarch64/sve/sincos.c
@@ -1,7 +1,7 @@
/*
* Double-precision vector sincos function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,12 +9,22 @@
pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
be linked against the scalar sincosf from math/. */
#define _GNU_SOURCE
-#include <math.h>
-#undef _GNU_SOURCE
-#include "sv_sincos_common.h"
#include "sv_math.h"
-#include "pl_test.h"
+#include "sv_sincos_common.h"
+#include "test_defs.h"
+
+#include <math.h>
+
+/* sincos is not available in all scalar libm implementations. */
+#ifndef __GLIBC__
+static void
+sincos (double x, double *out_sin, double *out_cos)
+{
+ *out_sin = sin (x);
+ *out_cos = cos (x);
+}
+#endif
static void NOINLINE
special_case (svfloat64_t x, svbool_t special, double *out_sin,
@@ -50,12 +60,14 @@ _ZGVsMxvl8l8_sincos (svfloat64_t x, double *out_sin, double *out_cos,
special_case (x, special, out_sin, out_cos);
}
-PL_TEST_ULP (_ZGVsMxv_sincos_sin, 2.73)
-PL_TEST_ULP (_ZGVsMxv_sincos_cos, 2.73)
+TEST_DISABLE_FENV (_ZGVsMxv_sincos_sin)
+TEST_DISABLE_FENV (_ZGVsMxv_sincos_cos)
+TEST_ULP (_ZGVsMxv_sincos_sin, 2.73)
+TEST_ULP (_ZGVsMxv_sincos_cos, 2.73)
#define SV_SINCOS_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVsMxv_sincos_sin, lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVsMxv_sincos_cos, lo, hi, n)
-SV_SINCOS_INTERVAL (0, 0x1p23, 500000)
-SV_SINCOS_INTERVAL (-0, -0x1p23, 500000)
+ TEST_SYM_INTERVAL (_ZGVsMxv_sincos_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVsMxv_sincos_cos, lo, hi, n)
+SV_SINCOS_INTERVAL (0, 0x1p-63, 50000)
+SV_SINCOS_INTERVAL (0x1p-63, 0x1p23, 500000)
SV_SINCOS_INTERVAL (0x1p23, inf, 10000)
-SV_SINCOS_INTERVAL (-0x1p23, -inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_sincosf_1u8.c b/math/aarch64/sve/sincosf.c
index c335de8d3dbb..f3e956ee62e2 100644
--- a/pl/math/sv_sincosf_1u8.c
+++ b/math/aarch64/sve/sincosf.c
@@ -1,7 +1,7 @@
/*
* Single-precision vector sincos function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,12 +9,22 @@
pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
be linked against the scalar sincosf from math/. */
#define _GNU_SOURCE
-#include <math.h>
-#undef _GNU_SOURCE
-#include "sv_sincosf_common.h"
#include "sv_math.h"
-#include "pl_test.h"
+#include "sv_sincosf_common.h"
+#include "test_defs.h"
+
+#include <math.h>
+
+/* sincosf is not available in all scalar libm implementations. */
+#ifndef __GLIBC__
+static void
+sincosf (float x, float *out_sin, float *out_cos)
+{
+ *out_sin = sinf (x);
+ *out_cos = cosf (x);
+}
+#endif
static void NOINLINE
special_case (svfloat32_t x, svbool_t special, float *out_sin, float *out_cos)
@@ -51,12 +61,14 @@ _ZGVsMxvl4l4_sincosf (svfloat32_t x, float *out_sin, float *out_cos,
special_case (x, special, out_sin, out_cos);
}
-PL_TEST_ULP (_ZGVsMxv_sincosf_sin, 1.17)
-PL_TEST_ULP (_ZGVsMxv_sincosf_cos, 1.31)
+TEST_DISABLE_FENV (_ZGVsMxv_sincosf_sin)
+TEST_DISABLE_FENV (_ZGVsMxv_sincosf_cos)
+TEST_ULP (_ZGVsMxv_sincosf_sin, 1.17)
+TEST_ULP (_ZGVsMxv_sincosf_cos, 1.31)
#define SV_SINCOSF_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVsMxv_sincosf_sin, lo, hi, n) \
- PL_TEST_INTERVAL (_ZGVsMxv_sincosf_cos, lo, hi, n)
-SV_SINCOSF_INTERVAL (0, 0x1p20, 500000)
-SV_SINCOSF_INTERVAL (-0, -0x1p20, 500000)
+ TEST_SYM_INTERVAL (_ZGVsMxv_sincosf_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVsMxv_sincosf_cos, lo, hi, n)
+SV_SINCOSF_INTERVAL (0, 0x1p-31, 50000)
+SV_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000)
SV_SINCOSF_INTERVAL (0x1p20, inf, 10000)
-SV_SINCOSF_INTERVAL (-0x1p20, -inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/sincospi.c b/math/aarch64/sve/sincospi.c
new file mode 100644
index 000000000000..d06ca8cc4165
--- /dev/null
+++ b/math/aarch64/sve/sincospi.c
@@ -0,0 +1,47 @@
+/*
+ * Double-precision SVE sincospi(x, *y, *z) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+#include "sv_sincospi_common.h"
+
+/* Double-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.09 ULP:
+ _ZGVsMxvl8l8_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+ want 0x1.fd54d0b327cf4p-1.
+ Worst-case error for cos is 3.16 ULP:
+ _ZGVsMxvl8l8_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+ want 0x1.fd2da484ff402p-1.
+ */
+void
+_ZGVsMxvl8l8_sincospi (svfloat64_t x, double *out_sin, double *out_cos,
+ svbool_t pg)
+{
+ const struct sv_sincospi_data *d = ptr_barrier (&sv_sincospi_data);
+
+ svfloat64x2_t sc = sv_sincospi_inline (pg, x, d);
+
+ svst1 (pg, out_sin, svget2 (sc, 0));
+ svst1 (pg, out_cos, svget2 (sc, 1));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVsMxvl8l8_sincospi_sin)
+TEST_DISABLE_FENV (_ZGVsMxvl8l8_sincospi_cos)
+TEST_ULP (_ZGVsMxvl8l8_sincospi_sin, 2.59)
+TEST_ULP (_ZGVsMxvl8l8_sincospi_cos, 2.66)
+# define SV_SINCOSPI_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVsMxvl8l8_sincospi_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVsMxvl8l8_sincospi_cos, lo, hi, n)
+SV_SINCOSPI_INTERVAL (0, 0x1p-63, 10000)
+SV_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000)
+SV_SINCOSPI_INTERVAL (0.5, 0x1p53, 50000)
+SV_SINCOSPI_INTERVAL (0x1p53, inf, 10000)
+#endif
+CLOSE_SVE_ATTR
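
The vl8l8 suffix in the vector ABI name encodes the two pointer (8-byte element) arguments: each call stores one sinpi and one cospi result per active lane through out_sin and out_cos. A sketch of a hypothetical caller, assuming the declaration comes from mathlib.h as above, that n is a multiple of the vector length, and that the file is built with SVE enabled:

  #include <arm_sve.h>
  #include "mathlib.h"

  /* Hypothetical helper: fill s[] and c[] with sinpi/cospi of x[0..n). */
  void
  sincospi_array (const double *x, double *s, double *c, int n)
  {
    svbool_t pg = svptrue_b64 ();
    for (int i = 0; i < n; i += (int) svcntd ())
      _ZGVsMxvl8l8_sincospi (svld1 (pg, x + i), s + i, c + i, pg);
  }
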
diff --git a/math/aarch64/sve/sincospif.c b/math/aarch64/sve/sincospif.c
new file mode 100644
index 000000000000..20476f9346e9
--- /dev/null
+++ b/math/aarch64/sve/sincospif.c
@@ -0,0 +1,46 @@
+/*
+ * Single-precision SVE sincospi(x, *y, *z) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+#include "sv_sincospif_common.h"
+
+/* Single-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.04 ULP:
+ _ZGVsMxvl4l4_sincospif_sin(0x1.b51b8p-2) got 0x1.f28b5ep-1 want
+ 0x1.f28b58p-1.
+ Worst-case error for cos is 3.18 ULP:
+ _ZGVsMxvl4l4_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want
+ 0x1.f7cd5p-1. */
+void
+_ZGVsMxvl4l4_sincospif (svfloat32_t x, float *out_sin, float *out_cos,
+ svbool_t pg)
+{
+ const struct sv_sincospif_data *d = ptr_barrier (&sv_sincospif_data);
+
+ svfloat32x2_t sc = sv_sincospif_inline (pg, x, d);
+
+ svst1 (pg, out_sin, svget2 (sc, 0));
+ svst1 (pg, out_cos, svget2 (sc, 1));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVsMxvl4l4_sincospif_sin)
+TEST_DISABLE_FENV (_ZGVsMxvl4l4_sincospif_cos)
+TEST_ULP (_ZGVsMxvl4l4_sincospif_sin, 2.54)
+TEST_ULP (_ZGVsMxvl4l4_sincospif_cos, 2.68)
+# define SV_SINCOSPIF_INTERVAL(lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVsMxvl4l4_sincospif_sin, lo, hi, n) \
+ TEST_SYM_INTERVAL (_ZGVsMxvl4l4_sincospif_cos, lo, hi, n)
+SV_SINCOSPIF_INTERVAL (0, 0x1p-31, 10000)
+SV_SINCOSPIF_INTERVAL (0x1p-31, 0.5, 50000)
+SV_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000)
+SV_SINCOSPIF_INTERVAL (0x1p31, inf, 10000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_sinf_1u9.c b/math/aarch64/sve/sinf.c
index 675d7b2480f7..62127194d60f 100644
--- a/pl/math/sv_sinf_1u9.c
+++ b/math/aarch64/sve/sinf.c
@@ -1,13 +1,13 @@
/*
* Single-precision SVE sin(x) function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -87,7 +87,9 @@ svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg)
return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
}
-PL_SIG (SV, F, 1, sin, -3.1, 3.1)
-PL_TEST_ULP (SV_NAME_F1 (sin), 1.40)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0, 0x1p23, 1000000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0x1p23, inf, 10000)
+TEST_SIG (SV, F, 1, sin, -3.1, 3.1)
+TEST_ULP (SV_NAME_F1 (sin), 1.40)
+TEST_DISABLE_FENV (SV_NAME_F1 (sin))
+TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0, 0x1p23, 1000000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0x1p23, inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_sinh_3u.c b/math/aarch64/sve/sinh.c
index a01e19caecda..8a35c1c38525 100644
--- a/pl/math/sv_sinh_3u.c
+++ b/math/aarch64/sve/sinh.c
@@ -1,14 +1,14 @@
/*
* Double-precision SVE sinh(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "sv_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -96,8 +96,10 @@ svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
return svmul_x (pg, t, halfsign);
}
-PL_SIG (SV, D, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_D1 (sinh), 2.08)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0, 0x1p-26, 1000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p9, inf, 1000)
+TEST_SIG (SV, D, 1, sinh, -10.0, 10.0)
+TEST_ULP (SV_NAME_D1 (sinh), 2.08)
+TEST_DISABLE_FENV (SV_NAME_D1 (sinh))
+TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p9, inf, 1000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_sinhf_2u3.c b/math/aarch64/sve/sinhf.c
index e34ecf378ad3..82b7ee442780 100644
--- a/pl/math/sv_sinhf_2u3.c
+++ b/math/aarch64/sve/sinhf.c
@@ -1,14 +1,13 @@
/*
* Single-precision SVE sinh(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
+#include "test_sig.h"
+#include "test_defs.h"
#include "sv_expm1f_inline.h"
static const struct data
@@ -54,11 +53,13 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
if (unlikely (svptest_any (pg, special)))
return special_case (x, svmul_x (pg, t, halfsign), special);
- return svmul_x (pg, t, halfsign);
+ return svmul_x (svptrue_b32 (), t, halfsign);
}
-PL_SIG (SV, F, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_F1 (sinh), 1.76)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0, 0x1.6a09e8p-32, 1000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x1.6a09e8p-32, 0x42b0c0a7, 100000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
+TEST_SIG (SV, F, 1, sinh, -10.0, 10.0)
+TEST_ULP (SV_NAME_F1 (sinh), 1.76)
+TEST_DISABLE_FENV (SV_NAME_F1 (sinh))
+TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0, 0x1.6a09e8p-32, 1000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x1.6a09e8p-32, 0x42b0c0a7, 100000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_sinpi_3u1.c b/math/aarch64/sve/sinpi.c
index c9f23da1b19b..8fad3678b172 100644
--- a/pl/math/sv_sinpi_3u1.c
+++ b/math/aarch64/sve/sinpi.c
@@ -1,19 +1,19 @@
/*
* Double-precision SVE sinpi(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "mathlib.h"
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f64.h"
+#include "mathlib.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f64.h"
static const struct data
{
- double poly[10];
+ double poly[10], range_val;
} data = {
/* Polynomial coefficients generated using Remez algorithm,
see sinpi.sollya for details. */
@@ -21,6 +21,7 @@ static const struct data
-0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16,
0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 },
+ .range_val = 0x1p63,
};
/* A fast SVE implementation of sinpi.
@@ -37,8 +38,9 @@ svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg)
svfloat64_t r = svsub_x (pg, x, n);
/* Result should be negated based on if n is odd or not. */
- svuint64_t intn = svreinterpret_u64 (svcvt_s64_x (pg, n));
- svuint64_t sign = svlsl_z (pg, intn, 63);
+ svbool_t cmp = svaclt (pg, x, d->range_val);
+ svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n));
+ svuint64_t sign = svlsl_z (cmp, intn, 63);
/* y = sin(r). */
svfloat64_t r2 = svmul_x (pg, r, r);
@@ -49,9 +51,12 @@ svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg)
return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
}
-PL_SIG (SV, D, 1, sinpi, -0.9, 0.9)
-PL_TEST_ULP (SV_NAME_D1 (sinpi), 2.61)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#if WANT_TRIGPI_TESTS
+TEST_ULP (SV_NAME_D1 (sinpi), 2.61)
+TEST_DISABLE_FENV (SV_NAME_D1 (sinpi))
+TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_sinpif_2u5.c b/math/aarch64/sve/sinpif.c
index ac3f924bed68..b91768a29cb6 100644
--- a/pl/math/sv_sinpif_2u5.c
+++ b/math/aarch64/sve/sinpif.c
@@ -1,23 +1,24 @@
/*
* Single-precision SVE sinpi(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "mathlib.h"
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f32.h"
+#include "mathlib.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
static const struct data
{
- float poly[6];
+ float poly[6], range_val;
} data = {
/* Taylor series coefficients for sin(pi * x). */
.poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
0x1.50783p-4f, -0x1.e30750p-8f },
+ .range_val = 0x1p31,
};
/* A fast SVE implementation of sinpif.
@@ -34,8 +35,9 @@ svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg)
svfloat32_t r = svsub_x (pg, x, n);
/* Result should be negated based on if n is odd or not. */
- svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n));
- svuint32_t sign = svlsl_z (pg, intn, 31);
+ svbool_t cmp = svaclt (pg, x, d->range_val);
+ svuint32_t intn = svreinterpret_u32 (svcvt_s32_z (pg, n));
+ svuint32_t sign = svlsl_z (cmp, intn, 31);
/* y = sin(r). */
svfloat32_t r2 = svmul_x (pg, r, r);
@@ -45,9 +47,12 @@ svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg)
return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
}
-PL_SIG (SV, F, 1, sinpi, -0.9, 0.9)
-PL_TEST_ULP (SV_NAME_F1 (sinpi), 1.99)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0.5, 0x1p22f, 10000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p22f, inf, 10000)
+#if WANT_TRIGPI_TESTS
+TEST_ULP (SV_NAME_F1 (sinpi), 1.99)
+TEST_DISABLE_FENV (SV_NAME_F1 (sinpi))
+TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0.5, 0x1p22f, 10000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p22f, inf, 10000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/sv_expf_inline.h b/math/aarch64/sve/sv_expf_inline.h
new file mode 100644
index 000000000000..6054e65bb202
--- /dev/null
+++ b/math/aarch64/sve/sv_expf_inline.h
@@ -0,0 +1,66 @@
+/*
+ * SVE helper for single-precision routines which calculate exp(x) and do
+ * not need special-case handling
+ *
+ * Copyright (c) 2023-2025, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_SV_EXPF_INLINE_H
+#define MATH_SV_EXPF_INLINE_H
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+struct sv_expf_data
+{
+ float c1, c3, inv_ln2;
+ float ln2_lo, c0, c2, c4;
+ float ln2_hi, shift;
+};
+
+/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+ compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
+#define SV_EXPF_DATA \
+ { \
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
+ .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
+ .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .shift = 0x1.803f8p17f, \
+ }
+
+static inline svfloat32_t
+expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
+{
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
+
+ /* n = round(x/(ln2/N)). */
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
+ svfloat32_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);
+
+ /* scale = 2^(n/N). */
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
+
+ /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+#endif // MATH_SV_EXPF_INLINE_H
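
A scalar sketch of the same reduction can make the vector schedule easier to read; it assumes round-to-nearest and substitutes ldexpf for the FEXPA-based scale that svexpa computes from the bits of z:

  #include <math.h>

  /* Sketch only: exp(x) = 2^n * (1 + poly(r)), r = x - n*ln2. */
  static float
  expf_sketch (float x)
  {
    float n = rintf (x * 0x1.715476p+0f);          /* n = rint(x/ln2). */
    float r = x - n * 0x1.62e4p-1f;                /* Subtract n*ln2_hi... */
    r -= n * 0x1.7f7d1cp-20f;                      /* ...then n*ln2_lo. */
    float poly                                     /* c0..c4 from above. */
        = r * (0x1.ffffecp-1f
               + r * (0x1.fffdb6p-2f
                      + r * (0x1.555e66p-3f
                             + r * (0x1.573e2ep-5f + r * 0x1.0e4020p-7f))));
    return ldexpf (1.0f + poly, (int) n);
  }
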
diff --git a/pl/math/sv_expm1f_inline.h b/math/aarch64/sve/sv_expm1f_inline.h
index a6e2050ff4a6..35892f519690 100644
--- a/pl/math/sv_expm1f_inline.h
+++ b/math/aarch64/sve/sv_expm1f_inline.h
@@ -2,12 +2,12 @@
* SVE helper for single-precision routines which calculate exp(x) - 1 and do
* not need special-case handling
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifndef PL_MATH_SV_EXPM1F_INLINE_H
-#define PL_MATH_SV_EXPM1F_INLINE_H
+#ifndef MATH_SV_EXPM1F_INLINE_H
+#define MATH_SV_EXPM1F_INLINE_H
#include "sv_math.h"
@@ -16,21 +16,18 @@ struct sv_expm1f_data
/* These 4 are grouped together so they can be loaded as one quadword, then
used with _lane forms of svmla/svmls. */
float32_t c2, c4, ln2_hi, ln2_lo;
- float32_t c0, c1, c3, inv_ln2, shift;
+ float c0, inv_ln2, c1, c3, special_bound;
};
/* Coefficients generated using fpminimax. */
#define SV_EXPM1F_DATA \
{ \
- .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \
- .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \
+ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \
\
- .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, \
+ .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \
}
-#define C(i) sv_f32 (d->c##i)
-
static inline svfloat32_t
expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
{
@@ -44,9 +41,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
- j = svsub_x (pg, j, d->shift);
- svint32_t i = svcvt_s32_x (pg, j);
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
+ j = svrinta_x (pg, j);
svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
f = svmls_lane (f, j, lane_constants, 3);
@@ -56,18 +52,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
- svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
svfloat32_t p = svmla_x (pg, p12, f2, p34);
- p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
p = svmla_x (pg, f, f2, p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}
-#endif // PL_MATH_SV_EXPM1F_INLINE_H
\ No newline at end of file
+#endif // MATH_SV_EXPM1F_INLINE_H
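
Read as scalar code, the rewritten reduction does the following (a sketch; it assumes round-to-nearest, and unlike it the SVE version keeps j in floating point until the final scale):

  #include <math.h>

  /* Sketch only: expm1(x) = 2^j * (expm1(f) + 1) - 1, f = x - j*ln2. */
  static float
  expm1f_sketch (float x)
  {
    float j = rintf (x * 0x1.715476p+0f);   /* j = rint(x/ln2). */
    float f = x - j * 0x1.62e4p-1f;         /* Subtract j*ln2_hi... */
    f -= j * 0x1.7f7d1cp-20f;               /* ...then j*ln2_lo. */
    float p = 0x1.fffffep-2f                /* P(f) with c0..c4 above. */
              + f * (0x1.5554aep-3f
                     + f * (0x1.555736p-5f
                            + f * (0x1.12287cp-7f + f * 0x1.6b55a2p-10f)));
    p = f + f * f * p;                      /* expm1(f) ~= f + f^2*P(f). */
    float t = ldexpf (1.0f, (int) j);       /* t = 2^j, exact. */
    return (t - 1.0f) + p * t;              /* 2^j*(p + 1) - 1. */
  }
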
diff --git a/pl/math/sv_log1p_inline.h b/math/aarch64/sve/sv_log1p_inline.h
index 983f8e1b0413..86a5bb1456f6 100644
--- a/pl/math/sv_log1p_inline.h
+++ b/math/aarch64/sve/sv_log1p_inline.h
@@ -2,14 +2,14 @@
* Helper for SVE double-precision routines which calculate log(1 + x) and do
* not need special-case handling
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifndef PL_MATH_SV_LOG1P_INLINE_H
-#define PL_MATH_SV_LOG1P_INLINE_H
+#ifndef MATH_SV_LOG1P_INLINE_H
+#define MATH_SV_LOG1P_INLINE_H
#include "sv_math.h"
-#include "poly_sve_f64.h"
+#include "sv_poly_f64.h"
static const struct sv_log1p_data
{
@@ -67,8 +67,8 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
svfloat64_t cm;
#ifndef WANT_SV_LOG1P_K0_SHORTCUT
-#error \
- "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+# error \
+ "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_SV_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
that the approximation is solely the polynomial. */
@@ -93,4 +93,4 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
}
-#endif // PL_MATH_SV_LOG1P_INLINE_H
+#endif // MATH_SV_LOG1P_INLINE_H
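
The #error above forces each consumer to choose the shortcut behaviour explicitly; a sketch of the expected include pattern (the consumer file is hypothetical):

  /* Opt out of the k == 0 shortcut, then pull in the helper. */
  #define WANT_SV_LOG1P_K0_SHORTCUT 0
  #include "sv_log1p_inline.h"
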
diff --git a/math/aarch64/sve/sv_log1pf_inline.h b/math/aarch64/sve/sv_log1pf_inline.h
new file mode 100644
index 000000000000..238079c61a5b
--- /dev/null
+++ b/math/aarch64/sve/sv_log1pf_inline.h
@@ -0,0 +1,83 @@
+/*
+ * Helper for SVE routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_SV_LOG1PF_INLINE_H
+#define MATH_SV_LOG1PF_INLINE_H
+
+#define SignExponentMask 0xff800000
+
+static const struct sv_log1pf_data
+{
+ float c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+ float ln2, exp_bias, quarter;
+ uint32_t four, three_quarters;
+} sv_log1pf_data = {
+ /* Do not store first term of polynomial, which is -0.5, as
+ this can be fmov-ed directly instead of including it in
+ the main load-and-mla polynomial schedule. */
+ .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f,
+ .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f,
+ .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f,
+ .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000,
+ .three_quarters = 0x3f400000,
+};
+
+static inline svfloat32_t
+sv_log1pf_inline (svfloat32_t x, svbool_t pg)
+{
+ const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*2^-k to ensure the scale is representable
+ as a normalised fp32 number. */
+ svfloat32_t m = svadd_x (pg, x, 1);
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ svint32_t k
+ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
+ sv_s32 (SignExponentMask));
+
+ /* Scale x by exponent manipulation. */
+ svfloat32_t m_scale = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
+
+ /* Scale up to ensure that the scale factor is representable as a
+ normalised fp32 number, and scale m down accordingly. */
+ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
+ svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2);
+ m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2));
+
+ /* Evaluate polynomial on reduced interval. */
+ svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale);
+
+ svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1);
+ svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0);
+ svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1);
+ svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2);
+ svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3);
+
+ svfloat32_t p = svmla_x (pg, p45, p67, ms2);
+ p = svmla_x (pg, p23, p, ms2);
+ p = svmla_x (pg, p01, p, ms2);
+
+ p = svmad_x (pg, m_scale, p, -0.5);
+ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1);
+ return svmla_lane_f32 (p, scale_back, fconst, 0);
+}
+
+#endif // MATH_SV_LOG1PF_INLINE_H
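
The exponent manipulation is easier to follow in scalar form. A sketch, assuming x + 1 is normal and two's-complement bit reinterpretation; it mirrors the vector operations one-for-one, with e denoting the exponent encoded in k:

  #include <stdint.h>
  #include <string.h>

  /* Sketch only: returns the polynomial input m_scale = (x + 1)*2^-e - 1
     and the k*ln2 term via *k_ln2. */
  static float
  log1pf_reduce_sketch (float x, float *k_ln2)
  {
    float m = x + 1.0f;
    uint32_t mbits, xbits;
    memcpy (&mbits, &m, 4);
    memcpy (&xbits, &x, 4);
    /* Sign/exponent delta from 0.75, so (x + 1)*2^-e is in [0.75, 1.5). */
    uint32_t k = (mbits - 0x3f400000) & 0xff800000;
    /* Scale x's bits, not m's, so low bits rounded away in x + 1 survive. */
    uint32_t ms_bits = xbits - k;
    uint32_t s_bits = 0x40800000 - k;   /* s = 4*2^-e, still normal. */
    float m_scale, s;
    memcpy (&m_scale, &ms_bits, 4);
    memcpy (&s, &s_bits, 4);
    m_scale += -1.0f + s * 0x1p-2f;     /* m_scale = (x + 1)*2^-e - 1. */
    /* Recover e by scaling k's bit pattern down by 2^23. */
    *k_ln2 = (float) (int32_t) k * 0x1p-23f * 0x1.62e43p-1f;
    return m_scale;
  }
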
diff --git a/math/aarch64/sve/sv_log_inline.h b/math/aarch64/sve/sv_log_inline.h
new file mode 100644
index 000000000000..a1b169a0b727
--- /dev/null
+++ b/math/aarch64/sve/sv_log_inline.h
@@ -0,0 +1,83 @@
+/*
+ * Double-precision vector log(x) function - inline version
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "math_config.h"
+
+#ifndef SV_LOG_INLINE_POLY_ORDER
+# error Cannot use inline log helper without specifying poly order (options are 4 or 5)
+#endif
+
+#if SV_LOG_INLINE_POLY_ORDER == 4
+# define POLY \
+ { \
+ -0x1.ffffffffcbad3p-2, 0x1.555555578ed68p-2, -0x1.0000d3a1e7055p-2, \
+ 0x1.999392d02a63ep-3 \
+ }
+#elif SV_LOG_INLINE_POLY_ORDER == 5
+# define POLY \
+ { \
+ -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, \
+ 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 \
+ }
+#else
+# error Can only choose order 4 or 5 for log poly
+#endif
+
+struct sv_log_inline_data
+{
+ double poly[SV_LOG_INLINE_POLY_ORDER];
+ double ln2;
+ uint64_t off, sign_exp_mask;
+};
+
+#define SV_LOG_CONSTANTS \
+ { \
+ .poly = POLY, .ln2 = 0x1.62e42fefa39efp-1, \
+ .sign_exp_mask = 0xfff0000000000000, .off = 0x3fe6900900000000 \
+ }
+
+#define P(i) sv_f64 (d->poly[i])
+#define N (1 << V_LOG_TABLE_BITS)
+
+static inline svfloat64_t
+sv_log_inline (svbool_t pg, svfloat64_t x, const struct sv_log_inline_data *d)
+{
+ svuint64_t ix = svreinterpret_u64 (x);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
+ /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
+ The actual value of i is double this due to table layout. */
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
+ svint64_t k
+ = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svfloat64_t z = svreinterpret_f64 (iz);
+
+ /* Lookup in 2 global lists (length N). */
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ svfloat64_t r = svmad_x (pg, invc, z, -1);
+ svfloat64_t kd = svcvt_f64_x (pg, k);
+ /* hi = r + log(c) + k*Ln2. */
+ svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2);
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t y = svmla_x (pg, P (2), r, P (3));
+ svfloat64_t p = svmla_x (pg, P (0), r, P (1));
+#if SV_LOG_INLINE_POLY_ORDER == 5
+ y = svmla_x (pg, y, r2, P (4));
+#endif
+ y = svmla_x (pg, p, r2, y);
+ return svmla_x (pg, hi, r2, y);
+}
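
Spelled out for one lane (a sketch; it reuses the asuint64/asdouble helpers and the __v_log_data table from math_config.h, and elides the r2*poly(r) tail that the routine above adds on top):

  #include <stdint.h>
  #include "math_config.h"

  /* Sketch of the lookup scheme. The vector code doubles the index
     because it gathers from the interleaved (invc, logc) layout. */
  static double
  log_lookup_sketch (double x)
  {
    uint64_t ix = asuint64 (x);
    uint64_t tmp = ix - 0x3fe6900900000000;   /* Off. */
    uint64_t i = (tmp >> (52 - V_LOG_TABLE_BITS))
                 & ((1 << V_LOG_TABLE_BITS) - 1);
    int64_t k = (int64_t) tmp >> 52;          /* Exponent relative to Off. */
    double z = asdouble (ix - (tmp & (0xfffULL << 52)));
    double r = z * __v_log_data.table[i].invc - 1.0;  /* |r| is small. */
    /* First-order result; the real routine adds r2*poly(r). */
    return __v_log_data.table[i].logc + (double) k * 0x1.62e42fefa39efp-1 + r;
  }
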
diff --git a/pl/math/sv_math.h b/math/aarch64/sve/sv_math.h
index f67fe91803ba..db688a893032 100644
--- a/pl/math/sv_math.h
+++ b/math/aarch64/sve/sv_math.h
@@ -1,24 +1,38 @@
/*
* Wrapper functions for SVE ACLE.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef SV_MATH_H
#define SV_MATH_H
-#ifndef WANT_VMATH
-/* Enable the build of vector math code. */
-# define WANT_VMATH 1
+/* Enable SVE in this translation unit. Note, because this is 'pushed' in
+ clang, any file including sv_math.h will have to pop it back off again by
+ ending the source file with CLOSE_SVE_ATTR. It is important that sv_math.h
+ is included first so that all functions have the target attribute. */
+#ifdef __clang__
+# pragma clang attribute push(__attribute__((target("sve"))), \
+ apply_to = any(function))
+# define CLOSE_SVE_ATTR _Pragma("clang attribute pop")
+#else
+# pragma GCC target("+sve")
+# define CLOSE_SVE_ATTR
#endif
-#if WANT_VMATH
+#include <arm_sve.h>
+#include <stdbool.h>
-# include <arm_sve.h>
-# include <stdbool.h>
+#include "math_config.h"
-# include "math_config.h"
+#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f
+#define SV_NAME_D1(fun) _ZGVsMxv_##fun
+#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f
+#define SV_NAME_D2(fun) _ZGVsMxvv_##fun
+#define SV_NAME_F1_L1(fun) _ZGVsMxvl4_##fun##f
+#define SV_NAME_D1_L1(fun) _ZGVsMxvl8_##fun
+#define SV_NAME_F1_L2(fun) _ZGVsMxvl4l4_##fun##f
/* Double precision. */
static inline svint64_t
@@ -129,5 +143,3 @@ sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2,
return y;
}
#endif
-
-#endif
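
A minimal sketch of the convention this header now imposes (the routine here is hypothetical; every real SVE source in this commit follows the same shape):

  /* hypothetical.c: sv_math.h first, CLOSE_SVE_ATTR last. */
  #include "sv_math.h"   /* Pushes the SVE target attribute. */
  #include "test_defs.h"

  svfloat64_t SV_NAME_D1 (negate) (svfloat64_t x, svbool_t pg)
  {
    return svneg_x (pg, x);   /* Placeholder body. */
  }
  CLOSE_SVE_ATTR /* Pops the clang attribute; expands to nothing for GCC. */
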
diff --git a/pl/math/poly_sve_f32.h b/math/aarch64/sve/sv_poly_f32.h
index a97e2ced027a..2d73014a4b45 100644
--- a/pl/math/poly_sve_f32.h
+++ b/math/aarch64/sve/sv_poly_f32.h
@@ -2,12 +2,12 @@
* Helpers for evaluating polynomials on single-precision SVE input, using
* various schemes.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifndef PL_MATH_POLY_SVE_F32_H
-#define PL_MATH_POLY_SVE_F32_H
+#ifndef MATH_POLY_SVE_F32_H
+#define MATH_POLY_SVE_F32_H
#include <arm_sve.h>
@@ -17,7 +17,7 @@
#define STYPE float
#define VWRAP(f) sv_##f##_f32_x
#define DUP svdup_f32
-#include "poly_sve_generic.h"
+#include "sv_poly_generic.h"
#undef DUP
#undef VWRAP
#undef STYPE
diff --git a/pl/math/poly_sve_f64.h b/math/aarch64/sve/sv_poly_f64.h
index 5fb14b3c1700..f92be9bf8e9c 100644
--- a/pl/math/poly_sve_f64.h
+++ b/math/aarch64/sve/sv_poly_f64.h
@@ -2,12 +2,12 @@
* Helpers for evaluating polynomials on double-precision SVE input, using
* various schemes.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifndef PL_MATH_POLY_SVE_F64_H
-#define PL_MATH_POLY_SVE_F64_H
+#ifndef MATH_POLY_SVE_F64_H
+#define MATH_POLY_SVE_F64_H
#include <arm_sve.h>
@@ -17,7 +17,7 @@
#define STYPE double
#define VWRAP(f) sv_##f##_f64_x
#define DUP svdup_f64
-#include "poly_sve_generic.h"
+#include "sv_poly_generic.h"
#undef DUP
#undef VWRAP
#undef STYPE
diff --git a/pl/math/poly_sve_generic.h b/math/aarch64/sve/sv_poly_generic.h
index b568e4cddff3..a1fc59baa8d3 100644
--- a/pl/math/poly_sve_generic.h
+++ b/math/aarch64/sve/sv_poly_generic.h
@@ -2,7 +2,7 @@
* Helpers for evaluating polynomials with various schemes - specific to SVE
* but precision-agnostic.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -299,3 +299,33 @@ static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2,
VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
return svmla_x (pg, p01, x2, p2_18);
}
+
+static inline VTYPE VWRAP (lw_pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly_even,
+ const STYPE *poly_odd)
+{
+ VTYPE c13 = svld1rq (pg, poly_odd);
+
+ VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0);
+ VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1);
+ VTYPE p45 = svmla_x (pg, DUP (poly_even[2]), x, poly_odd[2]);
+
+ VTYPE p;
+ p = svmla_x (pg, p23, x2, p45);
+ p = svmla_x (pg, p01, x2, p);
+ return p;
+}
+static inline VTYPE VWRAP (lw_pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly_even,
+ const STYPE *poly_odd)
+{
+ VTYPE c13 = svld1rq (pg, poly_odd);
+
+ VTYPE p49 = VWRAP (lw_pw_horner_5) (pg, x, x2, poly_even + 2, poly_odd + 2);
+ VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1);
+
+ VTYPE p29 = svmla_x (pg, p23, x2, p49);
+ VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0);
+
+ return svmla_x (pg, p01, x2, p29);
+}
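
The new lw_pw_horner helpers take the coefficients split into even- and odd-index arrays, so the odd half can be loaded once with svld1rq and consumed by lane-indexed MLAs. A scalar view of the degree-5 case (sketch only):

  /* Evaluates c0 + c1*x + c2*x^2 + c3*x^3 + c4*x^4 + c5*x^5, with
     even-index coefficients in poly_even[] and odd ones in poly_odd[]. */
  static double
  lw_pw_horner_5_sketch (double x, double x2, const double *poly_even,
                         const double *poly_odd)
  {
    double p01 = poly_even[0] + x * poly_odd[0];
    double p23 = poly_even[1] + x * poly_odd[1];
    double p45 = poly_even[2] + x * poly_odd[2];
    return p01 + x2 * (p23 + x2 * p45);
  }
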
diff --git a/pl/math/sv_sincos_common.h b/math/aarch64/sve/sv_sincos_common.h
index f7b58deb90bd..2a537da157b0 100644
--- a/pl/math/sv_sincos_common.h
+++ b/math/aarch64/sve/sv_sincos_common.h
@@ -1,12 +1,12 @@
/*
* Core approximation for double-precision vector sincos
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f64.h"
+#include "sv_poly_f64.h"
static const struct sv_sincos_data
{
diff --git a/pl/math/sv_sincosf_common.h b/math/aarch64/sve/sv_sincosf_common.h
index 714e996443b3..bda89ed24680 100644
--- a/pl/math/sv_sincosf_common.h
+++ b/math/aarch64/sve/sv_sincosf_common.h
@@ -1,7 +1,7 @@
/*
* Core approximation for single-precision vector sincos
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/math/aarch64/sve/sv_sincospi_common.h b/math/aarch64/sve/sv_sincospi_common.h
new file mode 100644
index 000000000000..672ebbc8e855
--- /dev/null
+++ b/math/aarch64/sve/sv_sincospi_common.h
@@ -0,0 +1,76 @@
+/*
+ * Core approximation for double-precision SVE sincospi
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "sv_poly_f64.h"
+
+static const struct sv_sincospi_data
+{
+ double c0, c2, c4, c6, c8;
+ double c1, c3, c5, c7, c9;
+ double range_val;
+} sv_sincospi_data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .c0 = 0x1.921fb54442d184p1,
+ .c1 = -0x1.4abbce625be53p2,
+ .c2 = 0x1.466bc6775ab16p1,
+ .c3 = -0x1.32d2cce62dc33p-1,
+ .c4 = 0x1.507834891188ep-4,
+ .c5 = -0x1.e30750a28c88ep-8,
+ .c6 = 0x1.e8f48308acda4p-12,
+ .c7 = -0x1.6fc0032b3c29fp-16,
+ .c8 = 0x1.af86ae521260bp-21,
+ .c9 = -0x1.012a9870eeb7dp-25,
+ /* Exclusive upper bound for a signed integer. */
+ .range_val = 0x1p63
+};
+
+/* Double-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.09 ULP:
+ _ZGVsMxvl8l8_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+ want 0x1.fd54d0b327cf4p-1.
+ Worst-case error for cos is 3.16 ULP:
+ _ZGVsMxvl8l8_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+ want 0x1.fd2da484ff402p-1.
+ */
+static inline svfloat64x2_t
+sv_sincospi_inline (svbool_t pg, svfloat64_t x,
+ const struct sv_sincospi_data *d)
+{
+ const svbool_t pt = svptrue_b64 ();
+
+ /* r = x - rint(x). */
+ /* pt hints unpredicated instruction. */
+ svfloat64_t rx = svrinta_x (pg, x);
+ svfloat64_t sr = svsub_x (pt, x, rx);
+
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ svfloat64_t cr = svsubr_x (pg, svabs_x (pg, sr), 0.5);
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ /* pt hints unpredicated instruction. */
+ svfloat64_t sr2 = svmul_x (pt, sr, sr);
+ svfloat64_t cr2 = svmul_x (pt, cr, cr);
+ svfloat64_t sr4 = svmul_x (pt, sr2, sr2);
+ svfloat64_t cr4 = svmul_x (pt, cr2, cr2);
+
+ /* If rint(x) is odd, the sign of the result should be inverted for sinpi and
+ re-introduced for cospi. cmp masks out lanes whose rx saturates to the
+ largest signed integer. */
+ svbool_t cmp = svaclt (pg, x, d->range_val);
+ svuint64_t odd = svlsl_x (pt, svreinterpret_u64 (svcvt_s64_z (pg, rx)), 63);
+ sr = svreinterpret_f64 (sveor_x (pt, svreinterpret_u64 (sr), odd));
+ cr = svreinterpret_f64 (sveor_m (cmp, svreinterpret_u64 (cr), odd));
+
+ svfloat64_t sinpix = svmul_x (
+ pt, sv_lw_pw_horner_9_f64_x (pg, sr2, sr4, &(d->c0), &(d->c1)), sr);
+ svfloat64_t cospix = svmul_x (
+ pt, sv_lw_pw_horner_9_f64_x (pg, cr2, cr4, &(d->c0), &(d->c1)), cr);
+
+ return svcreate2 (sinpix, cospix);
+}
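
The same reduction in scalar form (a sketch for finite |x| < 2^63; the vector code handles the saturating lanes with the cmp predicate, and evaluates the odd polynomial c0..c9 instead of calling sin):

  #include <math.h>

  static void
  sincospi_sketch (double x, double *out_sin, double *out_cos)
  {
    double rx = rint (x);
    double sr = x - rx;              /* sr in [-0.5, 0.5]. */
    double cr = 0.5 - fabs (sr);     /* cospi(x) = sinpi(0.5 - |sr|). */
    if ((long long) rx & 1)
      {
        sr = -sr;                    /* Odd rint(x): flip sinpi's sign... */
        cr = -cr;                    /* ...and reintroduce it for cospi. */
      }
    *out_sin = sin (M_PI * sr);
    *out_cos = sin (M_PI * cr);
  }
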
diff --git a/math/aarch64/sve/sv_sincospif_common.h b/math/aarch64/sve/sv_sincospif_common.h
new file mode 100644
index 000000000000..4b9101de74ed
--- /dev/null
+++ b/math/aarch64/sve/sv_sincospif_common.h
@@ -0,0 +1,82 @@
+/*
+ * Helper for single-precision SVE sincospi
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "sv_poly_f32.h"
+
+const static struct sv_sincospif_data
+{
+ float c0, c2, c4;
+ float c1, c3, c5;
+ float range_val;
+} sv_sincospif_data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .c0 = 0x1.921fb6p1f,
+ .c1 = -0x1.4abbcep2f,
+ .c2 = 0x1.466bc6p1f,
+ .c3 = -0x1.32d2ccp-1f,
+ .c4 = 0x1.50783p-4f,
+ .c5 = -0x1.e30750p-8f,
+ /* Exclusive upper bound for a signed integer. */
+ .range_val = 0x1p31f,
+};
+
+/* Single-precision vector function allowing calculation of both sinpi and
+ cospi in one function call, using shared argument reduction and polynomials.
+ Worst-case error for sin is 3.04 ULP:
+ _ZGVsMxvl4l4_sincospif_sin(0x1.b51b8p-2) got 0x1.f28b5ep-1 want
+ 0x1.f28b58p-1.
+ Worst-case error for cos is 3.18 ULP:
+ _ZGVsMxvl4l4_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want
+ 0x1.f7cd5p-1. */
+static inline svfloat32x2_t
+sv_sincospif_inline (svbool_t pg, svfloat32_t x,
+ const struct sv_sincospif_data *d)
+{
+ const svbool_t pt = svptrue_b32 ();
+
+ /* r = x - rint(x). */
+ svfloat32_t rx = svrinta_x (pg, x);
+ svfloat32_t sr = svsub_x (pt, x, rx);
+
+ /* cospi(x) = sinpi(0.5 - abs(r)) for values -1/2 .. 1/2. */
+ svfloat32_t cr = svsubr_x (pt, svabs_x (pg, sr), 0.5f);
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ svfloat32_t sr2 = svmul_x (pt, sr, sr);
+ svfloat32_t sr4 = svmul_x (pt, sr2, sr2);
+ svfloat32_t cr2 = svmul_x (pt, cr, cr);
+ svfloat32_t cr4 = svmul_x (pt, cr2, cr2);
+
+ /* If rint(x) is odd, the sign of the result should be inverted for sinpi and
+ re-introduced for cospi. cmp masks out lanes whose rx saturates to the
+ largest signed integer. */
+ svbool_t cmp = svaclt (pg, x, d->range_val);
+ svuint32_t odd = svlsl_x (pt, svreinterpret_u32 (svcvt_s32_z (pg, rx)), 31);
+ sr = svreinterpret_f32 (sveor_x (pt, svreinterpret_u32 (sr), odd));
+ cr = svreinterpret_f32 (sveor_m (cmp, svreinterpret_u32 (cr), odd));
+
+ svfloat32_t c135 = svld1rq_f32 (svptrue_b32 (), &d->c1);
+
+ svfloat32_t sp01 = svmla_lane (sv_f32 (d->c0), sr2, c135, 0);
+ svfloat32_t sp23 = svmla_lane (sv_f32 (d->c2), sr2, c135, 1);
+ svfloat32_t sp45 = svmla_lane (sv_f32 (d->c4), sr2, c135, 2);
+
+ svfloat32_t cp01 = svmla_lane (sv_f32 (d->c0), cr2, c135, 0);
+ svfloat32_t cp23 = svmla_lane (sv_f32 (d->c2), cr2, c135, 1);
+ svfloat32_t cp45 = svmla_lane (sv_f32 (d->c4), cr2, c135, 2);
+
+ svfloat32_t sp = svmla_x (pg, sp23, sr4, sp45);
+ svfloat32_t cp = svmla_x (pg, cp23, cr4, cp45);
+
+ sp = svmla_x (pg, sp01, sr4, sp);
+ cp = svmla_x (pg, cp01, cr4, cp);
+
+ svfloat32_t sinpix = svmul_x (pt, sp, sr);
+ svfloat32_t cospix = svmul_x (pt, cp, cr);
+
+ return svcreate2 (sinpix, cospix);
+}
diff --git a/math/aarch64/sve/tan.c b/math/aarch64/sve/tan.c
new file mode 100644
index 000000000000..1dfc5c422d5e
--- /dev/null
+++ b/math/aarch64/sve/tan.c
@@ -0,0 +1,131 @@
+/*
+ * Double-precision SVE tan(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ double c2, c4, c6, c8;
+ double poly_1357[4];
+ double c0, inv_half_pi;
+ double half_pi_hi, half_pi_lo, range_val;
+} data = {
+ /* Polynomial generated with FPMinimax. */
+ .c2 = 0x1.ba1ba1bb46414p-5,
+ .c4 = 0x1.226e5e5ecdfa3p-7,
+ .c6 = 0x1.7ea75d05b583ep-10,
+ .c8 = 0x1.4e4fd14147622p-12,
+ .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6,
+ 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 },
+ .c0 = 0x1.5555555555556p-2,
+ .inv_half_pi = 0x1.45f306dc9c883p-1,
+ .half_pi_hi = 0x1.921fb54442d18p0,
+ .half_pi_lo = 0x1.1a62633145c07p-54,
+ .range_val = 0x1p23,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg,
+ svbool_t special)
+{
+ svbool_t use_recip = svcmpeq (
+ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
+
+ svfloat64_t n = svmad_x (pg, p, p, -1);
+ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
+ svfloat64_t swap = n;
+ n = svneg_m (n, use_recip, d);
+ d = svsel (use_recip, swap, d);
+ svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d);
+ return sv_call_f64 (tan, x, y, special);
+}
+
+/* Vector approximation for double-precision tan.
+ Maximum measured error is 3.48 ULP:
+ _ZGVsMxv_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+ want -0x1.f6ccd8ecf7deap+37. */
+svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *dat = ptr_barrier (&data);
+ svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0);
+ /* q = nearest integer to 2 * x / pi. */
+ svfloat64_t q = svmul_lane (x, half_pi_c0, 1);
+ q = svrinta_x (pg, q);
+
+ /* Use q to reduce x to r in [-pi/4, pi/4], by:
+ r = x - q * pi/2, in extended precision. */
+ svfloat64_t r = x;
+ svfloat64_t half_pi = svld1rq (svptrue_b64 (), &dat->half_pi_hi);
+ r = svmls_lane (r, q, half_pi, 0);
+ r = svmls_lane (r, q, half_pi, 1);
+ /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
+ formula. */
+ r = svmul_x (svptrue_b64 (), r, 0.5);
+
+ /* Approximate tan(r) using order 8 polynomial.
+ tan(x) is odd, so the polynomial has the form:
+ tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+ Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+ Then compute the approximation by:
+ tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
+
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2);
+ svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4);
+ /* The coefficient array is offset by one so that evaluation starts from
+ C1. */
+ svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2);
+ svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6);
+
+ svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0);
+ svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1);
+ svfloat64_t p03 = svmla_x (pg, p01, p23, r4);
+
+ svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1);
+ svfloat64_t p47 = svmla_x (pg, p45, p67, r4);
+
+ svfloat64_t p = svmla_x (pg, p03, p47, r8);
+
+ svfloat64_t z = svmul_x (svptrue_b64 (), p, r);
+ z = svmul_x (svptrue_b64 (), r2, z);
+ z = svmla_lane (z, r, half_pi_c0, 0);
+ p = svmla_x (pg, r, r2, z);
+
+ /* Recombination uses double-angle formula:
+ tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
+ and reciprocity around pi/2:
+ tan(x) = 1 / (tan(pi/2 - x))
+ to assemble result using change-of-sign and conditional selection of
+ numerator/denominator dependent on odd/even-ness of q (quadrant). */
+
+ /* Invert condition to catch NaNs and Infs as well as large values. */
+ svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
+
+ if (unlikely (svptest_any (pg, special)))
+ {
+ return special_case (x, p, q, pg, special);
+ }
+ svbool_t use_recip = svcmpeq (
+ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
+
+ svfloat64_t n = svmad_x (pg, p, p, -1);
+ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
+ svfloat64_t swap = n;
+ n = svneg_m (n, use_recip, d);
+ d = svsel (use_recip, swap, d);
+ return svdiv_x (pg, n, d);
+}
+
+TEST_SIG (SV, D, 1, tan, -3.1, 3.1)
+TEST_ULP (SV_NAME_D1 (tan), 2.99)
+TEST_DISABLE_FENV (SV_NAME_D1 (tan))
+TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0x1p23, inf, 5000)
+CLOSE_SVE_ATTR
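
The selection at the tail of the routine compresses the two identities in the comment; as a scalar sketch, with t standing for the polynomial value p ~= tan(r) and q = rint(2x/pi):

  /* Sketch only: reconstruct tan(x) from tan(r), r the halved
     reduced argument. */
  static double
  tan_recombine_sketch (double t, double q)
  {
    double n = t * t - 1.0;    /* tan(r)^2 - 1. */
    double d = 2.0 * t;        /* 2*tan(r). */
    if (((long long) q & 1) == 0)
      return -d / n;           /* Even q: tan(2r) = 2t / (1 - t^2). */
    return n / d;              /* Odd q: tan(2r + pi/2) = -1/tan(2r). */
  }
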
diff --git a/pl/math/sv_tanf_3u5.c b/math/aarch64/sve/tanf.c
index 6b8cd1e64b44..d34fc2fc1a4e 100644
--- a/pl/math/sv_tanf_3u5.c
+++ b/math/aarch64/sve/tanf.c
@@ -1,13 +1,13 @@
/*
* Single-precision vector tan(x) function.
*
- * Copyright (c) 2020-2023, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -50,21 +50,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- /* Determine whether input is too large to perform fast regression. */
- svbool_t cmp = svacge (pg, x, d->range_val);
-
svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1);
svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1);
/* n = rint(x/(pi/2)). */
- svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3);
- svfloat32_t n = svsub_x (pg, q, d->shift);
+ svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3));
/* n is already a signed integer, simply convert it. */
svint32_t in = svcvt_s32_x (pg, n);
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
svint32_t alt = svand_x (pg, in, 1);
svbool_t pred_alt = svcmpne (pg, alt, 0);
-
/* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */
svfloat32_t r;
r = svmls_lane (x, n, pi_vals, 0);
@@ -83,7 +78,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
/* Evaluate polynomial approximation of tangent on [-pi/4, pi/4],
using Estrin on z^2. */
- svfloat32_t z2 = svmul_x (pg, z, z);
+ svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0);
svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1);
svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2);
@@ -96,24 +91,27 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2));
- /* Transform result back, if necessary. */
- svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
-
/* No need to pass pg to specialcase here since cmp is a strict subset,
guaranteed by the cmpge above. */
+
+ /* Determine whether input is too large to perform fast regression. */
+ svbool_t cmp = svacge (pg, x, d->range_val);
if (unlikely (svptest_any (pg, cmp)))
- return special_case (x, svsel (pred_alt, inv_y, y), cmp);
+ return special_case (x, svdivr_x (pg, y, 1.0f), cmp);
+ svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
return svsel (pred_alt, inv_y, y);
}
-PL_SIG (SV, F, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (SV_NAME_F1 (tan), 2.96)
-PL_TEST_INTERVAL (SV_NAME_F1 (tan), -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-23, 0.7, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0.7, 1.5, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (tan), 1.5, 100, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (tan), 100, 0x1p17, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p17, inf, 50000)
+TEST_SIG (SV, F, 1, tan, -3.1, 3.1)
+TEST_ULP (SV_NAME_F1 (tan), 2.96)
+TEST_DISABLE_FENV (SV_NAME_F1 (tan))
+TEST_INTERVAL (SV_NAME_F1 (tan), -0.0, -0x1p126, 100)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-23, 0.7, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0.7, 1.5, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 1.5, 100, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 100, 0x1p17, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p17, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/pl/math/sv_tanh_3u.c b/math/aarch64/sve/tanh.c
index f54139f1ddbc..41f64cb4b2c7 100644
--- a/pl/math/sv_tanh_3u.c
+++ b/math/aarch64/sve/tanh.c
@@ -1,14 +1,14 @@
/*
* Double-precision SVE tanh(x) function.
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "sv_math.h"
-#include "poly_sve_f64.h"
+#include "sv_poly_f64.h"
#include "mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
static const struct data
{
@@ -89,8 +89,10 @@ svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
return svdiv_x (pg, q, qp2);
}
-PL_SIG (SV, D, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_D1 (tanh), 2.27)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0, 0x1p-27, 5000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
+TEST_SIG (SV, D, 1, tanh, -10.0, 10.0)
+TEST_ULP (SV_NAME_D1 (tanh), 2.27)
+TEST_DISABLE_FENV (SV_NAME_D1 (tanh))
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0, 0x1p-27, 5000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/tanhf.c b/math/aarch64/sve/tanhf.c
new file mode 100644
index 000000000000..9007e7badb0d
--- /dev/null
+++ b/math/aarch64/sve/tanhf.c
@@ -0,0 +1,68 @@
+/*
+ * Single-precision SVE tanh(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_expm1f_inline.h"
+
+/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */
+#define BoringBound 0x1.205966p+3f
+
+static const struct data
+{
+ struct sv_expm1f_data expm1f_consts;
+ uint32_t onef, special_bound;
+ float boring_bound;
+} data = {
+ .expm1f_consts = SV_EXPM1F_DATA,
+ .onef = 0x3f800000,
+ .special_bound = 0x7f800000,
+ .boring_bound = BoringBound,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring,
+ svfloat32_t boring, svfloat32_t q, svbool_t special)
+{
+ svfloat32_t y
+ = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0)));
+ return sv_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision SVE tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.57 ULP:
+ _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5
+ want 0x1.fb71aap-5. */
+svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
+ svbool_t special = svcmpgt (pg, iax, d->special_bound);
+ svbool_t is_boring = svacgt (pg, x, d->boring_bound);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg,
+ &d->expm1f_consts);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, pg, is_boring, boring, q, special);
+ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+ return svsel_f32 (is_boring, boring, y);
+}
+
+TEST_SIG (SV, F, 1, tanh, -10.0, 10.0)
+TEST_ULP (SV_NAME_F1 (tanh), 2.07)
+TEST_DISABLE_FENV (SV_NAME_F1 (tanh))
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1p-23, BoringBound, 100000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), BoringBound, inf, 100)
+CLOSE_SVE_ATTR
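
The q + 2 denominator follows directly from the identity in the comment: with q = e^2x - 1, the denominator e^2x + 1 is q + 2. A scalar sketch:

  #include <math.h>

  /* Sketch only: tanh(x) = (e^2x - 1)/(e^2x + 1) = q/(q + 2). */
  static float
  tanhf_sketch (float x)
  {
    float q = expm1f (2.0f * x);
    return q / (q + 2.0f);
  }
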
diff --git a/math/aarch64/sve/tanpi.c b/math/aarch64/sve/tanpi.c
new file mode 100644
index 000000000000..d9e7d2487d53
--- /dev/null
+++ b/math/aarch64/sve/tanpi.c
@@ -0,0 +1,89 @@
+/*
+ * Double-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpi_data
+{
+ double c0, c2, c4, c6, c8, c10, c12;
+ double c1, c3, c5, c7, c9, c11, c13, c14;
+} tanpi_data = {
+ /* Coefficients for tan(pi * x) computed with fpminimax
+ on [ 0x1p-1022 0x1p-2 ]
+ approx rel error: 0x1.7eap-55
+ approx abs error: 0x1.7eap-55. */
+ .c0 = 0x1.921fb54442d18p1, /* pi. */
+ .c1 = 0x1.4abbce625be52p3, .c2 = 0x1.466bc6775b0f9p5,
+ .c3 = 0x1.45fff9b426f5ep7, .c4 = 0x1.45f4730dbca5cp9,
+ .c5 = 0x1.45f3265994f85p11, .c6 = 0x1.45f4234b330cap13,
+ .c7 = 0x1.45dca11be79ebp15, .c8 = 0x1.47283fc5eea69p17,
+ .c9 = 0x1.3a6d958cdefaep19, .c10 = 0x1.927896baee627p21,
+ .c11 = -0x1.89333f6acd922p19, .c12 = 0x1.5d4e912bb8456p27,
+ .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32,
+};
+
+/* Approximation for double-precision vector tanpi(x)
+ The maximum error is 3.06 ULP:
+ _ZGVsMxv_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
+ want -0x1.fa30112702c95p+3. */
+svfloat64_t SV_NAME_D1 (tanpi) (svfloat64_t x, const svbool_t pg)
+{
+ const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
+
+ svfloat64_t n = svrintn_x (pg, x);
+
+ /* inf produces nan that propagates. */
+ svfloat64_t xr = svsub_x (pg, x, n);
+ svfloat64_t ar = svabd_x (pg, x, n);
+ svbool_t flip = svcmpgt (pg, ar, 0.25);
+ svfloat64_t r = svsel (flip, svsubr_x (pg, ar, 0.5), ar);
+
+ /* Order-14 pairwise Horner. */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t r4 = svmul_x (pg, r2, r2);
+
+ svfloat64_t c_1_3 = svld1rq (pg, &d->c1);
+ svfloat64_t c_5_7 = svld1rq (pg, &d->c5);
+ svfloat64_t c_9_11 = svld1rq (pg, &d->c9);
+ svfloat64_t c_13_14 = svld1rq (pg, &d->c13);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r2, c_1_3, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r2, c_1_3, 1);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r2, c_5_7, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), r2, c_5_7, 1);
+ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), r2, c_9_11, 0);
+ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), r2, c_9_11, 1);
+ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), r2, c_13_14, 0);
+
+ svfloat64_t p = svmla_lane (p1213, r4, c_13_14, 1);
+ p = svmad_x (pg, p, r4, p1011);
+ p = svmad_x (pg, p, r4, p89);
+ p = svmad_x (pg, p, r4, p67);
+ p = svmad_x (pg, p, r4, p45);
+ p = svmad_x (pg, p, r4, p23);
+ p = svmad_x (pg, p, r4, p01);
+ p = svmul_x (pg, r, p);
+
+ svfloat64_t p_recip = svdivr_x (pg, p, 1.0);
+ svfloat64_t y = svsel (flip, p_recip, p);
+
+ svuint64_t sign
+ = sveor_x (pg, svreinterpret_u64 (xr), svreinterpret_u64 (ar));
+ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (SV_NAME_D1 (tanpi))
+TEST_ULP (SV_NAME_D1 (tanpi), 2.57)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0.5, 1.0, 200000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0x1p23, inf, 50000)
+#endif
+CLOSE_SVE_ATTR
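
The reduction and flip are compact in vector form; the same logic as a scalar sketch (finite x; copysign stands in for the bitwise eor/orr sign handling, and tan stands in for the c0..c14 polynomial):

  #include <math.h>

  static double
  tanpi_sketch (double x)
  {
    double n = rint (x);
    double xr = x - n;               /* xr in [-0.5, 0.5]. */
    double ar = fabs (xr);
    int flip = ar > 0.25;
    double r = flip ? 0.5 - ar : ar; /* Reduce to [0, 0.25]. */
    double p = tan (M_PI * r);
    double y = flip ? 1.0 / p : p;   /* Cotangent branch past 1/4. */
    return copysign (y, xr);         /* Sign of the reduced argument. */
  }
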
diff --git a/math/aarch64/sve/tanpif.c b/math/aarch64/sve/tanpif.c
new file mode 100644
index 000000000000..2ba968a799fe
--- /dev/null
+++ b/math/aarch64/sve/tanpif.c
@@ -0,0 +1,68 @@
+/*
+ * Single-precision vector tanpif(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+const static struct v_tanpif_data
+{
+ float c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+} tanpif_data = {
+ /* Coefficients for tan(pi * x). */
+ .c0 = 0x1.921fb4p1f, .c1 = 0x1.4abbcep3f, .c2 = 0x1.466b8p5f,
+ .c3 = 0x1.461c72p7f, .c4 = 0x1.42e9d4p9f, .c5 = 0x1.69e2c4p11f,
+ .c6 = 0x1.e85558p11f, .c7 = 0x1.a52e08p16f,
+};
+
+/* Approximation for single-precision vector tanpif(x)
+ The maximum error is 3.34 ULP:
+ _ZGVsMxv_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
+ want 0x1.f70aa6p+2. */
+svfloat32_t SV_NAME_F1 (tanpi) (svfloat32_t x, const svbool_t pg)
+{
+ const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
+ svfloat32_t odd_coeffs = svld1rq (pg, &d->c1);
+ svfloat32_t n = svrintn_x (pg, x);
+
+ /* inf produces nan that propagates. */
+ svfloat32_t xr = svsub_x (pg, x, n);
+ svfloat32_t ar = svabd_x (pg, x, n);
+ svbool_t flip = svcmpgt (pg, ar, 0.25f);
+ svfloat32_t r = svsel (flip, svsub_x (pg, sv_f32 (0.5f), ar), ar);
+
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r4 = svmul_x (pg, r2, r2);
+
+ /* Order-7 pairwise Horner. */
+ svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), r2, odd_coeffs, 0);
+ svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), r2, odd_coeffs, 1);
+ svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), r2, odd_coeffs, 2);
+ svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), r2, odd_coeffs, 3);
+ svfloat32_t p = svmad_x (pg, p67, r4, p45);
+ p = svmad_x (pg, p, r4, p23);
+ p = svmad_x (pg, p, r4, p01);
+ svfloat32_t poly = svmul_x (pg, r, p);
+
+ svfloat32_t poly_recip = svdiv_x (pg, sv_f32 (1.0), poly);
+ svfloat32_t y = svsel (flip, poly_recip, poly);
+
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (xr), svreinterpret_u32 (ar));
+ return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (SV_NAME_F1 (tanpi))
+TEST_ULP (SV_NAME_F1 (tanpi), 2.84)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
+#endif
+CLOSE_SVE_ATTR
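The pairwise Horner scheme used by both routines evaluates adjacent coefficients against r^2 first and then folds the pairs with r^4, halving the dependency chain. A scalar sketch of the order-7 evaluation, reusing the v_tanpif_data layout declared above (illustrative only):

  /* p(r) = r * (c0 + c1 r^2 + ... + c7 r^14), grouped in pairs.  */
  static float tanpif_poly_sketch (const struct v_tanpif_data *d, float r)
  {
    float r2 = r * r;
    float r4 = r2 * r2;
    float p01 = d->c0 + d->c1 * r2;
    float p23 = d->c2 + d->c3 * r2;
    float p45 = d->c4 + d->c5 * r2;
    float p67 = d->c6 + d->c7 * r2;
    float p = p67 * r4 + p45;   /* Fold the pairs with r4.  */
    p = p * r4 + p23;
    p = p * r4 + p01;
    return r * p;               /* Odd function: r * poly(r^2).  */
  }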
diff --git a/math/aarch64/tanpi_2u5.c b/math/aarch64/tanpi_2u5.c
new file mode 100644
index 000000000000..154b9faf454d
--- /dev/null
+++ b/math/aarch64/tanpi_2u5.c
@@ -0,0 +1,158 @@
+/*
+ * Double-precision scalar tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f64.h"
+
+#define SIGN_MASK 0x8000000000000000
+
+static const struct tanpi_data
+{
+ double tan_poly[14], cot_poly[9], pi, invpi;
+} tanpi_data = {
+ /* Coefficients for tan(pi * x). */
+ .tan_poly = {
+ 0x1.4abbce625be52p3,
+ 0x1.466bc6775b0f9p5,
+ 0x1.45fff9b426f5ep7,
+ 0x1.45f4730dbca5cp9,
+ 0x1.45f3265994f85p11,
+ 0x1.45f4234b330cap13,
+ 0x1.45dca11be79ebp15,
+ 0x1.47283fc5eea69p17,
+ 0x1.3a6d958cdefaep19,
+ 0x1.927896baee627p21,
+ -0x1.89333f6acd922p19,
+ 0x1.5d4e912bb8456p27,
+ -0x1.a854d53ab6874p29,
+ 0x1.1b76de7681424p32,
+ },
+ /* Coefficients for cot(pi * x). */
+ .cot_poly = {
+ -0x1.0c152382d7366p0,
+ -0x1.60c8539c1d316p-1,
+ -0x1.4b9a2f3516354p-1,
+ -0x1.47474060b6ba8p-1,
+ -0x1.464633ad9dcb1p-1,
+ -0x1.45ff229d7edd6p-1,
+ -0x1.46d8dbf492923p-1,
+ -0x1.3873892311c6bp-1,
+ -0x1.b2f3d0ff96d73p-1,
+ },
+ .pi = 0x1.921fb54442d18p1,
+ .invpi = 0x1.45f306dc9c883p-2,
+};
+
+/* Double-precision scalar tanpi(x) implementation.
+ Maximum error 2.19 ULP:
+ tanpi(0x1.68847e177a855p-2) got 0x1.fe9a0ff9bb9d7p+0
+ want 0x1.fe9a0ff9bb9d5p+0. */
+double
+arm_math_tanpi (double x)
+{
+ uint64_t xabs_12 = asuint64 (x) >> 52 & 0x7ff;
+
+ /* |x| >= 0x1p53. */
+ if (unlikely (xabs_12 >= 0x434))
+ {
+ /* tanpi(+/-inf) and tanpi(+/-nan) = nan. */
+ if (unlikely (xabs_12 == 0x7ff))
+ {
+ return __math_invalid (x);
+ }
+
+ uint64_t x_sign = asuint64 (x) & SIGN_MASK;
+ return asdouble (x_sign);
+ }
+
+ const struct tanpi_data *d = ptr_barrier (&tanpi_data);
+
+ double rounded = round (x);
+ if (unlikely (rounded == x))
+ {
+ /* If x == 0, return with sign. */
+ if (x == 0)
+ {
+ return x;
+ }
+ /* Otherwise, return zero with alternating sign. */
+ int64_t m = (int64_t) rounded;
+ if (x < 0)
+ {
+ return m & 1 ? 0.0 : -0.0;
+ }
+ else
+ {
+ return m & 1 ? -0.0 : 0.0;
+ }
+ }
+
+ double x_reduced = x - rounded;
+ double abs_x_reduced = 0.5 - fabs (x_reduced);
+
+ /* Prevent underflow exceptions. |x| < 0x1p-63. */
+ if (unlikely (xabs_12 < 0x3c0))
+ {
+ return d->pi * x;
+ }
+
+ double result, offset, scale;
+
+ /* Test 0.25 < |x_reduced| < 0.5 independently of abs_x_reduced. */
+ double x2 = x + x;
+ int64_t rounded_x2 = (int64_t) round (x2);
+ if (rounded_x2 & 1)
+ {
+ double r_x = abs_x_reduced;
+
+ double r_x2 = r_x * r_x;
+ double r_x4 = r_x2 * r_x2;
+
+ uint64_t sign = asuint64 (x_reduced) & SIGN_MASK;
+ r_x = asdouble (asuint64 (r_x) ^ sign);
+
+ /* Calculate the sign for the infinities produced at half-integer inputs. */
+ uint64_t is_finite = asuint64 (abs_x_reduced);
+ uint64_t is_odd = (rounded_x2 & 2) << 62;
+ uint64_t is_neg = rounded_x2 & SIGN_MASK;
+ uint64_t keep_sign = is_finite | (is_odd ^ is_neg);
+ offset = d->invpi / (keep_sign ? r_x : -r_x);
+ scale = r_x;
+
+ result = pw_horner_8_f64 (r_x2, r_x4, d->cot_poly);
+ }
+ else
+ {
+ double r_x2 = x_reduced * x_reduced;
+ double r_x4 = r_x2 * r_x2;
+
+ offset = d->pi * x_reduced;
+ scale = x_reduced * r_x2;
+
+ result = pw_horner_13_f64 (r_x2, r_x4, d->tan_poly);
+ }
+
+ return fma (scale, result, offset);
+}
+
+#if WANT_EXPERIMENTAL_MATH
+double
+tanpi (double x)
+{
+ return arm_math_tanpi (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_tanpi, 1.69)
+TEST_SYM_INTERVAL (arm_math_tanpi, 0, 0x1p-63, 50000)
+TEST_SYM_INTERVAL (arm_math_tanpi, 0x1p-63, 0.5, 100000)
+TEST_SYM_INTERVAL (arm_math_tanpi, 0.5, 0x1p53, 100000)
+TEST_SYM_INTERVAL (arm_math_tanpi, 0x1p53, inf, 100000)
+#endif
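The integer fast path above implements the C23 sign convention: at integers the result is a zero whose sign alternates with the parity of x and flips for negative x. A small check of that rule (assuming IEEE signed zeros; arm_math_tanpi as declared in mathlib.h):

  #include <assert.h>
  #include <math.h>

  double arm_math_tanpi (double);

  int main (void)
  {
    assert (!signbit (arm_math_tanpi (2.0)));  /* Even, positive: +0.0.  */
    assert (signbit (arm_math_tanpi (3.0)));   /* Odd, positive: -0.0.  */
    assert (signbit (arm_math_tanpi (-2.0)));  /* Even, negative: -0.0.  */
    assert (!signbit (arm_math_tanpi (-3.0))); /* Odd, negative: +0.0.  */
    return 0;
  }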
diff --git a/math/aarch64/tanpif_3u1.c b/math/aarch64/tanpif_3u1.c
new file mode 100644
index 000000000000..8cd66594c290
--- /dev/null
+++ b/math/aarch64/tanpif_3u1.c
@@ -0,0 +1,145 @@
+/*
+ * Single-precision scalar tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f32.h"
+
+static const struct tanpif_data
+{
+ float tan_poly[6], cot_poly[4], pi, invpi;
+} tanpif_data = {
+ /* Coefficients for tan(pi * x). */
+ .tan_poly = {
+ 0x1.4abbc8p3,
+ 0x1.467284p5,
+ 0x1.44cf12p7,
+ 0x1.596b5p9,
+ 0x1.753858p10,
+ 0x1.76ff52p14,
+ },
+ /* Coefficients for cot(pi * x). */
+ .cot_poly = {
+ -0x1.0c1522p0,
+ -0x1.60ce32p-1,
+ -0x1.49cd42p-1,
+ -0x1.73f786p-1,
+ },
+ .pi = 0x1.921fb6p1f,
+ .invpi = 0x1.45f308p-2f,
+};
+
+/* Single-precision scalar tanpi(x) implementation.
+ Maximum error 2.56 ULP:
+ tanpif(0x1.4bf948p-1) got -0x1.fcc9ep+0
+ want -0x1.fcc9e6p+0. */
+float
+arm_math_tanpif (float x)
+{
+ uint32_t xabs_12 = asuint (x) >> 20 & 0x7f8;
+
+ /* |x| >= 0x1p24f. */
+ if (unlikely (xabs_12 >= 0x4b1))
+ {
+ /* tanpif(+/-inf) and tanpif(+/-nan) = nan. */
+ if (unlikely (xabs_12 == 0x7f8))
+ {
+ return __math_invalidf (x);
+ }
+
+ uint32_t x_sign = asuint (x) & 0x80000000;
+ return asfloat (x_sign);
+ }
+
+ const struct tanpif_data *d = ptr_barrier (&tanpif_data);
+
+ /* Prevent underflow exceptions. |x| < 0x1p-31. */
+ if (unlikely (xabs_12 < 0x300))
+ {
+ return d->pi * x;
+ }
+
+ float rounded = roundf (x);
+ if (unlikely (rounded == x))
+ {
+ /* If x == 0, return with sign. */
+ if (x == 0)
+ {
+ return x;
+ }
+ /* Otherwise, return zero with alternating sign. */
+ int32_t m = (int32_t) rounded;
+ if (x < 0)
+ {
+ return m & 1 ? 0.0f : -0.0f;
+ }
+ else
+ {
+ return m & 1 ? -0.0f : 0.0f;
+ }
+ }
+
+ float x_reduced = x - rounded;
+ float abs_x_reduced = 0.5f - asfloat (asuint (x_reduced) & 0x7fffffff);
+
+ float result, offset, scale;
+
+ /* Test 0.25 < |x_reduced| < 0.5 independently of abs_x_reduced. */
+ float x2 = x + x;
+ int32_t rounded_x2 = (int32_t) roundf (x2);
+ if (rounded_x2 & 1)
+ {
+ float r_x = abs_x_reduced;
+
+ float r_x2 = r_x * r_x;
+ float r_x4 = r_x2 * r_x2;
+
+ uint32_t sign = asuint (x_reduced) & 0x80000000;
+ r_x = asfloat (asuint (r_x) ^ sign);
+
+ /* Calculate the sign for the infinities produced at half-integer inputs. */
+ uint32_t is_finite = asuint (abs_x_reduced);
+ uint32_t is_odd = (rounded_x2 & 2) << 30;
+ uint32_t is_neg = rounded_x2 & 0x80000000;
+ uint32_t keep_sign = is_finite | (is_odd ^ is_neg);
+ offset = d->invpi / (keep_sign ? r_x : -r_x);
+ scale = r_x;
+
+ result = pairwise_poly_3_f32 (r_x2, r_x4, d->cot_poly);
+ }
+ else
+ {
+ float r_x = x_reduced;
+
+ float r_x2 = r_x * r_x;
+ float r_x4 = r_x2 * r_x2;
+
+ offset = d->pi * r_x;
+ scale = r_x * r_x2;
+
+ result = pw_horner_5_f32 (r_x2, r_x4, d->tan_poly);
+ }
+
+ return fmaf (scale, result, offset);
+}
+
+#if WANT_EXPERIMENTAL_MATH
+float
+tanpif (float x)
+{
+ return arm_math_tanpif (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_tanpif, 2.57)
+TEST_SYM_INTERVAL (arm_math_tanpif, 0, 0x1p-31f, 50000)
+TEST_SYM_INTERVAL (arm_math_tanpif, 0x1p-31f, 0.5, 100000)
+TEST_SYM_INTERVAL (arm_math_tanpif, 0.5, 0x1p23f, 100000)
+TEST_SYM_INTERVAL (arm_math_tanpif, 0x1p23f, inf, 100000)
+#endif
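Both scalar routines classify |x| by comparing a scaled exponent field rather than the value itself: asuint (x) >> 20 & 0x7f8 extracts the 8-bit biased exponent multiplied by 8, so a single unsigned compare ignores sign and mantissa (and since the result is always a multiple of 8, the 0x4b1 bound above behaves exactly like 0x4b8, i.e. |x| >= 0x1p24f). A sketch of the extraction (illustrative):

  #include <stdint.h>
  #include <string.h>

  /* Returns 8 * biased_exponent (x): 0x4b8 <=> |x| >= 0x1p24f,
     0x300 <=> |x| >= 0x1p-31f.  */
  static uint32_t exp_field_x8 (float x)
  {
    uint32_t u;
    memcpy (&u, &x, sizeof u);  /* asuint (x).  */
    return (u >> 20) & 0x7f8;   /* Exponent bits 23..30, shifted left 3.  */
  }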
diff --git a/pl/math/erf_data.c b/math/aarch64/v_erf_data.c
index 138e03578e77..5400d6b8d0e3 100644
--- a/pl/math/erf_data.c
+++ b/math/aarch64/v_erf_data.c
@@ -1,20 +1,20 @@
/*
* Data for approximation of erf.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-/* Lookup table used in erf.
+/* Lookup table used in vector erf.
For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = 6.0 (769 values):
- - the first entry __erff_data.tab.erf contains the values of erf(r),
- - the second entry __erff_data.tab.scale contains the values of
+ - the first entry __v_erff_data.tab.erf contains the values of erf(r),
+ - the second entry __v_erff_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
algorithm, since lookup is performed only for x >= 1/64-1/512. */
-const struct erf_data __erf_data = {
+const struct v_erf_data __v_erf_data = {
.tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 },
{ 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 },
{ 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 },
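Since 2/sqrt(pi)*exp(-r^2) is the derivative of erf at r, the two table columns support a Taylor-style correction around the nearest grid point. A self-contained first-order sketch of the scheme the comment describes (illustrative only, not the library's algorithm; libm erf and exp stand in for the table):

  #include <math.h>

  static double erf_sketch (double x)  /* 0 <= x <= 6.  */
  {
    double r = nearbyint (x * 0x1p7) * 0x1p-7;          /* Nearest 1/128.  */
    double scale = 0x1.20dd750429b6dp+0 * exp (-r * r); /* 2/sqrt(pi) e^-r^2.  */
    return erf (r) + scale * (x - r);  /* First-order term only.  */
  }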
diff --git a/pl/math/erfc_data.c b/math/aarch64/v_erfc_data.c
index 40f72a4d6d5b..6acd96f74be5 100644
--- a/pl/math/erfc_data.c
+++ b/math/aarch64/v_erfc_data.c
@@ -1,20 +1,20 @@
/*
* Data used in double-precision erfc(x) function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-/* Lookup table used in erfc.
+/* Lookup table used in vector erfc.
For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = ~27.0 (3488 values):
- - the first entry __erfc_data.tab.erfc contains the values of erfc(r),
- - the second entry __erfc_data.tab.scale contains the values of
+ - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r),
+ - the second entry __v_erfc_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
they are scaled by a large enough value 2^128 (fits in 8bit). */
-const struct erfc_data __erfc_data = {
+const struct v_erfc_data __v_erfc_data = {
.tab = { { 0x1p128, 0x1.20dd750429b6dp128 },
{ 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 },
{ 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 },
diff --git a/pl/math/erfcf_data.c b/math/aarch64/v_erfcf_data.c
index a54e11973819..9f992b4887fb 100644
--- a/pl/math/erfcf_data.c
+++ b/math/aarch64/v_erfcf_data.c
@@ -1,20 +1,20 @@
/*
* Data used in single-precision erfc(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-/* Lookup table used in erfcf.
+/* Lookup table used in vector erfcf.
For each possible rounded input r (multiples of 1/64), between
r = 0.0 and r = 10.0625 (645 values):
- - the first entry __erfcf_data.tab.erfc contains the values of erfc(r),
- - the second entry __erfcf_data.tab.scale contains the values of
+ - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r),
+ - the second entry __v_erfcf_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
they are scaled by a large enough value 2^47 (fits in 8 bits). */
-const struct erfcf_data __erfcf_data = {
+const struct v_erfcf_data __v_erfcf_data = {
.tab = { { 0x1p47, 0x1.20dd76p47 },
{ 0x1.f6f944p46, 0x1.20cb68p47 },
{ 0x1.edf3aap46, 0x1.209546p47 },
diff --git a/pl/math/erff_data.c b/math/aarch64/v_erff_data.c
index 84c0d2e95463..8d11d8b6c10b 100644
--- a/pl/math/erff_data.c
+++ b/math/aarch64/v_erff_data.c
@@ -1,20 +1,20 @@
/*
* Data for approximation of erff.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-/* Lookup table used in erff.
+/* Lookup table used in vector erff.
For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = 4.0 (513 values):
- - the first entry __erff_data.tab.erf contains the values of erf(r),
- - the second entry __erff_data.tab.scale contains the values of
+ - the first entry __v_erff_data.tab.erf contains the values of erf(r),
+ - the second entry __v_erff_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
algorithm, since lookup is performed only for x >= 1/64-1/512. */
-const struct erff_data __erff_data = {
+const struct v_erff_data __v_erff_data = {
.tab = { { 0x0.000000p+0, 0x1.20dd76p+0 },
{ 0x1.20dbf4p-7, 0x1.20d8f2p+0 },
{ 0x1.20d770p-6, 0x1.20cb68p+0 },
diff --git a/math/aarch64/v_exp2f_1u.c b/math/aarch64/v_exp2f_1u.c
deleted file mode 100644
index ba6b02fbb4bc..000000000000
--- a/math/aarch64/v_exp2f_1u.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Single-precision vector 2^x function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const float Poly[] = {
- /* maxerr: 0.878 ulp. */
- 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-#define C5 v_f32 (Poly[5])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-static float32x4_t VPCS_ATTR NOINLINE
-specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
- float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
- float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
- uint32x4_t cmp = absn > v_f32 (192.0f);
- float32x4_t r1 = s1 * s1;
- float32x4_t r0 = poly * s1 * s2;
- return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
- | (~cmp & vreinterpretq_u32_f32 (r0)));
-}
-
-float32x4_t VPCS_ATTR
-_ZGVnN4v_exp2f_1u (float32x4_t x)
-{
- float32x4_t n, r, scale, poly, absn;
- uint32x4_t cmp, e;
-
- /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
-#if 0
- float32x4_t z;
- z = x + Shift;
- n = z - Shift;
- r = x - n;
- e = vreinterpretq_u32_f32 (z) << 23;
-#else
- n = vrndaq_f32 (x);
- r = x - n;
- e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
-#endif
- scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
- absn = vabsq_f32 (n);
- cmp = absn > v_f32 (126.0f);
- poly = vfmaq_f32 (C1, C0, r);
- poly = vfmaq_f32 (C2, poly, r);
- poly = vfmaq_f32 (C3, poly, r);
- poly = vfmaq_f32 (C4, poly, r);
- poly = vfmaq_f32 (C5, poly, r);
- poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn);
- return scale * poly;
-}
diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c
index 45f0848cac5b..59db77ac58cc 100644
--- a/math/aarch64/v_exp_data.c
+++ b/math/aarch64/v_exp_data.c
@@ -1,17 +1,14 @@
/*
- * Lookup table for double-precision e^x vector function.
+ * Scale values for vector exp and exp2
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "v_math.h"
+#include "math_config.h"
-# define N (1 << V_EXP_TABLE_BITS)
-
-/* 2^(j/N), j=0..N. */
+/* 2^(j/N), j=0..N, N=2^7=128. */
const uint64_t __v_exp_data[] = {
-# if N == 128
0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
@@ -55,92 +52,4 @@ const uint64_t __v_exp_data[] = {
0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
-# elif N == 256
- 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
- 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
- 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
- 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
- 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
- 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
- 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
- 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
- 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
- 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
- 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
- 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
- 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
- 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
- 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
- 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
- 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
- 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
- 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
- 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
- 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
- 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
- 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
- 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
- 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
- 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
- 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
- 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
- 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
- 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
- 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
- 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
- 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
- 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
- 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
- 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
- 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
- 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
- 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
- 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
- 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
- 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
- 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
- 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
- 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
- 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
- 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
- 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
- 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
- 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
- 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
- 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
- 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
- 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
- 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
- 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
- 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
- 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
- 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
- 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
- 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
- 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
- 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
- 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
- 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
- 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
- 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
- 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
- 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
- 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
- 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
- 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
- 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
- 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
- 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
- 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
- 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
- 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
- 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
- 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
- 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
- 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
- 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
- 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
- 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
- 0x3feff9d96b2a23d9,
-# endif
};
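The entries are IEEE bit patterns for 2^(j/128), so multiplying by 2^k reduces to adding k to the stored exponent field. A sketch of how such a table is consumed (a sketch only: assumes moderate |x| so the exponent cannot overflow, an arithmetic right shift for negative k, and libm exp2 in place of the short polynomial):

  #include <math.h>
  #include <stdint.h>
  #include <string.h>

  extern const uint64_t __v_exp_data[];  /* The 2^(j/128) patterns above.  */

  static double exp2_sketch (double x)
  {
    double kd = nearbyint (x * 0x1p7);
    int64_t ki = (int64_t) kd;
    double r = x - kd * 0x1p-7;                  /* |r| <= 1/256.  */
    uint64_t t = __v_exp_data[ki & 127]          /* 2^(j/128), j = ki mod 128.  */
                 + ((uint64_t) (ki >> 7) << 52); /* Times 2^k via the exponent.  */
    double s;
    memcpy (&s, &t, sizeof s);
    return s * exp2 (r);  /* exp2 (r) stands in for the polynomial.  */
  }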
diff --git a/pl/math/v_exp_tail_data.c b/math/aarch64/v_exp_tail_data.c
index 989dd41d949a..5cc58a40b6b7 100644
--- a/pl/math/v_exp_tail_data.c
+++ b/math/aarch64/v_exp_tail_data.c
@@ -1,13 +1,13 @@
/*
* Lookup table for double-precision e^x vector function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-/* 2^(j/N), j=0..N, N=2^8=256. Copied from math/v_exp_data.c. */
+/* 2^(j/N), j=0..N, N=2^8=256. */
const uint64_t __v_exp_tail_data[] = {
0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
diff --git a/math/aarch64/v_expf_1u.c b/math/aarch64/v_expf_1u.c
deleted file mode 100644
index 43d03fa34efa..000000000000
--- a/math/aarch64/v_expf_1u.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Single-precision vector e^x function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const float Poly[] = {
- /* maxerr: 0.36565 +0.5 ulp. */
- 0x1.6a6000p-10f,
- 0x1.12718ep-7f,
- 0x1.555af0p-5f,
- 0x1.555430p-3f,
- 0x1.fffff4p-2f,
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-static float32x4_t VPCS_ATTR NOINLINE
-specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
- float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
- float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
- uint32x4_t cmp = absn > v_f32 (192.0f);
- float32x4_t r1 = s1 * s1;
- float32x4_t r0 = poly * s1 * s2;
- return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
- | (~cmp & vreinterpretq_u32_f32 (r0)));
-}
-
-float32x4_t VPCS_ATTR
-_ZGVnN4v_expf_1u (float32x4_t x)
-{
- float32x4_t n, r, scale, poly, absn, z;
- uint32x4_t cmp, e;
-
- /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-#if 1
- z = vfmaq_f32 (Shift, x, InvLn2);
- n = z - Shift;
- r = vfmaq_f32 (x, n, -Ln2hi);
- r = vfmaq_f32 (r, n, -Ln2lo);
- e = vreinterpretq_u32_f32 (z) << 23;
-#else
- z = x * InvLn2;
- n = vrndaq_f32 (z);
- r = vfmaq_f32 (x, n, -Ln2hi);
- r = vfmaq_f32 (r, n, -Ln2lo);
- e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23;
-#endif
- scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
- absn = vabsq_f32 (n);
- cmp = absn > v_f32 (126.0f);
- poly = vfmaq_f32 (C1, C0, r);
- poly = vfmaq_f32 (C2, poly, r);
- poly = vfmaq_f32 (C3, poly, r);
- poly = vfmaq_f32 (C4, poly, r);
- poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
- poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn);
- return scale * poly;
-}
diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c
deleted file mode 100644
index 1d1c1fa62c04..000000000000
--- a/math/aarch64/v_log.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Double-precision vector log(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- uint64x2_t min_norm;
- uint32x4_t special_bound;
- float64x2_t poly[5];
- float64x2_t ln2;
- uint64x2_t sign_exp_mask;
-} data = {
- /* Worst-case error: 1.17 + 0.5 ulp.
- Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
- .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
- V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
- V2 (-0x1.554e550bd501ep-3) },
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .min_norm = V2 (0x0010000000000000),
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
- .sign_exp_mask = V2 (0xfff0000000000000)
-};
-
-#define A(i) d->poly[i]
-#define N (1 << V_LOG_TABLE_BITS)
-#define IndexMask (N - 1)
-#define Off v_u64 (0x3fe6900900000000)
-
-struct entry
-{
- float64x2_t invc;
- float64x2_t logc;
-};
-
-static inline struct entry
-lookup (uint64x2_t i)
-{
- /* Since N is a power of 2, n % N = n & (N - 1). */
- struct entry e;
- uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
- uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
- float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
- float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
- e.invc = vuzp1q_f64 (e0, e1);
- e.logc = vuzp2q_f64 (e0, e1);
- return e;
-}
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
- uint32x2_t cmp)
-{
- return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
-}
-
-float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float64x2_t z, r, r2, p, y, kd, hi;
- uint64x2_t ix, iz, tmp;
- uint32x2_t cmp;
- int64x2_t k;
- struct entry e;
-
- ix = vreinterpretq_u64_f64 (x);
- cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
- vget_low_u32 (d->special_bound));
-
- /* x = 2^k z; where z is in range [Off,2*Off) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- tmp = vsubq_u64 (ix, Off);
- k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
- iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
- z = vreinterpretq_f64_u64 (iz);
- e = lookup (tmp);
-
- /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
- kd = vcvtq_f64_s64 (k);
-
- /* hi = r + log(c) + k*Ln2. */
- hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
- /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- r2 = vmulq_f64 (r, r);
- y = vfmaq_f64 (A (2), A (3), r);
- p = vfmaq_f64 (A (0), A (1), r);
- y = vfmaq_f64 (y, A (4), r2);
- y = vfmaq_f64 (p, y, r2);
-
- if (unlikely (v_any_u32h (cmp)))
- return special_case (x, y, hi, r2, cmp);
- return vfmaq_f64 (hi, y, r2);
-}
diff --git a/pl/math/v_log10_data.c b/math/aarch64/v_log10_data.c
index d9a624dab9ce..bae2685822f6 100644
--- a/pl/math/v_log10_data.c
+++ b/math/aarch64/v_log10_data.c
@@ -1,7 +1,7 @@
/*
* Lookup table for double-precision log10(x) vector function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/v_log2_data.c b/math/aarch64/v_log2_data.c
index 50697daff925..fad91d654da8 100644
--- a/pl/math/v_log2_data.c
+++ b/math/aarch64/v_log2_data.c
@@ -1,7 +1,7 @@
/*
* Coefficients and table entries for vector log2
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c
index 82351bb14766..4f0e6e167381 100644
--- a/math/aarch64/v_log_data.c
+++ b/math/aarch64/v_log_data.c
@@ -1,30 +1,35 @@
/*
* Lookup table for double-precision log(x) vector function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "v_math.h"
-
-#define N (1 << V_LOG_TABLE_BITS)
+#include "math_config.h"
const struct v_log_data __v_log_data = {
+ /* Worst-case error: 1.17 + 0.5 ulp.
+ Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2,
+ 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 },
+ .ln2 = 0x1.62e42fefa39efp-1,
/* Algorithm:
x = 2^k z
log(x) = k ln2 + log(c) + poly(z/c - 1)
- where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
- N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables:
+ where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+ N=128) and log(c) and 1/c for the ith subinterval comes from two lookup
+ tables:
table[i].invc = 1/c
table[i].logc = (double)log(c)
- where c is near the center of the subinterval and is chosen by trying several
- floating point invc candidates around 1/center and selecting one for which
- the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
- that contains 1 and the previous one got tweaked to avoid cancellation. */
+ where c is near the center of the subinterval and is chosen by trying
+ several floating point invc candidates around 1/center and selecting one
+ for which the error in (double)log(c) is minimized (< 0x1p-74), except the
+ subinterval that contains 1 and the previous one got tweaked to avoid
+ cancellation. */
.table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
{ 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
{ 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
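The decomposition x = 2^k * z described in the comment is pure bit manipulation on the IEEE representation; the constants below (Off, the sign/exponent mask and ln2) are the same ones used by v_log.c elsewhere in this diff. A scalar sketch with libm log in place of the invc/logc table and polynomial (valid for finite, normal, positive x; assumes an arithmetic right shift):

  #include <math.h>
  #include <stdint.h>
  #include <string.h>

  static double log_sketch (double x)
  {
    uint64_t ix;
    memcpy (&ix, &x, sizeof ix);
    uint64_t tmp = ix - 0x3fe6900900000000ULL;          /* Off.  */
    int64_t k = (int64_t) tmp >> 52;                    /* x = 2^k * z.  */
    uint64_t iz = ix - (tmp & 0xfff0000000000000ULL);   /* Rebias exponent.  */
    double z;
    memcpy (&z, &iz, sizeof z);                         /* z in [a, 2a).  */
    return (double) k * 0x1.62e42fefa39efp-1 + log (z); /* k*ln2 + log(z).  */
  }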
diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c
deleted file mode 100644
index 66ebbbcd2b5a..000000000000
--- a/math/aarch64/v_logf.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Single-precision vector log function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-static const struct data
-{
- uint32x4_t min_norm;
- uint16x8_t special_bound;
- float32x4_t poly[7];
- float32x4_t ln2, tiny_bound;
- uint32x4_t off, mantissa_mask;
-} data = {
- /* 3.34 ulp error. */
- .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
- V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
- V4 (-0x1.ffffc8p-2f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x1p-126),
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
- .off = V4 (0x3f2aaaab), /* 0.666667. */
- .mantissa_mask = V4 (0x007fffff)
-};
-
-#define P(i) d->poly[7 - i]
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
- uint16x4_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
-}
-
-float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- float32x4_t n, p, q, r, r2, y;
- uint32x4_t u;
- uint16x4_t cmp;
-
- u = vreinterpretq_u32_f32 (x);
- cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
-
- /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
- n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vandq_u32 (u, d->mantissa_mask);
- u = vaddq_u32 (u, d->off);
- r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
-
- /* y = log(1+r) + n*ln2. */
- r2 = vmulq_f32 (r, r);
- /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
- p = vfmaq_f32 (P (5), P (6), r);
- q = vfmaq_f32 (P (3), P (4), r);
- y = vfmaq_f32 (P (1), P (2), r);
- p = vfmaq_f32 (p, P (7), r2);
- q = vfmaq_f32 (q, p, r2);
- y = vfmaq_f32 (y, q, r2);
- p = vfmaq_f32 (r, d->ln2, n);
-
- if (unlikely (v_any_u16h (cmp)))
- return special_case (x, y, r2, p, cmp);
- return vfmaq_f32 (p, y, r2);
-}
diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h
deleted file mode 100644
index 1dc9916c6fb0..000000000000
--- a/math/aarch64/v_math.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Vector math abstractions.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef _V_MATH_H
-#define _V_MATH_H
-
-#if !__aarch64__
-# error "Cannot build without AArch64"
-#endif
-
-#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-
-#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
-#define V_NAME_D1(fun) _ZGVnN2v_##fun
-#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
-#define V_NAME_D2(fun) _ZGVnN2vv_##fun
-
-#include <stdint.h>
-#include "../math_config.h"
-#include <arm_neon.h>
-
-/* Shorthand helpers for declaring constants. */
-# define V2(X) { X, X }
-# define V4(X) { X, X, X, X }
-# define V8(X) { X, X, X, X, X, X, X, X }
-
-static inline int
-v_any_u16h (uint16x4_t x)
-{
- return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
-}
-
-static inline int
-v_lanes32 (void)
-{
- return 4;
-}
-
-static inline float32x4_t
-v_f32 (float x)
-{
- return (float32x4_t) V4 (x);
-}
-static inline uint32x4_t
-v_u32 (uint32_t x)
-{
- return (uint32x4_t) V4 (x);
-}
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u32 (uint32x4_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
-}
-static inline int
-v_any_u32h (uint32x2_t x)
-{
- return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
-}
-static inline float32x4_t
-v_lookup_f32 (const float *tab, uint32x4_t idx)
-{
- return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
-}
-static inline uint32x4_t
-v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
-{
- return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
-}
-static inline float32x4_t
-v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
-{
- return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
- p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
-}
-static inline float32x4_t
-v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
- float32x4_t y, uint32x4_t p)
-{
- return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0],
- p[1] ? f (x1[1], x2[1]) : y[1],
- p[2] ? f (x1[2], x2[2]) : y[2],
- p[3] ? f (x1[3], x2[3]) : y[3]};
-}
-
-static inline int
-v_lanes64 (void)
-{
- return 2;
-}
-static inline float64x2_t
-v_f64 (double x)
-{
- return (float64x2_t) V2 (x);
-}
-static inline uint64x2_t
-v_u64 (uint64_t x)
-{
- return (uint64x2_t) V2 (x);
-}
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u64 (uint64x2_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_u64 (x) != 0;
-}
-static inline float64x2_t
-v_lookup_f64 (const double *tab, uint64x2_t idx)
-{
- return (float64x2_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline uint64x2_t
-v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
-{
- return (uint64x2_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline float64x2_t
-v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
-{
- double p1 = p[1];
- double x1 = x[1];
- if (likely (p[0]))
- y[0] = f (x[0]);
- if (likely (p1))
- y[1] = f (x1);
- return y;
-}
-
-#endif
diff --git a/math/aarch64/v_pow.c b/math/aarch64/v_pow.c
deleted file mode 100644
index 734f1663a283..000000000000
--- a/math/aarch64/v_pow.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Double-precision vector pow function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-
-float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
-{
- float64x2_t z;
- for (int lane = 0; lane < v_lanes64 (); lane++)
- {
- double sx = x[lane];
- double sy = y[lane];
- double sz = pow (sx, sy);
- z[lane] = sz;
- }
- return z;
-}
diff --git a/pl/math/v_pow_exp_data.c b/math/aarch64/v_pow_exp_data.c
index 5d921ef648a4..db615ce94bd7 100644
--- a/pl/math/v_pow_exp_data.c
+++ b/math/aarch64/v_pow_exp_data.c
@@ -1,7 +1,7 @@
/*
* Shared data between exp, exp2 and pow.
*
- * Copyright (c) 2018-2023, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/v_pow_log_data.c b/math/aarch64/v_pow_log_data.c
index 036faa5c97c1..7df277f74e4f 100644
--- a/pl/math/v_pow_log_data.c
+++ b/math/aarch64/v_pow_log_data.c
@@ -1,7 +1,7 @@
/*
* Data for the log part of pow.
*
- * Copyright (c) 2018-2023, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c
deleted file mode 100644
index 3a4163ab0558..000000000000
--- a/math/aarch64/v_powf.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Single-precision vector powf function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define Thresh v_u32 (0x7f000000) /* Max - Min. */
-#define MantissaMask v_u32 (0x007fffff)
-
-#define A data.log2_poly
-#define C data.exp2f_poly
-
-/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
-#define Off v_u32 (0x3f35d000)
-
-#define V_POWF_LOG2_TABLE_BITS 5
-#define V_EXP2F_TABLE_BITS 5
-#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
-#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
-
-static const struct
-{
- struct
- {
- double invc, logc;
- } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
- double log2_poly[4];
- uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
- double exp2f_poly[3];
-} data = {
- .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
- {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
- {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
- {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
- {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
- {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
- {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
- {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
- {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
- {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
- {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
- {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
- {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
- {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
- {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
- {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
- {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
- {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
- {0x1p+0, 0x0p+0 * Scale},
- {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
- {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
- {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
- {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
- {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
- {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
- {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
- {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
- {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
- {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
- {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
- {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
- {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
- .log2_poly = { /* rel err: 1.5 * 2^-30. */
- -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale,
- -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,},
- .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
- 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
- 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
- 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
- 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
- 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
- 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
- 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
- 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
- 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
- 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
- .exp2f_poly = { /* rel err: 1.69 * 2^-34. */
- 0x1.c6af84b912394p-5 / Scale / Scale / Scale,
- 0x1.ebfce50fac4f3p-3 / Scale / Scale,
- 0x1.62e42ff0c52d6p-1 / Scale}};
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
-{
- return v_call2_f32 (powf, x, y, ret, cmp);
-}
-
-float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
-{
- uint32x4_t u = vreinterpretq_u32_f32 (x);
- uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
- uint32x4_t tmp = vsubq_u32 (u, Off);
- uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
- Log2IdxMask);
- uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
- uint32x4_t iz = vsubq_u32 (u, top);
- int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
- 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
-
- float32x4_t ret;
- for (int lane = 0; lane < 4; lane++)
- {
- /* Use double precision for each lane. */
- double invc = data.log2_tab[i[lane]].invc;
- double logc = data.log2_tab[i[lane]].logc;
- double z = (double) asfloat (iz[lane]);
-
- /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
- double r = __builtin_fma (z, invc, -1.0);
- double y0 = logc + (double) k[lane];
-
- /* Polynomial to approximate log1p(r)/ln2. */
- double logx = A[0];
- logx = r * logx + A[1];
- logx = r * logx + A[2];
- logx = r * logx + A[3];
- logx = r * logx + y0;
- double ylogx = y[lane] * logx;
- cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff)
- >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47
- ? 1
- : cmp[lane];
-
- /* N*x = k + r with r in [-1/2, 1/2]. */
- double kd = round (ylogx);
- uint64_t ki = lround (ylogx);
- r = ylogx - kd;
-
- /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
- uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)];
- t += ki << (52 - V_EXP2F_TABLE_BITS);
- double s = asdouble (t);
- double p = C[0];
- p = __builtin_fma (p, r, C[1]);
- p = __builtin_fma (p, r, C[2]);
- p = __builtin_fma (p, s * r, s);
-
- ret[lane] = p;
- }
- if (unlikely (v_any_u32 (cmp)))
- return special_case (x, y, ret, cmp);
- return ret;
-}
diff --git a/pl/math/v_powf_data.c b/math/aarch64/v_powf_data.c
index ded211924b80..5cf1b8769414 100644
--- a/pl/math/v_powf_data.c
+++ b/math/aarch64/v_powf_data.c
@@ -1,7 +1,7 @@
/*
* Coefficients for single-precision SVE pow(x) function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/math/cosf.c b/math/cosf.c
index 6293ce8f1b7d..a9b1f9da16ed 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -1,7 +1,7 @@
/*
* Single-precision cos function.
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,6 +9,8 @@
#include <math.h>
#include "math_config.h"
#include "sincosf.h"
+#include "test_defs.h"
+#include "test_sig.h"
/* Fast cosf implementation. Worst-case ULP is 0.5607, maximum relative
error is 0.5303 * 2^-23. A single-step range reduction is used for
@@ -61,3 +63,9 @@ cosf (float y)
else
return __math_invalidf (y);
}
+
+TEST_SIG (S, F, 1, cos, -3.1, 3.1)
+TEST_ULP (cosf, 0.06)
+TEST_ULP_NONNEAREST (cosf, 0.5)
+TEST_INTERVAL (cosf, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (cosf, 0x1p-14, 0x1p54, 50000)
diff --git a/math/erf.c b/math/erf.c
index 5f9f40dda264..2c93a304346a 100644
--- a/math/erf.c
+++ b/math/erf.c
@@ -1,13 +1,15 @@
/*
* Double-precision erf(x) function.
*
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
#include <math.h>
#include <stdint.h>
+#include "test_defs.h"
+#include "test_sig.h"
#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
#define C 0x1.b0ac16p-1
@@ -242,3 +244,11 @@ erf (double x)
return 1.0;
}
}
+
+TEST_SIG (S, D, 1, erf, -6.0, 6.0)
+TEST_ULP (erf, 0.51)
+TEST_ULP_NONNEAREST (erf, 0.9)
+TEST_INTERVAL (erf, 0, 0xffff000000000000, 10000)
+TEST_SYM_INTERVAL (erf, 0x1p-1022, 0x1p-26, 40000)
+TEST_SYM_INTERVAL (erf, 0x1p-26, 0x1p3, 40000)
+TEST_INTERVAL (erf, 0, inf, 40000)
diff --git a/math/erff.c b/math/erff.c
index 9fa476dbbab2..fd64f40a2d22 100644
--- a/math/erff.c
+++ b/math/erff.c
@@ -1,13 +1,15 @@
/*
* Single-precision erf(x) function.
*
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2020-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
#include <math.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
#define A __erff_data.erff_poly_A
@@ -102,3 +104,11 @@ erff (float x)
}
return r;
}
+
+TEST_SIG (S, F, 1, erf, -6.0, 6.0)
+TEST_ULP (erff, 0.6)
+TEST_ULP_NONNEAREST (erff, 0.9)
+TEST_INTERVAL (erff, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000)
+TEST_SYM_INTERVAL (erff, 0x1p-26, 0x1p3, 40000)
+TEST_INTERVAL (erff, 0, inf, 40000)
diff --git a/math/exp.c b/math/exp.c
index 1de500c31f3e..3b08d44688a8 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -1,7 +1,7 @@
/*
* Double-precision e^x function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,6 +9,8 @@
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
#define N (1 << EXP_TABLE_BITS)
#define InvLn2N __exp_data.invln2N
@@ -77,7 +79,7 @@ top12 (double x)
/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
If hastail is 0 then xtail is assumed to be 0 too. */
static inline double
-exp_inline (double x, double xtail, int hastail)
+exp_inline (double x, double xtail)
{
uint32_t abstop;
uint64_t ki, idx, top, sbits;
@@ -125,7 +127,7 @@ exp_inline (double x, double xtail, int hastail)
#endif
r = x + kd * NegLn2hiN + kd * NegLn2loN;
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- if (hastail)
+ if (!__builtin_constant_p (xtail) || xtail != 0.0)
r += xtail;
/* 2^(k/N) ~= scale * (1 + tail). */
idx = 2 * (ki % N);
@@ -156,21 +158,20 @@ exp_inline (double x, double xtail, int hastail)
double
exp (double x)
{
- return exp_inline (x, 0, 0);
+ return exp_inline (x, 0);
}
-/* May be useful for implementing pow where more than double
- precision input is needed. */
-double
-__exp_dd (double x, double xtail)
-{
- return exp_inline (x, xtail, 1);
-}
#if USE_GLIBC_ABI
strong_alias (exp, __exp_finite)
hidden_alias (exp, __ieee754_exp)
-hidden_alias (__exp_dd, __exp1)
# if LDBL_MANT_DIG == 53
long double expl (long double x) { return exp (x); }
# endif
#endif
+
+TEST_SIG (S, D, 1, exp, -9.9, 9.9)
+TEST_ULP (exp, 0.01)
+TEST_ULP_NONNEAREST (exp, 0.5)
+TEST_INTERVAL (exp, 0, 0xffff000000000000, 10000)
+TEST_SYM_INTERVAL (exp, 0x1p-6, 0x1p6, 400000)
+TEST_SYM_INTERVAL (exp, 633.3, 733.3, 10000)
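The exp_inline change above replaces the hastail flag with a compile-time test: when the function is inlined with a literal zero tail, __builtin_constant_p (xtail) is true and xtail != 0.0 folds to false, so the branch and the add vanish; a runtime tail still takes the path. A minimal illustration of the idiom (GCC/Clang builtin; hypothetical names):

  static inline double add_tail (double x, double xtail)
  {
    /* Compiles to plain 'return x;' when called as add_tail (x, 0.0).  */
    if (!__builtin_constant_p (xtail) || xtail != 0.0)
      x += xtail;
    return x;
  }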
diff --git a/math/exp10.c b/math/exp10.c
index 0fbec4c694ca..de8ece42e09e 100644
--- a/math/exp10.c
+++ b/math/exp10.c
@@ -1,11 +1,13 @@
/*
* Double-precision 10^x function.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
#define N (1 << EXP_TABLE_BITS)
#define IndexMask (N - 1)
@@ -22,7 +24,7 @@ special_case (uint64_t sbits, double_t tmp, uint64_t ki)
{
double_t scale, y;
- if (ki - (1ull << 16) < 0x80000000)
+ if ((ki & 0x80000000) == 0)
{
/* The exponent of scale might have overflowed by 1. */
sbits -= 1ull << 52;
@@ -84,14 +86,14 @@ exp10 (double x)
/* Reduce x: z = x * N / log10(2), k = round(z). */
double_t z = __exp_data.invlog10_2N * x;
double_t kd;
- int64_t ki;
+ uint64_t ki;
#if TOINT_INTRINSICS
kd = roundtoint (z);
ki = converttoint (z);
#else
kd = eval_as_double (z + Shift);
+ ki = asuint64 (kd);
kd -= Shift;
- ki = kd;
#endif
/* r = x - k * log10(2), r in [-0.5, 0.5]. */
@@ -127,3 +129,15 @@ exp10 (double x)
double_t s = asdouble (sbits);
return eval_as_double (s * y + s);
}
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, D, 1, exp10, -9.9, 9.9)
+TEST_ULP (exp10, 0.02)
+TEST_ULP_NONNEAREST (exp10, 0.5)
+TEST_SYM_INTERVAL (exp10, 0, 0x1p-47, 5000)
+TEST_SYM_INTERVAL (exp10, 0x1p-47, 1, 50000)
+TEST_INTERVAL (exp10, 1, OFlowBound, 50000)
+TEST_INTERVAL (exp10, -1, UFlowBound, 50000)
+TEST_INTERVAL (exp10, OFlowBound, inf, 5000)
+TEST_INTERVAL (exp10, UFlowBound, -inf, 5000)
+#endif
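The ki = asuint64 (kd) change above leans on the standard Shift trick: adding 1.5 * 2^52 to z (with |z| far below 2^51, in round-to-nearest) rounds z to an integer whose value sits in the low mantissa bits of the sum. A sketch of the idiom (the 0x1.8p52 value is the conventional choice, assumed here rather than read from __exp_data):

  #include <stdint.h>
  #include <string.h>

  #define SHIFT 0x1.8p52

  static double round_to_int_sketch (double z, uint64_t *ki)
  {
    double kd = z + SHIFT;         /* Rounds z into kd's low bits.  */
    memcpy (ki, &kd, sizeof *ki);  /* Low bits hold round (z) mod 2^N.  */
    return kd - SHIFT;             /* round (z) as a double.  */
  }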
diff --git a/math/exp2.c b/math/exp2.c
index a1eee44f1f48..f26ac3cda2cc 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -1,7 +1,7 @@
/*
* Double-precision 2^x function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,6 +9,8 @@
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
#define N (1 << EXP_TABLE_BITS)
#define Shift __exp_data.exp2_shift
@@ -141,3 +143,10 @@ hidden_alias (exp2, __ieee754_exp2)
long double exp2l (long double x) { return exp2 (x); }
# endif
#endif
+
+TEST_SIG (S, D, 1, exp2, -9.9, 9.9)
+TEST_ULP (exp2, 0.01)
+TEST_ULP_NONNEAREST (exp2, 0.5)
+TEST_INTERVAL (exp2, 0, 0xffff000000000000, 10000)
+TEST_SYM_INTERVAL (exp2, 0x1p-6, 0x1p6, 40000)
+TEST_SYM_INTERVAL (exp2, 633.3, 733.3, 10000)
diff --git a/math/exp2f.c b/math/exp2f.c
index 776c3ddf7663..3202f41377ad 100644
--- a/math/exp2f.c
+++ b/math/exp2f.c
@@ -1,13 +1,15 @@
/*
* Single-precision 2^x function.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
/*
EXP2F_TABLE_BITS = 5
@@ -78,3 +80,9 @@ exp2f (float x)
strong_alias (exp2f, __exp2f_finite)
hidden_alias (exp2f, __ieee754_exp2f)
#endif
+
+TEST_SIG (S, F, 1, exp2, -9.9, 9.9)
+TEST_ULP (exp2f, 0.01)
+TEST_ULP_NONNEAREST (exp2f, 0.5)
+TEST_INTERVAL (exp2f, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (exp2f, 0x1p-14, 0x1p8, 50000)
diff --git a/math/expf.c b/math/expf.c
index 08a20d59e491..6572b99a1e68 100644
--- a/math/expf.c
+++ b/math/expf.c
@@ -1,13 +1,15 @@
/*
* Single-precision e^x function.
*
- * Copyright (c) 2017-2019, Arm Limited.
+ * Copyright (c) 2017-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
/*
EXP2F_TABLE_BITS = 5
@@ -89,3 +91,9 @@ expf (float x)
strong_alias (expf, __expf_finite)
hidden_alias (expf, __ieee754_expf)
#endif
+
+TEST_SIG (S, F, 1, exp, -9.9, 9.9)
+TEST_ULP (expf, 0.01)
+TEST_ULP_NONNEAREST (expf, 0.5)
+TEST_INTERVAL (expf, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (expf, 0x1p-14, 0x1p8, 500000)
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 64cbb9c1f850..23d04da99d93 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,58 +1,268 @@
/*
* Public API.
*
- * Copyright (c) 2015-2023, Arm Limited.
+ * Copyright (c) 2015-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _MATHLIB_H
#define _MATHLIB_H
-float expf (float);
-float exp2f (float);
-float logf (float);
-float log2f (float);
-float powf (float, float);
-float sinf (float);
-float cosf (float);
-void sincosf (float, float*, float*);
-
-double exp (double);
-double exp10 (double);
-double exp2 (double);
-double log (double);
-double log2 (double);
-double pow (double, double);
-
#if __aarch64__
-# if __GNUC__ >= 5
-typedef __Float32x4_t __f32x4_t;
-typedef __Float64x2_t __f64x2_t;
-# elif __clang_major__*100+__clang_minor__ >= 305
-typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
-typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
-# else
-# error Unsupported compiler
-# endif
+/* Low-accuracy scalar implementations of C23 routines. */
+float arm_math_cospif (float);
+double arm_math_cospi (double);
+float arm_math_sinpif (float);
+double arm_math_sinpi (double);
+float arm_math_tanpif (float);
+double arm_math_tanpi (double);
+void arm_math_sincospif (float, float *, float *);
+void arm_math_sincospi (double, double *, double *);
+#endif
+
+/* SIMD declaration for autovectorisation with fast-math enabled. Only GCC
+   is supported, and vector routines are only available on Linux on AArch64. */
+#if defined __aarch64__ && __linux__ && defined(__GNUC__) \
+ && !defined(__clang__) && defined(__FAST_MATH__)
+# define DECL_SIMD_aarch64 __attribute__ ((__simd__ ("notinbranch"), const))
+#else
+# define DECL_SIMD_aarch64
+#endif
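+
+/* For example, with GCC on AArch64 Linux and -Ofast, a loop such as
+     for (int i = 0; i < n; i++) y[i] = sinf (x[i]);
+   can be auto-vectorised into calls to _ZGVnN4v_sinf through the
+   attribute above.  */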
+
+#if WANT_EXPERIMENTAL_MATH
+
+float arm_math_erff (float);
+DECL_SIMD_aarch64 float cospif (float);
+DECL_SIMD_aarch64 float erfinvf (float);
+DECL_SIMD_aarch64 float sinpif (float);
+DECL_SIMD_aarch64 float tanpif (float);
+
+double arm_math_erf (double);
+DECL_SIMD_aarch64 double cospi (double);
+DECL_SIMD_aarch64 double erfinv (double);
+DECL_SIMD_aarch64 double sinpi (double);
+DECL_SIMD_aarch64 double tanpi (double);
+
+long double erfinvl (long double);
+
+#endif
-# if __GNUC__ >= 9 || __clang_major__ >= 8
-# undef __vpcs
-# define __vpcs __attribute__((__aarch64_vector_pcs__))
+/* Note these routines may not be provided by AOR (some are only available
+   with WANT_EXPERIMENTAL_MATH, some are not provided at all). Redeclare them
+   here to add vector annotations. */
+DECL_SIMD_aarch64 float acosf (float);
+DECL_SIMD_aarch64 float acoshf (float);
+DECL_SIMD_aarch64 float asinf (float);
+DECL_SIMD_aarch64 float asinhf (float);
+DECL_SIMD_aarch64 float atan2f (float, float);
+DECL_SIMD_aarch64 float atanf (float);
+DECL_SIMD_aarch64 float atanhf (float);
+DECL_SIMD_aarch64 float cbrtf (float);
+DECL_SIMD_aarch64 float cosf (float);
+DECL_SIMD_aarch64 float coshf (float);
+DECL_SIMD_aarch64 float erfcf (float);
+DECL_SIMD_aarch64 float erff (float);
+DECL_SIMD_aarch64 float exp10f (float);
+DECL_SIMD_aarch64 float exp2f (float);
+DECL_SIMD_aarch64 float expf (float);
+DECL_SIMD_aarch64 float expm1f (float);
+DECL_SIMD_aarch64 float hypotf (float, float);
+DECL_SIMD_aarch64 float log10f (float);
+DECL_SIMD_aarch64 float log1pf (float);
+DECL_SIMD_aarch64 float log2f (float);
+DECL_SIMD_aarch64 float logf (float);
+DECL_SIMD_aarch64 float powf (float, float);
+DECL_SIMD_aarch64 float sinf (float);
+void sincosf (float, float *, float *);
+DECL_SIMD_aarch64 float sinhf (float);
+DECL_SIMD_aarch64 float tanf (float);
+DECL_SIMD_aarch64 float tanhf (float);
+
+DECL_SIMD_aarch64 double acos (double);
+DECL_SIMD_aarch64 double acosh (double);
+DECL_SIMD_aarch64 double asin (double);
+DECL_SIMD_aarch64 double asinh (double);
+DECL_SIMD_aarch64 double atan2 (double, double);
+DECL_SIMD_aarch64 double atan (double);
+DECL_SIMD_aarch64 double atanh (double);
+DECL_SIMD_aarch64 double cbrt (double);
+DECL_SIMD_aarch64 double cos (double);
+DECL_SIMD_aarch64 double cosh (double);
+DECL_SIMD_aarch64 double erfc (double);
+DECL_SIMD_aarch64 double erf (double);
+DECL_SIMD_aarch64 double exp10 (double);
+DECL_SIMD_aarch64 double exp2 (double);
+DECL_SIMD_aarch64 double exp (double);
+DECL_SIMD_aarch64 double expm1 (double);
+DECL_SIMD_aarch64 double hypot (double, double);
+DECL_SIMD_aarch64 double log10 (double);
+DECL_SIMD_aarch64 double log1p (double);
+DECL_SIMD_aarch64 double log2 (double);
+DECL_SIMD_aarch64 double log (double);
+DECL_SIMD_aarch64 double pow (double, double);
+DECL_SIMD_aarch64 double sin (double);
+DECL_SIMD_aarch64 double sinh (double);
+DECL_SIMD_aarch64 double tan (double);
+DECL_SIMD_aarch64 double tanh (double);
+
+#if __aarch64__ && __linux__
+# include <arm_neon.h>
+# undef __vpcs
+# define __vpcs __attribute__((__aarch64_vector_pcs__))
/* Vector functions following the vector PCS using ABI names. */
-__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
-__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
-__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
-__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
-__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
-__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
+__vpcs float32x4_t _ZGVnN4v_acosf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_acoshf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_asinf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_asinhf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_atanf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_atanhf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_cbrtf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_cosf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_coshf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_cospif (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_erfcf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_erff (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_exp10f (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_exp2f (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_exp2f_1u (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_expf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_expf_1u (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_expm1f (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_log10f (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_log1pf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_log2f (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_logf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_sinf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_sinhf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_sinpif (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_tanf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_tanhf (float32x4_t);
+__vpcs float32x4_t _ZGVnN4v_tanpif (float32x4_t);
+__vpcs float32x4_t _ZGVnN4vl4_modff (float32x4_t, float *);
+__vpcs float32x4_t _ZGVnN4vv_atan2f (float32x4_t, float32x4_t);
+__vpcs float32x4_t _ZGVnN4vv_hypotf (float32x4_t, float32x4_t);
+__vpcs float32x4_t _ZGVnN4vv_powf (float32x4_t, float32x4_t);
+__vpcs float32x4x2_t _ZGVnN4v_cexpif (float32x4_t);
+__vpcs void _ZGVnN4vl4l4_sincosf (float32x4_t, float *, float *);
+__vpcs void _ZGVnN4vl4l4_sincospif (float32x4_t, float *, float *);
+
+__vpcs float64x2_t _ZGVnN2v_acos (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_acosh (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_asin (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_asinh (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_atan (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_atanh (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_cbrt (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_cos (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_cosh (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_cospi (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_erf (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_erfc (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_exp (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_exp10 (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_exp2 (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_expm1 (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_log (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_log10 (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_log1p (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_log2 (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_sin (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_sinh (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_sinpi (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_tan (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_tanh (float64x2_t);
+__vpcs float64x2_t _ZGVnN2v_tanpi (float64x2_t);
+__vpcs float64x2_t _ZGVnN2vl8_modf (float64x2_t, double *);
+__vpcs float64x2_t _ZGVnN2vv_atan2 (float64x2_t, float64x2_t);
+__vpcs float64x2_t _ZGVnN2vv_hypot (float64x2_t, float64x2_t);
+__vpcs float64x2_t _ZGVnN2vv_pow (float64x2_t, float64x2_t);
+__vpcs float64x2x2_t _ZGVnN2v_cexpi (float64x2_t);
+__vpcs void _ZGVnN2vl8l8_sincos (float64x2_t, double *, double *);
+__vpcs void _ZGVnN2vl8l8_sincospi (float64x2_t, double *, double *);
+
+# if WANT_EXPERIMENTAL_MATH
+__vpcs float32x4_t _ZGVnN4v_erfinvf (float32x4_t);
+__vpcs float64x2_t _ZGVnN2v_erfinv (float64x2_t);
+# endif
+
+# include <arm_sve.h>
+svfloat32_t _ZGVsMxv_acosf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_acoshf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_asinf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_asinhf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_atanhf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_cbrtf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_coshf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_cospif (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_erfcf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_exp10f (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_exp2f (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_expm1f (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_log1pf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_sinhf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_sinpif (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_tanhf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_tanpif (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxvl4_modff (svfloat32_t, float *, svbool_t);
+svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxvv_hypotf (svfloat32_t, svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxvv_powf (svfloat32_t, svfloat32_t, svbool_t);
+svfloat32x2_t _ZGVsMxv_cexpif (svfloat32_t, svbool_t);
+void _ZGVsMxvl4l4_sincosf (svfloat32_t, float *, float *, svbool_t);
+void _ZGVsMxvl4l4_sincospif (svfloat32_t, float *, float *, svbool_t);
+
+svfloat64_t _ZGVsMxv_acos (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_acosh (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_asin (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_asinh (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_atanh (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_cbrt (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_cosh (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_cospi (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_exp (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_exp10 (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_exp2 (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_expm1 (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_log1p (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_sinh (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_sinpi (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_tan (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_tanh (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_tanpi (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxvl8_modf (svfloat64_t, double *, svbool_t);
+svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxvv_hypot (svfloat64_t, svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxvv_pow (svfloat64_t, svfloat64_t, svbool_t);
+svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t);
+void _ZGVsMxvl8l8_sincos (svfloat64_t, double *, double *, svbool_t);
+void _ZGVsMxvl8l8_sincospi (svfloat64_t, double *, double *, svbool_t);
+
+# if WANT_EXPERIMENTAL_MATH
+
+svfloat32_t _ZGVsMxv_erfinvf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxvv_powi (svfloat32_t, svint32_t, svbool_t);
+
+svfloat64_t _ZGVsMxvv_powk (svfloat64_t, svint64_t, svbool_t);
+svfloat64_t _ZGVsMxv_erfinv (svfloat64_t, svbool_t);
+
# endif
#endif
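
The DECL_SIMD_aarch64 annotation above is what lets GCC's autovectoriser
replace scalar calls with the _ZGV* vector variants. A minimal sketch of the
intended use, assuming GCC on AArch64 Linux compiling with -Ofast; the
function name apply_sinf and the header name are illustrative:

    #include "mathlib.h" /* assumption: the header carrying the declarations above */

    void
    apply_sinf (float *restrict out, const float *restrict in, int n)
    {
      /* With __FAST_MATH__ defined, the simd("notinbranch") declaration lets
         GCC vectorise this loop into calls to _ZGVnN4v_sinf rather than four
         scalar sinf calls per iteration.  */
      for (int i = 0; i < n; i++)
        out[i] = sinf (in[i]);
    }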
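The mangled names follow the AArch64 vector function ABI: _ZGVnN4v_/_ZGVnN2v_
are unmasked AdvSIMD variants operating on 4 floats or 2 doubles, while
_ZGVsMxv_ names are masked, scalable SVE variants taking a governing
predicate; v marks a vector argument and l4/l8 a pointer argument. The
routines can also be called directly. A minimal sketch, assuming the
declarations above are in scope and SVE is enabled at compile time (wrapper
names are hypothetical):

    #include <arm_neon.h>
    #include <arm_sve.h>

    float32x4_t
    neon_exp4 (float32x4_t x)
    {
      return _ZGVnN4v_expf (x); /* 4 lanes per call, no mask */
    }

    svfloat32_t
    sve_exp (svfloat32_t x, svbool_t pg)
    {
      return _ZGVsMxv_expf (x, pg); /* pg selects the active lanes */
    }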
diff --git a/math/include/test_defs.h b/math/include/test_defs.h
new file mode 100644
index 000000000000..2fe66fa6f14c
--- /dev/null
+++ b/math/include/test_defs.h
@@ -0,0 +1,21 @@
+/*
+ * Helper macros for emitting various details about routines for consumption by
+ * runulp.sh. This version of the file is for inclusion when building routines,
+ * so expansions are empty - see math/test/test_defs for versions used by the
+ * build system.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception.
+ */
+
+#define TEST_ULP(f, l)
+#define TEST_ULP_NONNEAREST(f, l)
+
+#define TEST_DISABLE_FENV(f)
+#define TEST_DISABLE_FENV_IF_NOT(f, e)
+
+#define TEST_INTERVAL(f, lo, hi, n)
+#define TEST_SYM_INTERVAL(f, lo, hi, n)
+#define TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n)
+
+#define TEST_CONTROL_VALUE(f, c)
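
Because every macro above expands to nothing, these annotations compile away
entirely when building the routines. For example, the lines added to
math/log.c later in this commit:

    TEST_ULP (log, 0.02)                       /* expands to nothing here */
    TEST_INTERVAL (log, 0x1p-4, 0x1p4, 400000) /* likewise */

cost nothing in the library build; only the test build, which picks up the
math/test/test_defs versions instead, turns them into runulp.sh checks.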
diff --git a/math/include/test_sig.h b/math/include/test_sig.h
new file mode 100644
index 000000000000..a967829098d6
--- /dev/null
+++ b/math/include/test_sig.h
@@ -0,0 +1,47 @@
+/*
+ * Macros for emitting various ulp/bench entries based on function signature
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception.
+ */
+
+#define TEST_DECL_SF1(fun) float fun##f (float);
+#define TEST_DECL_SF2(fun) float fun##f (float, float);
+#define TEST_DECL_SD1(fun) double fun (double);
+#define TEST_DECL_SD2(fun) double fun (double, double);
+
+#define TEST_DECL_VF1(fun) \
+ float32x4_t VPCS_ATTR V_NAME_F1 (fun##f) (float32x4_t);
+#define TEST_DECL_VF2(fun) \
+ float32x4_t VPCS_ATTR V_NAME_F2 (fun##f) (float32x4_t, float32x4_t);
+#define TEST_DECL_VD1(fun) VPCS_ATTR float64x2_t V_NAME_D1 (fun) (float64x2_t);
+#define TEST_DECL_VD2(fun) \
+ VPCS_ATTR float64x2_t V_NAME_D2 (fun) (float64x2_t, float64x2_t);
+
+#define TEST_DECL_SVF1(fun) \
+ svfloat32_t SV_NAME_F1 (fun) (svfloat32_t, svbool_t);
+#define TEST_DECL_SVF2(fun) \
+ svfloat32_t SV_NAME_F2 (fun) (svfloat32_t, svfloat32_t, svbool_t);
+#define TEST_DECL_SVD1(fun) \
+ svfloat64_t SV_NAME_D1 (fun) (svfloat64_t, svbool_t);
+#define TEST_DECL_SVD2(fun) \
+ svfloat64_t SV_NAME_D2 (fun) (svfloat64_t, svfloat64_t, svbool_t);
+
+/* For building the routines, emit a function prototype from TEST_SIG. This
+   ensures that the correct signature has been chosen (the wrong one will be
+   a compile error). TEST_SIG is defined differently by various components of
+   the build system to emit wrapper entries and entries for mathbench and
+   ulp. */
+#ifndef _TEST_SIG
+# if defined(EMIT_ULP_FUNCS)
+# define _TEST_SIG(v, t, a, f, ...) TEST_SIG _Z##v##t##a (f)
+# elif defined(EMIT_ULP_WRAPPERS)
+# define _TEST_SIG(v, t, a, f, ...) TEST_SIG Z##v##N##t##a##_WRAP (f)
+# elif defined(EMIT_MATHBENCH_FUNCS)
+# define _TEST_SIG(v, t, a, f, ...) TEST_SIG _Z##v##t##a (f, ##__VA_ARGS__)
+# else
+# define _TEST_SIG(v, t, a, f, ...) TEST_DECL_##v##t##a (f)
+# endif
+#endif
+
+#define TEST_SIG(...) _TEST_SIG (__VA_ARGS__)
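
To make the dispatch concrete, here is the expansion chain when compiling a
routine (no EMIT_* macro defined), using the TEST_SIG line from math/log.c
below:

    TEST_SIG (S, D, 1, log, 0.01, 11.1)
      => _TEST_SIG (S, D, 1, log, 0.01, 11.1)
      => TEST_DECL_SD1 (log)  /* trailing arguments absorbed by ... */
      => double log (double); /* a mismatched definition is a compile error */

Under EMIT_MATHBENCH_FUNCS the same line instead yields
TEST_SIG _ZSD1 (log, 0.01, 11.1), which the build system's post-processing
and the _ZSD1 macro in mathbench_funcs.h turn into a benchmark table entry.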
diff --git a/math/log.c b/math/log.c
index 43dfc2a744f0..1d6244c30b79 100644
--- a/math/log.c
+++ b/math/log.c
@@ -1,7 +1,7 @@
/*
* Double-precision log(x) function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,6 +9,8 @@
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
#define T __log_data.tab
#define T2 __log_data.tab2
@@ -160,3 +162,10 @@ hidden_alias (log, __ieee754_log)
long double logl (long double x) { return log (x); }
# endif
#endif
+
+TEST_SIG (S, D, 1, log, 0.01, 11.1)
+TEST_ULP (log, 0.02)
+TEST_ULP_NONNEAREST (log, 0.5)
+TEST_INTERVAL (log, 0, 0xffff000000000000, 10000)
+TEST_INTERVAL (log, 0x1p-4, 0x1p4, 400000)
+TEST_INTERVAL (log, 0, inf, 400000)
diff --git a/pl/math/log10f.c b/math/log10f.c
index 5c80008e4e57..f8561d063107 100644
--- a/pl/math/log10f.c
+++ b/math/log10f.c
@@ -1,7 +1,7 @@
/*
* Single-precision log10 function.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,8 +9,8 @@
#include <stdint.h>
#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
+#include "test_sig.h"
+#include "test_defs.h"
/* Data associated to logf:
@@ -30,7 +30,8 @@
/* This naive implementation of log10f mimics that of log
then simply scales the result by 1/log(10) to switch from base e to
base 10. Hence, most computations are carried out in double precision.
- Scaling before rounding to single precision is both faster and more accurate.
+ Scaling before rounding to single precision is both faster and more
+ accurate.
ULP error: 0.797 ulp (nearest rounding.). */
float
@@ -88,10 +89,11 @@ log10f (float x)
return eval_as_float (y);
}
-PL_SIG (S, F, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (log10f, 0.30)
-PL_TEST_INTERVAL (log10f, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (log10f, 0x1p-127, 0x1p-26, 50000)
-PL_TEST_INTERVAL (log10f, 0x1p-26, 0x1p3, 50000)
-PL_TEST_INTERVAL (log10f, 0x1p-4, 0x1p4, 50000)
-PL_TEST_INTERVAL (log10f, 0, inf, 50000)
+TEST_SIG (S, F, 1, log10, 0.01, 11.1)
+TEST_ULP (log10f, 0.30)
+TEST_ULP_NONNEAREST (log10f, 0.5)
+TEST_INTERVAL (log10f, 0, 0xffff0000, 10000)
+TEST_INTERVAL (log10f, 0x1p-127, 0x1p-26, 50000)
+TEST_INTERVAL (log10f, 0x1p-26, 0x1p3, 50000)
+TEST_INTERVAL (log10f, 0x1p-4, 0x1p4, 50000)
+TEST_INTERVAL (log10f, 0, inf, 50000)
diff --git a/math/log2.c b/math/log2.c
index 3f9c21b03962..6462915a24f0 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -1,7 +1,7 @@
/*
* Double-precision log2(x) function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,6 +9,8 @@
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
#define T __log2_data.tab
#define T2 __log2_data.tab2
@@ -139,3 +141,10 @@ hidden_alias (log2, __ieee754_log2)
long double log2l (long double x) { return log2 (x); }
# endif
#endif
+
+TEST_SIG (S, D, 1, log2, 0.01, 11.1)
+TEST_ULP (log2, 0.05)
+TEST_ULP_NONNEAREST (log2, 0.5)
+TEST_INTERVAL (log2, 0, 0xffff000000000000, 10000)
+TEST_INTERVAL (log2, 0x1p-4, 0x1p4, 40000)
+TEST_INTERVAL (log2, 0, inf, 40000)
diff --git a/math/log2f.c b/math/log2f.c
index 0a44fa2024f6..7d47379b41cb 100644
--- a/math/log2f.c
+++ b/math/log2f.c
@@ -1,13 +1,15 @@
/*
* Single-precision log2 function.
*
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
/*
LOG2F_TABLE_BITS = 4
@@ -78,3 +80,10 @@ log2f (float x)
strong_alias (log2f, __log2f_finite)
hidden_alias (log2f, __ieee754_log2f)
#endif
+
+TEST_SIG (S, F, 1, log2, 0.01, 11.1)
+TEST_ULP (log2f, 0.26)
+TEST_ULP_NONNEAREST (log2f, 0.5)
+TEST_INTERVAL (log2f, 0, 0xffff0000, 10000)
+TEST_INTERVAL (log2f, 0x1p-4, 0x1p4, 50000)
+TEST_INTERVAL (log2f, 0, inf, 50000)
diff --git a/math/logf.c b/math/logf.c
index 820f74c3e66a..f2c26deaff19 100644
--- a/math/logf.c
+++ b/math/logf.c
@@ -1,13 +1,15 @@
/*
* Single-precision log function.
*
- * Copyright (c) 2017-2023, Arm Limited.
+ * Copyright (c) 2017-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
+#include "test_sig.h"
/*
LOGF_TABLE_BITS = 4
@@ -77,3 +79,10 @@ logf (float x)
strong_alias (logf, __logf_finite)
hidden_alias (logf, __ieee754_logf)
#endif
+
+TEST_SIG (S, F, 1, log, 0.01, 11.1)
+TEST_ULP (logf, 0.32)
+TEST_ULP_NONNEAREST (logf, 0.5)
+TEST_INTERVAL (logf, 0, 0xffff0000, 10000)
+TEST_INTERVAL (logf, 0x1p-4, 0x1p4, 500000)
+TEST_INTERVAL (logf, 0, inf, 50000)
diff --git a/math/logf_data.c b/math/logf_data.c
index 04247684755f..5c301a90af8e 100644
--- a/math/logf_data.c
+++ b/math/logf_data.c
@@ -1,7 +1,7 @@
/*
* Data definition for logf.
*
- * Copyright (c) 2017-2019, Arm Limited.
+ * Copyright (c) 2017-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -27,6 +27,7 @@ const struct logf_data __logf_data = {
{ 0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2 },
},
.ln2 = 0x1.62e42fefa39efp-1,
+ .invln10 = 0x1.bcb7b1526e50ep-2,
.poly = {
-0x1.00ea348b88334p-2, 0x1.5575b0be00b6ap-2, -0x1.ffffef20a4123p-2,
}
diff --git a/math/math_config.h b/math/math_config.h
index faf77b31fc99..0fc653f93761 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -1,7 +1,7 @@
/*
* Configuration for math routines.
*
- * Copyright (c) 2017-2023, Arm Limited.
+ * Copyright (c) 2017-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -13,9 +13,9 @@
#ifndef WANT_ROUNDING
/* If defined to 1, return correct results for special cases in non-nearest
- rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f).
- This may be set to 0 if there is no fenv support or if math functions only
- get called in round to nearest mode. */
+ rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than
+ -0.0f). This may be set to 0 if there is no fenv support or if math
+ functions only get called in round to nearest mode. */
# define WANT_ROUNDING 1
#endif
#ifndef WANT_ERRNO
@@ -117,6 +117,25 @@
#define __math_check_oflowf arm_math_check_oflowf
#define __math_check_uflowf arm_math_check_uflowf
+#define __exp_data arm_math_exp_data
+#define __asin_poly arm_math_asin_poly
+#define __asinf_poly arm_math_asinf_poly
+#define __asinh_data arm_math_asinh_data
+#define __asinhf_data arm_math_asinhf_data
+#define __atan_poly_data arm_math_atan_poly_data
+#define __atanf_poly_data arm_math_atanf_poly_data
+#define __cbrt_data arm_math_cbrt_data
+#define __cbrtf_data arm_math_cbrtf_data
+#define __erf_data arm_math_erf_data
+#define __expf_data arm_math_expf_data
+#define __expm1_poly arm_math_expm1_poly
+#define __expm1f_poly arm_math_expm1f_poly
+#define __log10_data arm_math_log10_data
+#define __log1p_data arm_math_log1p_data
+#define __log1pf_data arm_math_log1pf_data
+#define __log_data arm_math_log_data
+#define __tanf_poly_data arm_math_tanf_poly_data
+#define __v_log_data arm_math_v_log_data
#define __sincosf_table arm_math_sincosf_table
#define __inv_pio4 arm_math_inv_pio4
#define __exp2f_data arm_math_exp2f_data
@@ -131,6 +150,25 @@
#define __erf_data arm_math_erf_data
#define __v_exp_data arm_math_v_exp_data
#define __v_log_data arm_math_v_log_data
+#define __v_erf_data arm_math_v_erf_data
+#define __v_erfc_data arm_math_v_erfc_data
+#define __v_erfcf_data arm_math_v_erfcf_data
+#define __v_erff_data arm_math_v_erff_data
+#define __v_exp_tail_data arm_math_v_exp_tail_data
+#define __v_log10_data arm_math_v_log10_data
+#define __v_log2_data arm_math_v_log2_data
+#define __v_pow_exp_data arm_math_v_pow_exp_data
+#define __v_pow_log_data arm_math_v_pow_log_data
+#define __v_powf_data arm_math_v_powf_data
+
+/* On some platforms (in particular Windows) INFINITY and HUGE_VAL may be
+   defined in a way that does not produce the expected bit pattern, so we
+   enforce the glibc math.h definition using a builtin that is supported by
+   both gcc and clang. */
+#if defined (_WIN32) && (defined (__GNUC__) || defined (__clang__))
+# undef INFINITY
+# define INFINITY __builtin_inff()
+#endif
#if HAVE_FAST_ROUND
/* When set, the roundtoint and converttoint functions are provided with
@@ -365,11 +403,12 @@ extern const struct exp2f_data
uint64_t tab[1 << EXP2F_TABLE_BITS];
double shift_scaled;
double poly[EXP2F_POLY_ORDER];
- double shift;
double invln2_scaled;
double poly_scaled[EXP2F_POLY_ORDER];
+ double shift;
} __exp2f_data HIDDEN;
+/* Data for logf and log10f. */
#define LOGF_TABLE_BITS 4
#define LOGF_POLY_ORDER 4
extern const struct logf_data
@@ -379,6 +418,7 @@ extern const struct logf_data
double invc, logc;
} tab[1 << LOGF_TABLE_BITS];
double ln2;
+ double invln10;
double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */
} __logf_data HIDDEN;
@@ -427,17 +467,19 @@ extern const struct powf_log2_data
extern const struct exp_data
{
double invln2N;
- double invlog10_2N;
- double shift;
double negln2hiN;
double negln2loN;
- double neglog10_2hiN;
- double neglog10_2loN;
double poly[4]; /* Last four coefficients. */
+ double shift;
+
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
+
+ double neglog10_2hiN;
+ double neglog10_2loN;
double exp10_poly[5];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
+ double invlog10_2N;
} __exp_data HIDDEN;
#define LOG_TABLE_BITS 7
@@ -509,13 +551,214 @@ extern const struct erf_data
#define V_EXP_TABLE_BITS 7
extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
+#define V_LOG_POLY_ORDER 6
#define V_LOG_TABLE_BITS 7
extern const struct v_log_data
{
+ /* Shared data for vector log and log-derived routines (e.g. asinh). */
+ double poly[V_LOG_POLY_ORDER - 1];
+ double ln2;
struct
{
double invc, logc;
} table[1 << V_LOG_TABLE_BITS];
} __v_log_data HIDDEN;
+/* Some data for SVE powf's internal exp and log. */
+#define V_POWF_EXP2_TABLE_BITS 5
+#define V_POWF_EXP2_N (1 << V_POWF_EXP2_TABLE_BITS)
+#define V_POWF_LOG2_TABLE_BITS 5
+#define V_POWF_LOG2_N (1 << V_POWF_LOG2_TABLE_BITS)
+extern const struct v_powf_data
+{
+ double invc[V_POWF_LOG2_N];
+ double logc[V_POWF_LOG2_N];
+ uint64_t scale[V_POWF_EXP2_N];
+} __v_powf_data HIDDEN;
+
+/* Some data for AdvSIMD and SVE pow's internal exp and log. */
+#define V_POW_EXP_TABLE_BITS 8
+extern const struct v_pow_exp_data
+{
+ double poly[3];
+ double n_over_ln2, ln2_over_n_hi, ln2_over_n_lo, shift;
+ uint64_t sbits[1 << V_POW_EXP_TABLE_BITS];
+} __v_pow_exp_data HIDDEN;
+
+#define V_POW_LOG_TABLE_BITS 7
+extern const struct v_pow_log_data
+{
+ double poly[7]; /* First coefficient is 1. */
+ double ln2_hi, ln2_lo;
+ double invc[1 << V_POW_LOG_TABLE_BITS];
+ double logc[1 << V_POW_LOG_TABLE_BITS];
+ double logctail[1 << V_POW_LOG_TABLE_BITS];
+} __v_pow_log_data HIDDEN;
+
+#define V_LOG2_TABLE_BITS 7
+extern const struct v_log2_data
+{
+ double poly[5];
+ double invln2;
+ struct
+ {
+ double invc, log2c;
+ } table[1 << V_LOG2_TABLE_BITS];
+} __v_log2_data HIDDEN;
+
+#define V_LOG10_TABLE_BITS 7
+extern const struct v_log10_data
+{
+ double poly[5];
+ double invln10, log10_2;
+ struct
+ {
+ double invc, log10c;
+ } table[1 << V_LOG10_TABLE_BITS];
+} __v_log10_data HIDDEN;
+
+#define V_EXP_TAIL_TABLE_BITS 8
+extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN;
+
+extern const struct v_erff_data
+{
+ struct
+ {
+ float erf, scale;
+ } tab[513];
+} __v_erff_data HIDDEN;
+
+extern const struct v_erfcf_data
+{
+ struct
+ {
+ float erfc, scale;
+ } tab[645];
+} __v_erfcf_data HIDDEN;
+
+extern const struct v_erf_data
+{
+ struct
+ {
+ double erf, scale;
+ } tab[769];
+} __v_erf_data HIDDEN;
+
+extern const struct v_erfc_data
+{
+ struct
+ {
+ double erfc, scale;
+ } tab[3488];
+} __v_erfc_data HIDDEN;
+
+/* Table with 4/PI to 192 bit precision. */
+extern const uint32_t __inv_pio4[] HIDDEN;
+
+#if WANT_EXPERIMENTAL_MATH
+
+# define LOG1P_NCOEFFS 19
+extern const struct log1p_data
+{
+ double coeffs[LOG1P_NCOEFFS];
+} __log1p_data HIDDEN;
+
+# define LOG1PF_2U5
+# define LOG1PF_NCOEFFS 9
+extern const struct log1pf_data
+{
+ float coeffs[LOG1PF_NCOEFFS];
+} __log1pf_data HIDDEN;
+
+# define ASINF_POLY_ORDER 4
+extern const float __asinf_poly[ASINF_POLY_ORDER + 1] HIDDEN;
+
+# define ASIN_POLY_ORDER 11
+extern const double __asin_poly[ASIN_POLY_ORDER + 1] HIDDEN;
+
+# define ASINHF_NCOEFFS 8
+extern const struct asinhf_data
+{
+ float coeffs[ASINHF_NCOEFFS];
+} __asinhf_data HIDDEN;
+
+# define ASINH_NCOEFFS 18
+extern const struct asinh_data
+{
+ double poly[ASINH_NCOEFFS];
+} __asinh_data HIDDEN;
+
+# define ATAN_POLY_NCOEFFS 20
+extern const struct atan_poly_data
+{
+ double poly[ATAN_POLY_NCOEFFS];
+} __atan_poly_data HIDDEN;
+
+# define ATANF_POLY_NCOEFFS 8
+extern const struct atanf_poly_data
+{
+ float poly[ATANF_POLY_NCOEFFS];
+} __atanf_poly_data HIDDEN;
+
+extern const struct cbrtf_data
+{
+ float poly[4];
+ float table[5];
+} __cbrtf_data HIDDEN;
+
+extern const struct cbrt_data
+{
+ double poly[4];
+ double table[5];
+} __cbrt_data HIDDEN;
+
+# define EXPF_TABLE_BITS 5
+# define EXPF_POLY_ORDER 3
+extern const struct expf_data
+{
+ uint64_t tab[1 << EXPF_TABLE_BITS];
+ double invln2_scaled;
+ double poly_scaled[EXPF_POLY_ORDER];
+} __expf_data HIDDEN;
+
+# define EXPM1F_POLY_ORDER 5
+extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN;
+
+# define EXPM1_POLY_ORDER 11
+extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN;
+
+/* Data for low accuracy log10 (with 1/ln(10) included in coefficients). */
+# define LOG10_TABLE_BITS 7
+# define LOG10_POLY_ORDER 6
+# define LOG10_POLY1_ORDER 12
+extern const struct log10_data
+{
+ double ln2hi;
+ double ln2lo;
+ double invln10;
+ double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). */
+ double poly1[LOG10_POLY1_ORDER - 1];
+ struct
+ {
+ double invc, logc;
+ } tab[1 << LOG10_TABLE_BITS];
+# if !HAVE_FAST_FMA
+ struct
+ {
+ double chi, clo;
+ } tab2[1 << LOG10_TABLE_BITS];
+# endif
+} __log10_data HIDDEN;
+
+# define TANF_P_POLY_NCOEFFS 6
+/* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. */
+# define TANF_Q_POLY_NCOEFFS 4
+extern const struct tanf_poly_data
+{
+ float poly_tan[TANF_P_POLY_NCOEFFS];
+ float poly_cotan[TANF_Q_POLY_NCOEFFS];
+} __tanf_poly_data HIDDEN;
+
+#endif /* WANT_EXPERIMENTAL_MATH. */
+
#endif
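
A self-contained check of the property the INFINITY workaround above relies
on, namely that +inf carries the IEEE-754 bit pattern the special-case paths
test against, is sketched below (hypothetical helper, not part of the
library):

    #include <stdint.h>
    #include <string.h>

    static int
    infinity_bits_ok (void)
    {
      float inf = __builtin_inff (); /* the enforced definition */
      uint32_t u;
      memcpy (&u, &inf, sizeof u);   /* bit-exact view of the float */
      return u == 0x7f800000;        /* IEEE-754 single-precision +inf */
    }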
diff --git a/pl/math/poly_generic.h b/math/poly_generic.h
index 3fc25f8762f2..c21b61aad4c3 100644
--- a/pl/math/poly_generic.h
+++ b/math/poly_generic.h
@@ -1,7 +1,7 @@
/*
* Generic helpers for evaluating polynomials with various schemes.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
diff --git a/pl/math/poly_scalar_f32.h b/math/poly_scalar_f32.h
index a9b1c5544494..198e5801938a 100644
--- a/pl/math/poly_scalar_f32.h
+++ b/math/poly_scalar_f32.h
@@ -2,12 +2,12 @@
 * Helpers for evaluating polynomials on single-precision scalar input, using
* various schemes.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifndef PL_MATH_POLY_SCALAR_F32_H
-#define PL_MATH_POLY_SCALAR_F32_H
+#ifndef MATH_POLY_SCALAR_F32_H
+#define MATH_POLY_SCALAR_F32_H
#include <math.h>
diff --git a/pl/math/poly_scalar_f64.h b/math/poly_scalar_f64.h
index 207dccee30ad..6fbebe05d1df 100644
--- a/pl/math/poly_scalar_f64.h
+++ b/math/poly_scalar_f64.h
@@ -2,12 +2,12 @@
* Helpers for evaluating polynomials on double-precision scalar input, using
* various schemes.
*
- * Copyright (c) 2023, Arm Limited.
+ * Copyright (c) 2023-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifndef PL_MATH_POLY_SCALAR_F64_H
-#define PL_MATH_POLY_SCALAR_F64_H
+#ifndef MATH_POLY_SCALAR_F64_H
+#define MATH_POLY_SCALAR_F64_H
#include <math.h>
diff --git a/math/pow.c b/math/pow.c
index af719fe5ab10..1983bb2bbeba 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -1,7 +1,7 @@
/*
* Double-precision x^y function.
*
- * Copyright (c) 2018-2020, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,6 +9,7 @@
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
/*
Worst-case error: 0.54 ULP (~= ulperr_exp + 1024*Ln2*relerr_log*2^53)
@@ -378,3 +379,22 @@ hidden_alias (pow, __ieee754_pow)
long double powl (long double x, long double y) { return pow (x, y); }
# endif
#endif
+
+TEST_ULP (pow, 0.05)
+TEST_ULP_NONNEAREST (pow, 0.5)
+TEST_INTERVAL2 (pow, 0.5, 2.0, 0, inf, 20000)
+TEST_INTERVAL2 (pow, -0.5, -2.0, 0, inf, 20000)
+TEST_INTERVAL2 (pow, 0.5, 2.0, -0, -inf, 20000)
+TEST_INTERVAL2 (pow, -0.5, -2.0, -0, -inf, 20000)
+TEST_INTERVAL2 (pow, 0.5, 2.0, 0x1p-10, 0x1p10, 40000)
+TEST_INTERVAL2 (pow, 0.5, 2.0, -0x1p-10, -0x1p10, 40000)
+TEST_INTERVAL2 (pow, 0, inf, 0.5, 2.0, 80000)
+TEST_INTERVAL2 (pow, 0, inf, -0.5, -2.0, 80000)
+TEST_INTERVAL2 (pow, 0x1.fp-1, 0x1.08p0, 0x1p8, 0x1p17, 80000)
+TEST_INTERVAL2 (pow, 0x1.fp-1, 0x1.08p0, -0x1p8, -0x1p17, 80000)
+TEST_INTERVAL2 (pow, 0, 0x1p-1000, 0, 1.0, 50000)
+TEST_INTERVAL2 (pow, 0x1p1000, inf, 0, 1.0, 50000)
+TEST_INTERVAL2 (pow, 0x1.ffffffffffff0p-1, 0x1.0000000000008p0, 0x1p60, 0x1p68,
+ 50000)
+TEST_INTERVAL2 (pow, 0x1.ffffffffff000p-1, 0x1p0, 0x1p50, 0x1p52, 50000)
+TEST_INTERVAL2 (pow, -0x1.ffffffffff000p-1, -0x1p0, 0x1p50, 0x1p52, 50000)
diff --git a/math/powf.c b/math/powf.c
index 05c80bb2eb67..3f3f41ca276a 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -1,13 +1,14 @@
/*
* Single-precision pow function.
*
- * Copyright (c) 2017-2019, Arm Limited.
+ * Copyright (c) 2017-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
#include <stdint.h>
#include "math_config.h"
+#include "test_defs.h"
/*
POWF_LOG2_POLY_ORDER = 5
@@ -219,3 +220,12 @@ powf (float x, float y)
strong_alias (powf, __powf_finite)
hidden_alias (powf, __ieee754_powf)
#endif
+
+TEST_ULP (powf, 0.4)
+TEST_ULP_NONNEAREST (powf, 0.5)
+TEST_INTERVAL2 (powf, 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000)
+TEST_INTERVAL2 (powf, 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000)
+TEST_INTERVAL2 (powf, 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000)
+TEST_INTERVAL2 (powf, 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000)
+TEST_INTERVAL2 (powf, 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000)
+TEST_INTERVAL2 (powf, 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000)
diff --git a/math/sincosf.c b/math/sincosf.c
index 446f21d60faf..05a71d78bb1e 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -1,7 +1,7 @@
/*
* Single-precision sin/cos function.
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -9,6 +9,7 @@
#include <math.h>
#include "math_config.h"
#include "sincosf.h"
+#include "test_defs.h"
/* Fast sincosf implementation. Worst-case ULP is 0.5607, maximum relative
error is 0.5303 * 2^-23. A single-step range reduction is used for
@@ -77,3 +78,12 @@ sincosf (float y, float *sinp, float *cosp)
#endif
}
}
+
+TEST_ULP (sincosf_sinf, 0.06)
+TEST_ULP (sincosf_cosf, 0.06)
+TEST_ULP_NONNEAREST (sincosf_sinf, 0.5)
+TEST_ULP_NONNEAREST (sincosf_cosf, 0.5)
+TEST_INTERVAL (sincosf_sinf, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (sincosf_sinf, 0x1p-14, 0x1p54, 50000)
+TEST_INTERVAL (sincosf_cosf, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (sincosf_cosf, 0x1p-14, 0x1p54, 50000)
diff --git a/math/sincosf.h b/math/sincosf.h
index ec23ed7aeb26..912def33d295 100644
--- a/math/sincosf.h
+++ b/math/sincosf.h
@@ -1,7 +1,7 @@
/*
* Header for sinf, cosf and sincosf.
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -27,9 +27,6 @@ typedef struct
/* Polynomial data (the cosine polynomial is negated in the 2nd entry). */
extern const sincos_t __sincosf_table[2] HIDDEN;
-/* Table with 4/PI to 192 bit precision. */
-extern const uint32_t __inv_pio4[] HIDDEN;
-
/* Top 12 bits of the float representation with the sign bit cleared. */
static inline uint32_t
abstop12 (float x)
diff --git a/math/sinf.c b/math/sinf.c
index 8dd8ae458794..e244e115d32b 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -1,13 +1,15 @@
/*
* Single-precision sin function.
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
#include "math_config.h"
#include "sincosf.h"
+#include "test_defs.h"
+#include "test_sig.h"
/* Fast sinf implementation. Worst-case ULP is 0.5607, maximum relative
error is 0.5303 * 2^-23. A single-step range reduction is used for
@@ -65,3 +67,9 @@ sinf (float y)
else
return __math_invalidf (y);
}
+
+TEST_SIG (S, F, 1, sin, -3.1, 3.1)
+TEST_ULP (sinf, 0.06)
+TEST_ULP_NONNEAREST (sinf, 0.5)
+TEST_INTERVAL (sinf, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (sinf, 0x1p-14, 0x1p54, 50000)
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index ed7e89bb7710..653c58fbc484 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,10 +1,23 @@
/*
* Microbenchmark for math functions.
*
- * Copyright (c) 2018-2023, Arm Limited.
+ * Copyright (c) 2018-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#if WANT_SVE_TESTS
+# if __aarch64__ && __linux__
+# ifdef __clang__
+# pragma clang attribute push(__attribute__((target("sve"))), \
+ apply_to = any(function))
+# else
+# pragma GCC target("+sve")
+# endif
+# else
+# error "SVE not supported - please disable WANT_SVE_TESTS"
+# endif
+#endif
+
#undef _GNU_SOURCE
#define _GNU_SOURCE 1
#include <stdint.h>
@@ -29,94 +42,6 @@ static float Af[N];
static long measurecount = MEASURE;
static long itercount = ITER;
-#ifdef __vpcs
-#include <arm_neon.h>
-typedef float64x2_t v_double;
-
-#define v_double_len() 2
-
-static inline v_double
-v_double_load (const double *p)
-{
- return (v_double){p[0], p[1]};
-}
-
-static inline v_double
-v_double_dup (double x)
-{
- return (v_double){x, x};
-}
-
-typedef float32x4_t v_float;
-
-#define v_float_len() 4
-
-static inline v_float
-v_float_load (const float *p)
-{
- return (v_float){p[0], p[1], p[2], p[3]};
-}
-
-static inline v_float
-v_float_dup (float x)
-{
- return (v_float){x, x, x, x};
-}
-#else
-/* dummy definitions to make things compile. */
-typedef double v_double;
-typedef float v_float;
-#define v_double_len(x) 1
-#define v_double_load(x) (x)[0]
-#define v_double_dup(x) (x)
-#define v_float_len(x) 1
-#define v_float_load(x) (x)[0]
-#define v_float_dup(x) (x)
-
-#endif
-
-#if WANT_SVE_MATH
-#include <arm_sve.h>
-typedef svbool_t sv_bool;
-typedef svfloat64_t sv_double;
-
-#define sv_double_len() svcntd()
-
-static inline sv_double
-sv_double_load (const double *p)
-{
- svbool_t pg = svptrue_b64();
- return svld1(pg, p);
-}
-
-static inline sv_double
-sv_double_dup (double x)
-{
- return svdup_n_f64(x);
-}
-
-typedef svfloat32_t sv_float;
-
-#define sv_float_len() svcntw()
-
-static inline sv_float
-sv_float_load (const float *p)
-{
- svbool_t pg = svptrue_b32();
- return svld1(pg, p);
-}
-
-static inline sv_float
-sv_float_dup (float x)
-{
- return svdup_n_f32(x);
-}
-#else
-/* dummy definitions to make things compile. */
-#define sv_double_len(x) 1
-#define sv_float_len(x) 1
-#endif
-
static double
dummy (double x)
{
@@ -128,28 +53,28 @@ dummyf (float x)
{
return x;
}
-#ifdef __vpcs
-__vpcs static v_double
-__vn_dummy (v_double x)
+#if __aarch64__ && __linux__
+__vpcs static float64x2_t
+__vn_dummy (float64x2_t x)
{
return x;
}
-__vpcs static v_float
-__vn_dummyf (v_float x)
+__vpcs static float32x4_t
+__vn_dummyf (float32x4_t x)
{
return x;
}
#endif
-#if WANT_SVE_MATH
-static sv_double
-__sv_dummy (sv_double x, sv_bool pg)
+#if WANT_SVE_TESTS
+static svfloat64_t
+__sv_dummy (svfloat64_t x, svbool_t pg)
{
return x;
}
-static sv_float
-__sv_dummyf (sv_float x, sv_bool pg)
+static svfloat32_t
+__sv_dummyf (svfloat32_t x, svbool_t pg)
{
return x;
}
@@ -169,16 +94,17 @@ static const struct fun
{
double (*d) (double);
float (*f) (float);
-#ifdef __vpcs
- __vpcs v_double (*vnd) (v_double);
- __vpcs v_float (*vnf) (v_float);
+#if __aarch64__ && __linux__
+ __vpcs float64x2_t (*vnd) (float64x2_t);
+ __vpcs float32x4_t (*vnf) (float32x4_t);
#endif
-#if WANT_SVE_MATH
- sv_double (*svd) (sv_double, sv_bool);
- sv_float (*svf) (sv_float, sv_bool);
+#if WANT_SVE_TESTS
+ svfloat64_t (*svd) (svfloat64_t, svbool_t);
+ svfloat32_t (*svf) (svfloat32_t, svbool_t);
#endif
} fun;
} funtab[] = {
+// clang-format off
#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
@@ -187,11 +113,11 @@ static const struct fun
#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
D (dummy, 1.0, 2.0)
F (dummyf, 1.0, 2.0)
-#ifdef __vpcs
+#if __aarch64__ && __linux__
VND (__vn_dummy, 1.0, 2.0)
VNF (__vn_dummyf, 1.0, 2.0)
#endif
-#if WANT_SVE_MATH
+#if WANT_SVE_TESTS
SVD (__sv_dummy, 1.0, 2.0)
SVF (__sv_dummyf, 1.0, 2.0)
#endif
@@ -203,6 +129,7 @@ SVF (__sv_dummyf, 1.0, 2.0)
#undef VND
#undef SVF
#undef SVD
+ // clang-format on
};
static void
@@ -301,75 +228,77 @@ runf_latency (float f (float))
prev = f (Af[i] + prev * z);
}
-#ifdef __vpcs
+#if __aarch64__ && __linux__
static void
-run_vn_thruput (__vpcs v_double f (v_double))
+run_vn_thruput (__vpcs float64x2_t f (float64x2_t))
{
- for (int i = 0; i < N; i += v_double_len ())
- f (v_double_load (A+i));
+ for (int i = 0; i < N; i += 2)
+ f (vld1q_f64 (A + i));
}
static void
-runf_vn_thruput (__vpcs v_float f (v_float))
+runf_vn_thruput (__vpcs float32x4_t f (float32x4_t))
{
- for (int i = 0; i < N; i += v_float_len ())
- f (v_float_load (Af+i));
+ for (int i = 0; i < N; i += 4)
+ f (vld1q_f32 (Af + i));
}
static void
-run_vn_latency (__vpcs v_double f (v_double))
+run_vn_latency (__vpcs float64x2_t f (float64x2_t))
{
volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
uint64x2_t sel = vsel;
- v_double prev = v_double_dup (0);
- for (int i = 0; i < N; i += v_double_len ())
- prev = f (vbslq_f64 (sel, prev, v_double_load (A+i)));
+ float64x2_t prev = vdupq_n_f64 (0);
+ for (int i = 0; i < N; i += 2)
+ prev = f (vbslq_f64 (sel, prev, vld1q_f64 (A + i)));
}
static void
-runf_vn_latency (__vpcs v_float f (v_float))
+runf_vn_latency (__vpcs float32x4_t f (float32x4_t))
{
volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
uint32x4_t sel = vsel;
- v_float prev = v_float_dup (0);
- for (int i = 0; i < N; i += v_float_len ())
- prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i)));
+ float32x4_t prev = vdupq_n_f32 (0);
+ for (int i = 0; i < N; i += 4)
+ prev = f (vbslq_f32 (sel, prev, vld1q_f32 (Af + i)));
}
#endif
-#if WANT_SVE_MATH
+#if WANT_SVE_TESTS
static void
-run_sv_thruput (sv_double f (sv_double, sv_bool))
+run_sv_thruput (svfloat64_t f (svfloat64_t, svbool_t))
{
- for (int i = 0; i < N; i += sv_double_len ())
- f (sv_double_load (A+i), svptrue_b64 ());
+ for (int i = 0; i < N; i += svcntd ())
+ f (svld1_f64 (svptrue_b64 (), A + i), svptrue_b64 ());
}
static void
-runf_sv_thruput (sv_float f (sv_float, sv_bool))
+runf_sv_thruput (svfloat32_t f (svfloat32_t, svbool_t))
{
- for (int i = 0; i < N; i += sv_float_len ())
- f (sv_float_load (Af+i), svptrue_b32 ());
+ for (int i = 0; i < N; i += svcntw ())
+ f (svld1_f32 (svptrue_b32 (), Af + i), svptrue_b32 ());
}
static void
-run_sv_latency (sv_double f (sv_double, sv_bool))
+run_sv_latency (svfloat64_t f (svfloat64_t, svbool_t))
{
- volatile sv_bool vsel = svptrue_b64 ();
- sv_bool sel = vsel;
- sv_double prev = sv_double_dup (0);
- for (int i = 0; i < N; i += sv_double_len ())
- prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ());
+ volatile svbool_t vsel = svptrue_b64 ();
+ svbool_t sel = vsel;
+ svfloat64_t prev = svdup_f64 (0);
+ for (int i = 0; i < N; i += svcntd ())
+ prev = f (svsel_f64 (sel, svld1_f64 (svptrue_b64 (), A + i), prev),
+ svptrue_b64 ());
}
static void
-runf_sv_latency (sv_float f (sv_float, sv_bool))
+runf_sv_latency (svfloat32_t f (svfloat32_t, svbool_t))
{
- volatile sv_bool vsel = svptrue_b32 ();
- sv_bool sel = vsel;
- sv_float prev = sv_float_dup (0);
- for (int i = 0; i < N; i += sv_float_len ())
- prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ());
+ volatile svbool_t vsel = svptrue_b32 ();
+ svbool_t sel = vsel;
+ svfloat32_t prev = svdup_f32 (0);
+ for (int i = 0; i < N; i += svcntw ())
+ prev = f (svsel_f32 (sel, svld1_f32 (svptrue_b32 (), Af + i), prev),
+ svptrue_b32 ());
}
#endif
@@ -377,7 +306,11 @@ static uint64_t
tic (void)
{
struct timespec ts;
+#if defined(_MSC_VER)
+ if (!timespec_get (&ts, TIME_UTC))
+#else
if (clock_gettime (CLOCK_REALTIME, &ts))
+#endif
abort ();
return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}
@@ -405,9 +338,11 @@ bench1 (const struct fun *f, int type, double lo, double hi)
int vlen = 1;
if (f->vec == 'n')
- vlen = f->prec == 'd' ? v_double_len() : v_float_len();
+ vlen = f->prec == 'd' ? 2 : 4;
+#if WANT_SVE_TESTS
else if (f->vec == 's')
- vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
+ vlen = f->prec == 'd' ? svcntd () : svcntw ();
+#endif
if (f->prec == 'd' && type == 't' && f->vec == 0)
TIMEIT (run_thruput, f->fun.d);
@@ -417,7 +352,7 @@ bench1 (const struct fun *f, int type, double lo, double hi)
TIMEIT (runf_thruput, f->fun.f);
else if (f->prec == 'f' && type == 'l' && f->vec == 0)
TIMEIT (runf_latency, f->fun.f);
-#ifdef __vpcs
+#if __aarch64__ && __linux__
else if (f->prec == 'd' && type == 't' && f->vec == 'n')
TIMEIT (run_vn_thruput, f->fun.vnd);
else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
@@ -427,7 +362,7 @@ bench1 (const struct fun *f, int type, double lo, double hi)
else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
TIMEIT (runf_vn_latency, f->fun.vnf);
#endif
-#if WANT_SVE_MATH
+#if WANT_SVE_TESTS
else if (f->prec == 'd' && type == 't' && f->vec == 's')
TIMEIT (run_sv_thruput, f->fun.svd);
else if (f->prec == 'd' && type == 'l' && f->vec == 's')
@@ -640,3 +575,7 @@ main (int argc, char *argv[])
}
return 0;
}
+
+#if __aarch64__ && __linux__ && WANT_SVE_TESTS && defined(__clang__)
+# pragma clang attribute pop
+#endif
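
The throughput/latency split above follows a standard idiom: throughput loops
issue independent calls, while latency loops feed each result into the next
input through a select whose mask comes from a volatile, so the compiler can
neither fold the select away nor overlap the calls. A scalar sketch of the
same trick (names hypothetical):

    static double
    measure_latency (double f (double), const double *a, int n)
    {
      volatile double vzero = 0.0; /* opaque zero: defeats constant folding */
      double z = vzero;
      double prev = 0.0;
      for (int i = 0; i < n; i++)
        prev = f (a[i] + prev * z); /* z == 0, but forms a serial chain */
      return prev;
    }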
diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h
index 84c4e68650ac..261ab02f55c3 100644
--- a/math/test/mathbench_funcs.h
+++ b/math/test/mathbench_funcs.h
@@ -1,27 +1,13 @@
/*
* Function entries for mathbench.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* clang-format off */
-D (exp, -9.9, 9.9)
-D (exp, 0.5, 1.0)
-D (exp10, -9.9, 9.9)
-D (exp2, -9.9, 9.9)
-D (log, 0.01, 11.1)
-D (log, 0.999, 1.001)
-D (log2, 0.01, 11.1)
-D (log2, 0.999, 1.001)
{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
D (xpow, 0.01, 11.1)
D (ypow, -9.9, 9.9)
-D (erf, -6.0, 6.0)
-
-F (expf, -9.9, 9.9)
-F (exp2f, -9.9, 9.9)
-F (logf, 0.01, 11.1)
-F (log2f, 0.01, 11.1)
{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
F (xpowf, 0.01, 11.1)
F (ypowf, -9.9, 9.9)
@@ -31,32 +17,105 @@ F (ypowf, -9.9, 9.9)
{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
-F (sinf, 0.1, 0.7)
-F (sinf, 0.8, 3.1)
-F (sinf, -3.1, 3.1)
-F (sinf, 3.3, 33.3)
-F (sinf, 100, 1000)
-F (sinf, 1e6, 1e32)
-F (cosf, 0.1, 0.7)
-F (cosf, 0.8, 3.1)
-F (cosf, -3.1, 3.1)
-F (cosf, 3.3, 33.3)
-F (cosf, 100, 1000)
-F (cosf, 1e6, 1e32)
-F (erff, -4.0, 4.0)
-#ifdef __vpcs
-VND (_ZGVnN2v_exp, -9.9, 9.9)
-VND (_ZGVnN2v_log, 0.01, 11.1)
-{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
-VND (_ZGVnN2v_sin, -3.1, 3.1)
-VND (_ZGVnN2v_cos, -3.1, 3.1)
-VNF (_ZGVnN4v_expf, -9.9, 9.9)
+#if WANT_TRIGPI_TESTS
+F (arm_math_cospif, -0.9, 0.9)
+D (arm_math_cospi, -0.9, 0.9)
+F (arm_math_sinpif, -0.9, 0.9)
+D (arm_math_sinpi, -0.9, 0.9)
+F (arm_math_tanpif, -0.9, 0.9)
+D (arm_math_tanpi, -0.9, 0.9)
+{"sincospif", 'f', 0, -0.9, 0.9, {.f = sincospif_wrap}},
+{"sincospi", 'd', 0, -0.9, 0.9, {.d = sincospi_wrap}},
+#endif
+#if WANT_EXPERIMENTAL_MATH
+D (arm_math_erf, -6.0, 6.0)
+F (arm_math_erff, -4.0, 4.0)
+{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
+{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}},
+{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}},
+#endif
+#if __aarch64__ && __linux__
+{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}},
+{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}},
+{"_ZGVnN4vv_hypotf", 'f', 'n', -10.0, 10.0, {.vnf = _Z_hypotf_wrap}},
+{"_ZGVnN2vv_hypot", 'd', 'n', -10.0, 10.0, {.vnd = _Z_hypot_wrap}},
+{"_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = xy_Z_pow}},
+{"x_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = x_Z_pow}},
+{"y_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = y_Z_pow}},
+{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
+{"x_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = x_Z_powf}},
+{"y_ZGVnN4vv_powf", 'f', 'n', -10.0, 10.0, {.vnf = y_Z_powf}},
+{"_ZGVnN4vl4_modff", 'f', 'n', -10.0, 10.0, {.vnf = _Z_modff_wrap}},
+{"_ZGVnN2vl8_modf", 'd', 'n', -10.0, 10.0, {.vnd = _Z_modf_wrap}},
+{"_ZGVnN4vl4l4_sincosf", 'f', 'n', -3.1, 3.1, {.vnf = _Z_sincosf_wrap}},
+{"_ZGVnN2vl8l8_sincos", 'd', 'n', -3.1, 3.1, {.vnd = _Z_sincos_wrap}},
+{"_ZGVnN4v_cexpif", 'f', 'n', -3.1, 3.1, {.vnf = _Z_cexpif_wrap}},
+{"_ZGVnN2v_cexpi", 'd', 'n', -3.1, 3.1, {.vnd = _Z_cexpi_wrap}},
VNF (_ZGVnN4v_expf_1u, -9.9, 9.9)
-VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9)
-VNF (_ZGVnN4v_logf, 0.01, 11.1)
-{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
-VNF (_ZGVnN4v_sinf, -3.1, 3.1)
-VNF (_ZGVnN4v_cosf, -3.1, 3.1)
+# if WANT_TRIGPI_TESTS
+VNF (_ZGVnN4v_cospif, -0.9, 0.9)
+VND (_ZGVnN2v_cospi, -0.9, 0.9)
+VNF (_ZGVnN4v_sinpif, -0.9, 0.9)
+VND (_ZGVnN2v_sinpi, -0.9, 0.9)
+VNF (_ZGVnN4v_tanpif, -0.9, 0.9)
+VND (_ZGVnN2v_tanpi, -0.9, 0.9)
+{"_ZGVnN4vl4l4_sincospif", 'f', 'n', -0.9, 0.9, {.vnf = _Z_sincospif_wrap}},
+{"_ZGVnN2vl8l8_sincospi", 'd', 'n', -0.9, 0.9, {.vnd = _Z_sincospi_wrap}},
+# endif
+#endif
+
+#if WANT_SVE_TESTS
+{ "_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, { .svf = _Z_sv_atan2f_wrap } },
+{ "_ZGVsMxvv_atan2", 'd', 's', -10.0, 10.0, { .svd = _Z_sv_atan2_wrap } },
+{ "_ZGVsMxvv_hypotf", 'f', 's', -10.0, 10.0, { .svf = _Z_sv_hypotf_wrap } },
+{ "_ZGVsMxvv_hypot", 'd', 's', -10.0, 10.0, { .svd = _Z_sv_hypot_wrap } },
+{"_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = xy_Z_sv_powf}},
+{"x_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = x_Z_sv_powf}},
+{"y_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = y_Z_sv_powf}},
+{"_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = xy_Z_sv_pow}},
+{"x_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = x_Z_sv_pow}},
+{"y_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = y_Z_sv_pow}},
+{"_ZGVsMxvl4_modff", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_modff_wrap}},
+{"_ZGVsMxvl8_modf", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_modf_wrap}},
+{"_ZGVsMxvl4l4_sincosf", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_sincosf_wrap}},
+{"_ZGVsMxvl8l8_sincos", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_sincos_wrap}},
+{"_ZGVsMxv_cexpif", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_cexpif_wrap}},
+{"_ZGVsMxv_cexpi", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_cexpi_wrap}},
+# if WANT_TRIGPI_TESTS
+SVF (_ZGVsMxv_cospif, -0.9, 0.9)
+SVD (_ZGVsMxv_cospi, -0.9, 0.9)
+SVF (_ZGVsMxv_sinpif, -0.9, 0.9)
+SVD (_ZGVsMxv_sinpi, -0.9, 0.9)
+SVF (_ZGVsMxv_tanpif, -0.9, 0.9)
+SVD (_ZGVsMxv_tanpi, -0.9, 0.9)
+{"_ZGVsMxvl4l4_sincospif", 'f', 's', -0.9, 0.9, {.svf = _Z_sv_sincospif_wrap}},
+{"_ZGVsMxvl8l8_sincospi", 'd', 's', -0.9, 0.9, {.svd = _Z_sv_sincospi_wrap}},
+# endif
+# if WANT_EXPERIMENTAL_MATH
+{"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}},
+{"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}},
+# endif
#endif
- /* clang-format on */
+ /* clang-format on */
+
+#define _ZSF1(fun, a, b) F (fun##f, a, b)
+#define _ZSD1(f, a, b) D (f, a, b)
+
+#define _ZVF1(fun, a, b) VNF (_ZGVnN4v_##fun##f, a, b)
+#define _ZVD1(f, a, b) VND (_ZGVnN2v_##f, a, b)
+
+#define _ZSVF1(fun, a, b) SVF (_ZGVsMxv_##fun##f, a, b)
+#define _ZSVD1(f, a, b) SVD (_ZGVsMxv_##f, a, b)
+
+/* No auto-generated wrappers for binary functions - they have to be
+   manually defined in mathbench_wrappers.h. We still have to define silent
+   macros for them, as they will be emitted by TEST_SIG. */
+#define _ZSF2(...)
+#define _ZSD2(...)
+#define _ZVF2(...)
+#define _ZVD2(...)
+#define _ZSVF2(...)
+#define _ZSVD2(...)
+
+#include "test/mathbench_funcs_gen.h"
diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h
index 062b9db56de5..32dcee36530a 100644
--- a/math/test/mathbench_wrappers.h
+++ b/math/test/mathbench_wrappers.h
@@ -1,24 +1,314 @@
/*
* Function wrappers for mathbench.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifdef __vpcs
+#if WANT_EXPERIMENTAL_MATH
+static double
+atan2_wrap (double x)
+{
+ return atan2 (5.0, x);
+}
+
+static float
+atan2f_wrap (float x)
+{
+ return atan2f (5.0f, x);
+}
+
+static double
+powi_wrap (double x)
+{
+ return __builtin_powi (x, (int) round (x));
+}
+#endif /* WANT_EXPERIMENTAL_MATH. */
+
+#if __aarch64__ && __linux__
+
+__vpcs static float32x4_t
+_Z_sincospif_wrap (float32x4_t x)
+{
+ float s[4], c[4];
+ _ZGVnN4vl4l4_sincospif (x, s, c);
+ return vld1q_f32 (s) + vld1q_f32 (c);
+}
+
+__vpcs static float64x2_t
+_Z_sincospi_wrap (float64x2_t x)
+{
+ double s[2], c[2];
+ _ZGVnN2vl8l8_sincospi (x, s, c);
+ return vld1q_f64 (s) + vld1q_f64 (c);
+}
-__vpcs static v_float
-xy_Z_powf (v_float x)
+__vpcs static float64x2_t
+_Z_atan2_wrap (float64x2_t x)
+{
+ return _ZGVnN2vv_atan2 (vdupq_n_f64 (5.0), x);
+}
+
+__vpcs static float32x4_t
+_Z_atan2f_wrap (float32x4_t x)
+{
+ return _ZGVnN4vv_atan2f (vdupq_n_f32 (5.0f), x);
+}
+
+__vpcs static float32x4_t
+_Z_hypotf_wrap (float32x4_t x)
+{
+ return _ZGVnN4vv_hypotf (vdupq_n_f32 (5.0f), x);
+}
+
+__vpcs static float64x2_t
+_Z_hypot_wrap (float64x2_t x)
+{
+ return _ZGVnN2vv_hypot (vdupq_n_f64 (5.0), x);
+}
+
+__vpcs static float32x4_t
+xy_Z_powf (float32x4_t x)
{
return _ZGVnN4vv_powf (x, x);
}
-__vpcs static v_double
-xy_Z_pow (v_double x)
+__vpcs static float32x4_t
+x_Z_powf (float32x4_t x)
+{
+ return _ZGVnN4vv_powf (x, vdupq_n_f32 (23.4));
+}
+
+__vpcs static float32x4_t
+y_Z_powf (float32x4_t x)
+{
+ return _ZGVnN4vv_powf (vdupq_n_f32 (2.34), x);
+}
+
+__vpcs static float64x2_t
+xy_Z_pow (float64x2_t x)
{
return _ZGVnN2vv_pow (x, x);
}
+__vpcs static float64x2_t
+x_Z_pow (float64x2_t x)
+{
+ return _ZGVnN2vv_pow (x, vdupq_n_f64 (23.4));
+}
+
+__vpcs static float64x2_t
+y_Z_pow (float64x2_t x)
+{
+ return _ZGVnN2vv_pow (vdupq_n_f64 (2.34), x);
+}
+
+__vpcs static float32x4_t
+_Z_modff_wrap (float32x4_t x)
+{
+ float y[4];
+ float32x4_t ret = _ZGVnN4vl4_modff (x, y);
+ return ret + vld1q_f32 (y);
+}
+
+__vpcs static float64x2_t
+_Z_modf_wrap (float64x2_t x)
+{
+ double y[2];
+ float64x2_t ret = _ZGVnN2vl8_modf (x, y);
+ return ret + vld1q_f64 (y);
+}
+
+__vpcs static float32x4_t
+_Z_sincosf_wrap (float32x4_t x)
+{
+ float s[4], c[4];
+ _ZGVnN4vl4l4_sincosf (x, s, c);
+ return vld1q_f32 (s) + vld1q_f32 (c);
+}
+
+__vpcs static float32x4_t
+_Z_cexpif_wrap (float32x4_t x)
+{
+ float32x4x2_t sc = _ZGVnN4v_cexpif (x);
+ return sc.val[0] + sc.val[1];
+}
+
+__vpcs static float64x2_t
+_Z_sincos_wrap (float64x2_t x)
+{
+ double s[2], c[2];
+ _ZGVnN2vl8l8_sincos (x, s, c);
+ return vld1q_f64 (s) + vld1q_f64 (c);
+}
+
+__vpcs static float64x2_t
+_Z_cexpi_wrap (float64x2_t x)
+{
+ float64x2x2_t sc = _ZGVnN2v_cexpi (x);
+ return sc.val[0] + sc.val[1];
+}
+
+#endif
+
+#if WANT_SVE_TESTS
+
+static svfloat32_t
+_Z_sv_atan2f_wrap (svfloat32_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_atan2f (x, svdup_f32 (5.0f), pg);
+}
+
+static svfloat64_t
+_Z_sv_atan2_wrap (svfloat64_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_atan2 (x, svdup_f64 (5.0), pg);
+}
+
+static svfloat32_t
+_Z_sv_hypotf_wrap (svfloat32_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_hypotf (x, svdup_f32 (5.0), pg);
+}
+
+static svfloat64_t
+_Z_sv_hypot_wrap (svfloat64_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_hypot (x, svdup_f64 (5.0), pg);
+}
+
+static svfloat32_t
+xy_Z_sv_powf (svfloat32_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_powf (x, x, pg);
+}
+
+static svfloat32_t
+x_Z_sv_powf (svfloat32_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_powf (x, svdup_f32 (23.4f), pg);
+}
+
+static svfloat32_t
+y_Z_sv_powf (svfloat32_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_powf (svdup_f32 (2.34f), x, pg);
+}
+
+static svfloat64_t
+xy_Z_sv_pow (svfloat64_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_pow (x, x, pg);
+}
+
+static svfloat64_t
+x_Z_sv_pow (svfloat64_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_pow (x, svdup_f64 (23.4), pg);
+}
+
+static svfloat64_t
+y_Z_sv_pow (svfloat64_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_pow (svdup_f64 (2.34), x, pg);
+}
+
+static svfloat32_t
+_Z_sv_sincospif_wrap (svfloat32_t x, svbool_t pg)
+{
+ float s[svcntw ()], c[svcntw ()];
+ _ZGVsMxvl4l4_sincospif (x, s, c, pg);
+ return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
+}
+
+static svfloat64_t
+_Z_sv_sincospi_wrap (svfloat64_t x, svbool_t pg)
+{
+ double s[svcntd ()], c[svcntd ()];
+ _ZGVsMxvl8l8_sincospi (x, s, c, pg);
+ return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
+}
+
+static svfloat32_t
+_Z_sv_modff_wrap (svfloat32_t x, svbool_t pg)
+{
+ float i[svcntw ()];
+ svfloat32_t r = _ZGVsMxvl4_modff (x, i, pg);
+ return svadd_x (pg, r, svld1 (pg, i));
+}
+
+static svfloat64_t
+_Z_sv_modf_wrap (svfloat64_t x, svbool_t pg)
+{
+ double i[svcntd ()];
+ svfloat64_t r = _ZGVsMxvl8_modf (x, i, pg);
+ return svadd_x (pg, r, svld1 (pg, i));
+}
+
+static svfloat32_t
+_Z_sv_sincosf_wrap (svfloat32_t x, svbool_t pg)
+{
+ float s[svcntw ()], c[svcntw ()];
+ _ZGVsMxvl4l4_sincosf (x, s, c, pg);
+  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
+}
+
+static svfloat32_t
+_Z_sv_cexpif_wrap (svfloat32_t x, svbool_t pg)
+{
+ svfloat32x2_t sc = _ZGVsMxv_cexpif (x, pg);
+ return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
+}
+
+static svfloat64_t
+_Z_sv_sincos_wrap (svfloat64_t x, svbool_t pg)
+{
+ double s[svcntd ()], c[svcntd ()];
+ _ZGVsMxvl8l8_sincos (x, s, c, pg);
+  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
+}
+
+static svfloat64_t
+_Z_sv_cexpi_wrap (svfloat64_t x, svbool_t pg)
+{
+ svfloat64x2_t sc = _ZGVsMxv_cexpi (x, pg);
+ return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
+}
+
+# if WANT_EXPERIMENTAL_MATH
+
+static svfloat32_t
+_Z_sv_powi_wrap (svfloat32_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg);
+}
+
+static svfloat64_t
+_Z_sv_powk_wrap (svfloat64_t x, svbool_t pg)
+{
+ return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg);
+}
+
+# endif
+
+#endif
+
+#if __aarch64__
+static float
+sincospif_wrap (float x)
+{
+ float s, c;
+ arm_math_sincospif (x, &s, &c);
+ return s + c;
+}
+
+static double
+sincospi_wrap (double x)
+{
+ double s, c;
+ arm_math_sincospi (x, &s, &c);
+ return s + c;
+}
#endif
static double
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index 834233fdde9d..6e81f0d7b634 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,10 +1,12 @@
/*
* mathtest.c - test rig for mathlib
*
- * Copyright (c) 1998-2023, Arm Limited.
+ * Copyright (c) 1998-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+/* clang-format off */
+#define _GNU_SOURCE
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
@@ -196,11 +198,9 @@ int is_complex_rettype(int rettype) {
#define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name }
#define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name }
-#ifndef PL
/* sincosf wrappers for easier testing. */
static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; }
static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; }
-#endif
test_func tfuncs[] = {
/* trigonometric */
@@ -220,10 +220,9 @@ test_func tfuncs[] = {
TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT),
TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4),
-#ifndef PL
TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4),
-#endif
+
/* hyperbolic */
TFUNC(at_d, rt_d, atanh, 4*ULPUNIT),
TFUNC(at_d, rt_d, asinh, 4*ULPUNIT),
@@ -254,7 +253,9 @@ test_func tfuncs[] = {
TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4),
TFUNC(at_s,rt_s, expm1f, ULPUNIT),
+#if WANT_EXP10_TESTS
TFUNC(at_d,rt_d, exp10, ULPUNIT),
+#endif
/* power */
TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4),
@@ -1707,3 +1708,4 @@ void undef_func() {
failed++;
puts("ERROR: undefined function called");
}
+/* clang-format on */
diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c
index 5b3e9b4f18e4..dd8ceb068141 100644
--- a/math/test/rtest/dotest.c
+++ b/math/test/rtest/dotest.c
@@ -1,7 +1,7 @@
/*
* dotest.c - actually generate mathlib test cases
*
- * Copyright (c) 1999-2019, Arm Limited.
+ * Copyright (c) 1999-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -18,6 +18,35 @@
#define MPFR_PREC 96 /* good enough for float or double + a few extra bits */
+#if MPFR_VERSION < MPFR_VERSION_NUM(4, 2, 0)
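+/* mpfr_sinpi/cospi/tanpi only exist from MPFR 4.2.0; on older versions,
+ fall back to multiplying the argument by pi at MPFR_PREC bits. The rnd
+ parameter is accepted for signature compatibility only; intermediates are
+ computed with GMP_RNDN. */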
+int
+mpfr_tanpi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd)
+{
+ MPFR_DECL_INIT (frd, MPFR_PREC);
+ mpfr_const_pi (frd, GMP_RNDN);
+ mpfr_mul (frd, frd, arg, GMP_RNDN);
+ return mpfr_tan (ret, frd, GMP_RNDN);
+}
+
+int
+mpfr_sinpi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd)
+{
+ MPFR_DECL_INIT (frd, MPFR_PREC);
+ mpfr_const_pi (frd, GMP_RNDN);
+ mpfr_mul (frd, frd, arg, GMP_RNDN);
+ return mpfr_sin (ret, frd, GMP_RNDN);
+}
+
+int
+mpfr_cospi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd)
+{
+ MPFR_DECL_INIT (frd, MPFR_PREC);
+ mpfr_const_pi (frd, GMP_RNDN);
+ mpfr_mul (frd, frd, arg, GMP_RNDN);
+ return mpfr_cos (ret, frd, GMP_RNDN);
+}
+#endif
+
extern int lib_fo, lib_no_arith, ntests;
/*
@@ -454,6 +483,7 @@ void universal_wrapper(wrapperctx *ctx)
}
}
+/* clang-format off */
Testable functions[] = {
/*
* Trig functions: sin, cos, tan. We test the core function
@@ -479,6 +509,18 @@ Testable functions[] = {
cases_uniform_float, 0x39800000, 0x41800000},
{"sincosf_cosf", (funcptr)mpfr_cos, args1f, {NULL},
cases_uniform_float, 0x39800000, 0x41800000},
+ {"sinpi", (funcptr)mpfr_sinpi, args1, {NULL},
+ cases_uniform, 0x3e400000, 0x40300000},
+ {"sinpif", (funcptr)mpfr_sinpi, args1f, {NULL},
+ cases_uniform_float, 0x39800000, 0x41800000},
+ {"cospi", (funcptr)mpfr_cospi, args1, {NULL},
+ cases_uniform, 0x3e400000, 0x40300000},
+ {"cospif", (funcptr)mpfr_cospi, args1f, {NULL},
+ cases_uniform_float, 0x39800000, 0x41800000},
+ {"tanpi", (funcptr)mpfr_tanpi, args1, {NULL},
+ cases_uniform, 0x3e400000, 0x40300000},
+ {"tanpif", (funcptr)mpfr_tanpi, args1f, {NULL},
+ cases_uniform_float, 0x39800000, 0x41800000},
/*
* Inverse trig: asin, acos. Between 1 and -1, of course. acos
* goes down to 2^-54, asin to 2^-27.
@@ -708,6 +750,7 @@ Testable functions[] = {
{"tgammaf", (funcptr)mpfr_gamma, args1f, {NULL}, cases_uniform_float, 0x2f800000, 0x43000000},
{"tgamma", (funcptr)mpfr_gamma, args1, {NULL}, cases_uniform, 0x3c000000, 0x40800000},
};
+/* clang-format on */
const int nfunctions = ( sizeof(functions)/sizeof(*functions) );
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index e2e03e3ae761..672908f355c4 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,7 +2,7 @@
# ULP error check script.
#
-# Copyright (c) 2019-2023, Arm Limited.
+# Copyright (c) 2019-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
#set -x
@@ -20,260 +20,83 @@ FAIL=0
PASS=0
t() {
- [ $r = "n" ] && Lt=$L || Lt=$Ldir
- $emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
+ # First argument: routine name
+ routine=$1; shift
+ # Second and third argument: lo and hi bounds
+ # Extra processing needed for bivariate routines
+ IFS=',' read -ra LO <<< "$1"; shift
+ IFS=',' read -ra HI <<< "$1"; shift
+ ITV="${LO[0]} ${HI[0]}"
+ for i in "${!LO[@]}"; do
+ [[ "$i" -eq "0" ]] || ITV="$ITV x ${LO[$i]} ${HI[$i]}"
+ done
+ # Fourth argument: number of test points
+ n=$1; shift
+ # Any remaining arguments are forwarded directly to the ulp tool
+ extra_flags="$@"
+
+ # Read ULP limits, fenv expectation and control values from autogenerated files
+ limits_file=$LIMITS
+ [ $r == "n" ] || limits_file=${limits_file}_nn
+ L=$(grep "^$routine " $limits_file | awk '{print $2}')
+ [ -n "$L" ] || { echo ERROR: Could not determine ULP limit for $routine in $limits_file && false; }
+ cvals=($(grep "^$routine " $CVALS | awk '{print $2}'))
+
+ if grep -q "^$routine$" $DISABLE_FENV; then extra_flags="$extra_flags -f"; fi
+ # Emulate a do-while loop to loop over cvals, but still execute once if it is empty
+ while : ; do
+ # Empty string if we are at the end of cvals array
+ c_arg=""
+ [ -z "${cvals[0]:-}" ] || c_arg="-c ${cvals[0]}"
+ $emu ./ulp -e $L $flags $extra_flags -r $r $c_arg $routine $ITV $n && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
+ # Shift cvals by 1, and break if it is now empty
+ cvals=("${cvals[@]:1}")
+ [ -n "${cvals[0]:-}" ] || break
+ done
+
}
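+
+# A sketch of the input formats t() consumes (inferred from the parsing
+# above; file names are supplied via environment variables):
+#   $LIMITS        "<routine> <ulp-limit>" pairs,          e.g.  exp 1.9
+#   $CVALS         "<routine> <control-value>" pairs,      e.g.  pow 0x1p-1
+#   $DISABLE_FENV  one routine name per line
+#   $GEN_ITVS      "<routine> <lo[,lo2]> <hi[,hi2]> <n>",  e.g.  pow 0.5,0 2.0,inf 20000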
check() {
- $emu ./ulp -f -q "$@" >/dev/null
+ $emu ./ulp -f -q "$@"
}
-Ldir=0.5
+if [[ $WANT_EXPERIMENTAL_MATH -eq 1 ]] && [[ $WANT_SVE_TESTS -eq 1 ]] && [[ $USE_MPFR -eq 0 ]]; then
+ # No guarantees about powi accuracy, so regression-test for exactness
+ # w.r.t. the custom reference impl in ulp_wrappers.h
+ if [ -z "$FUNC" ] || [ "$FUNC" == "_ZGVsMxvv_powi" ]; then
+ check -q -f -e 0 _ZGVsMxvv_powi 0 inf x 0 1000 100000
+ check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x 0 1000 100000
+ check -q -f -e 0 _ZGVsMxvv_powi 0 inf x -0 -1000 100000
+ check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x -0 -1000 100000
+ fi
+ if [ -z "$FUNC" ] || [ "$FUNC" == "_ZGVsMxvv_powk" ]; then
+ check -q -f -e 0 _ZGVsMxvv_powk 0 inf x 0 1000 100000
+ check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x 0 1000 100000
+ check -q -f -e 0 _ZGVsMxvv_powk 0 inf x -0 -1000 100000
+ check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x -0 -1000 100000
+ fi
+fi
+
+# Test generic routines in all rounding modes
for r in $rmodes
do
-L=0.01
-t exp 0 0xffff000000000000 10000
-t exp 0x1p-6 0x1p6 40000
-t exp -0x1p-6 -0x1p6 40000
-t exp 633.3 733.3 10000
-t exp -633.3 -777.3 10000
-
-L=0.01
-t exp2 0 0xffff000000000000 10000
-t exp2 0x1p-6 0x1p6 40000
-t exp2 -0x1p-6 -0x1p6 40000
-t exp2 633.3 733.3 10000
-t exp2 -633.3 -777.3 10000
-
-L=0.02
-t log 0 0xffff000000000000 10000
-t log 0x1p-4 0x1p4 40000
-t log 0 inf 40000
-
-L=0.05
-t log2 0 0xffff000000000000 10000
-t log2 0x1p-4 0x1p4 40000
-t log2 0 inf 40000
-
-L=0.05
-t pow 0.5 2.0 x 0 inf 20000
-t pow -0.5 -2.0 x 0 inf 20000
-t pow 0.5 2.0 x -0 -inf 20000
-t pow -0.5 -2.0 x -0 -inf 20000
-t pow 0.5 2.0 x 0x1p-10 0x1p10 40000
-t pow 0.5 2.0 x -0x1p-10 -0x1p10 40000
-t pow 0 inf x 0.5 2.0 80000
-t pow 0 inf x -0.5 -2.0 80000
-t pow 0x1.fp-1 0x1.08p0 x 0x1p8 0x1p17 80000
-t pow 0x1.fp-1 0x1.08p0 x -0x1p8 -0x1p17 80000
-t pow 0 0x1p-1000 x 0 1.0 50000
-t pow 0x1p1000 inf x 0 1.0 50000
-t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
-t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
-t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
-
-L=0.02
-t exp10 0 0x1p-47 5000
-t exp10 -0 -0x1p-47 5000
-t exp10 0x1p-47 1 50000
-t exp10 -0x1p-47 -1 50000
-t exp10 1 0x1.34413509f79ffp8 50000
-t exp10 -1 -0x1.434e6420f4374p8 50000
-t exp10 0x1.34413509f79ffp8 inf 5000
-t exp10 -0x1.434e6420f4374p8 -inf 5000
-
-L=1.0
-Ldir=0.9
-t erf 0 0xffff000000000000 10000
-t erf 0x1p-1022 0x1p-26 40000
-t erf -0x1p-1022 -0x1p-26 40000
-t erf 0x1p-26 0x1p3 40000
-t erf -0x1p-26 -0x1p3 40000
-t erf 0 inf 40000
-Ldir=0.5
-
-L=0.01
-t expf 0 0xffff0000 10000
-t expf 0x1p-14 0x1p8 50000
-t expf -0x1p-14 -0x1p8 50000
-
-L=0.01
-t exp2f 0 0xffff0000 10000
-t exp2f 0x1p-14 0x1p8 50000
-t exp2f -0x1p-14 -0x1p8 50000
-
-L=0.32
-t logf 0 0xffff0000 10000
-t logf 0x1p-4 0x1p4 50000
-t logf 0 inf 50000
-
-L=0.26
-t log2f 0 0xffff0000 10000
-t log2f 0x1p-4 0x1p4 50000
-t log2f 0 inf 50000
-
-L=0.06
-t sinf 0 0xffff0000 10000
-t sinf 0x1p-14 0x1p54 50000
-t sinf -0x1p-14 -0x1p54 50000
-
-L=0.06
-t cosf 0 0xffff0000 10000
-t cosf 0x1p-14 0x1p54 50000
-t cosf -0x1p-14 -0x1p54 50000
-
-L=0.06
-t sincosf_sinf 0 0xffff0000 10000
-t sincosf_sinf 0x1p-14 0x1p54 50000
-t sincosf_sinf -0x1p-14 -0x1p54 50000
-
-L=0.06
-t sincosf_cosf 0 0xffff0000 10000
-t sincosf_cosf 0x1p-14 0x1p54 50000
-t sincosf_cosf -0x1p-14 -0x1p54 50000
-
-L=0.4
-t powf 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000
-t powf 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000
-t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000
-t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000
-t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000
-t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
-
-L=0.6
-Ldir=0.9
-t erff 0 0xffff0000 10000
-t erff 0x1p-127 0x1p-26 40000
-t erff -0x1p-127 -0x1p-26 40000
-t erff 0x1p-26 0x1p3 40000
-t erff -0x1p-26 -0x1p3 40000
-t erff 0 inf 40000
-Ldir=0.5
-
+ while read F LO HI N
+ do
+ [[ -z $F ]] || t $F $LO $HI $N
+ done << EOF
+$(grep "\b$FUNC\b" $GEN_ITVS)
+EOF
done
-# vector functions
-
-Ldir=0.5
-r='n'
-flags="${ULPFLAGS:--q}"
-
-range_exp='
- 0 0xffff000000000000 10000
- 0x1p-6 0x1p6 400000
- -0x1p-6 -0x1p6 400000
- 633.3 733.3 10000
- -633.3 -777.3 10000
-'
-
-range_log='
- 0 0xffff000000000000 10000
- 0x1p-4 0x1p4 400000
- 0 inf 400000
-'
-
-range_pow='
- 0x1p-1 0x1p1 x 0x1p-10 0x1p10 50000
- 0x1p-1 0x1p1 x -0x1p-10 -0x1p10 50000
- 0x1p-500 0x1p500 x 0x1p-1 0x1p1 50000
- 0x1p-500 0x1p500 x -0x1p-1 -0x1p1 50000
- 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p16 50000
- 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p16 50000
-'
-
-range_sin='
- 0 0x1p23 500000
- -0 -0x1p23 500000
- 0x1p23 inf 10000
- -0x1p23 -inf 10000
-'
-range_cos="$range_sin"
-
-range_expf='
- 0 0xffff0000 10000
- 0x1p-14 0x1p8 500000
- -0x1p-14 -0x1p8 500000
-'
-
-range_expf_1u="$range_expf"
-range_exp2f="$range_expf"
-range_exp2f_1u="$range_expf"
-
-range_logf='
- 0 0xffff0000 10000
- 0x1p-4 0x1p4 500000
-'
-
-range_sinf='
- 0 0x1p20 500000
- -0 -0x1p20 500000
- 0x1p20 inf 10000
- -0x1p20 -inf 10000
-'
-range_cosf="$range_sinf"
-
-range_powf='
- 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000
- 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000
- 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000
- 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000
- 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000
- 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
-'
-
-# error limits
-L_exp=1.9
-L_log=1.2
-L_pow=0.05
-L_sin=3.0
-L_cos=3.0
-L_expf=1.49
-L_expf_1u=0.4
-L_exp2f=1.49
-L_exp2f_1u=0.4
-L_logf=2.9
-L_sinf=1.4
-L_cosf=1.4
-L_powf=2.1
-
-while read G F D
+# Only test arch-specific routines in round-to-nearest, with sign of zero ignored (-z flag)
+r=n
+while read F LO HI N
do
- case "$G" in \#*) continue ;; esac
- eval range="\${range_$G}"
- eval L="\${L_$G}"
- while read X
- do
- [ -n "$X" ] || continue
- case "$X" in \#*) continue ;; esac
- disable_fenv=""
- if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then
- # If library was built with SIMD exceptions
- # disabled, disable fenv checking in ulp
- # tool. Otherwise, fenv checking may still be
- # disabled by adding -f to the end of the run
- # line.
- disable_fenv="-f"
- fi
- t $D $disable_fenv $F $X
- done << EOF
-$range
-
-EOF
+ [[ -z $F ]] || t $F $LO $HI $N -z
done << EOF
-# group symbol run
-exp _ZGVnN2v_exp
-log _ZGVnN2v_log
-pow _ZGVnN2vv_pow -f
-sin _ZGVnN2v_sin -z
-cos _ZGVnN2v_cos
-expf _ZGVnN4v_expf
-expf_1u _ZGVnN4v_expf_1u -f
-exp2f _ZGVnN4v_exp2f
-exp2f_1u _ZGVnN4v_exp2f_1u -f
-logf _ZGVnN4v_logf
-sinf _ZGVnN4v_sinf -z
-cosf _ZGVnN4v_cosf
-powf _ZGVnN4vv_powf -f
+$(grep "\b$FUNC\b" $ARCH_ITVS)
EOF
[ 0 -eq $FAIL ] || {
diff --git a/math/test/test_defs.h b/math/test/test_defs.h
new file mode 100644
index 000000000000..d0656c9e1d84
--- /dev/null
+++ b/math/test/test_defs.h
@@ -0,0 +1,31 @@
+/*
+ * Helper macros for emitting various details about routines for consumption by
+ * runulp.sh.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception.
+ */
+
+#define TEST_ULP(f, l) TEST_ULP f l
+#define TEST_ULP_NONNEAREST(f, l) TEST_ULP_NONNEAREST f l
+
+/* TEST_DISABLE_FENV (f) emits the routine name to mark that fenv checking
+ should be skipped for it. TEST_DISABLE_FENV_IF_NOT (f, e) emits the marker
+ only when e == 0, so it can be made conditional on flags such as
+ WANT_SIMD_EXCEPT; expansion is deferred by one pass so that those flags are
+ expanded properly. */
+#define TEST_DISABLE_FENV(f) TEST_DISABLE_FENV f
+#define TEST_DISABLE_FENV_IF_NOT(f, e) TEST_DISABLE_FENV_IF_NOT_ (f, e)
+#define TEST_DISABLE_FENV_IF_NOT_(f, e) TEST_DISABLE_FENV_IF_NOT_##e (f)
+#define TEST_DISABLE_FENV_IF_NOT_0(f) TEST_DISABLE_FENV (f)
+#define TEST_DISABLE_FENV_IF_NOT_1(f)
+
+#define TEST_INTERVAL(f, lo, hi, n) TEST_INTERVAL f lo hi n
+#define TEST_SYM_INTERVAL(f, lo, hi, n) \
+ TEST_INTERVAL (f, lo, hi, n) \
+ TEST_INTERVAL (f, -lo, -hi, n)
+// clang-format off
+#define TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) \
+ TEST_INTERVAL f xlo,ylo xhi,yhi n
+// clang-format on
+
+#define TEST_CONTROL_VALUE(f, c) TEST_CONTROL_VALUE f c
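+
+/* As a hedged illustration (routine names and values hypothetical), these
+ macros expand to flat, greppable markers rather than C code:
+
+ TEST_ULP (expf, 1.49)                  ->  TEST_ULP expf 1.49
+ TEST_INTERVAL (expf, 0, 0x1p8, 50000)  ->  TEST_INTERVAL expf 0 0x1p8 50000
+ TEST_SYM_INTERVAL emits both the interval and its negation.
+
+ A build step presumably collects these markers from the preprocessed
+ sources into the autogenerated tables that runulp.sh reads. */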
diff --git a/pl/math/test/testcases/directed/acos.tst b/math/test/testcases/directed/acos.tst
index a73dcd25965b..7889e62f4459 100644
--- a/pl/math/test/testcases/directed/acos.tst
+++ b/math/test/testcases/directed/acos.tst
@@ -1,6 +1,6 @@
; acos.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=acos op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/acosf.tst b/math/test/testcases/directed/acosf.tst
index 9e453e3bff5e..0c2165967abb 100644
--- a/pl/math/test/testcases/directed/acosf.tst
+++ b/math/test/testcases/directed/acosf.tst
@@ -1,6 +1,6 @@
; acosf.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=acosf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/acosh.tst b/math/test/testcases/directed/acosh.tst
index dd962bd391da..b78d64bb8ea7 100644
--- a/pl/math/test/testcases/directed/acosh.tst
+++ b/math/test/testcases/directed/acosh.tst
@@ -1,6 +1,6 @@
; acosh.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=acosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/acoshf.tst b/math/test/testcases/directed/acoshf.tst
index 606c615f9b74..9eec2caf014d 100644
--- a/pl/math/test/testcases/directed/acoshf.tst
+++ b/math/test/testcases/directed/acoshf.tst
@@ -1,6 +1,6 @@
; acoshf.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=acoshf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/asin.tst b/math/test/testcases/directed/asin.tst
index 6180d7849d90..7b916f3624c0 100644
--- a/pl/math/test/testcases/directed/asin.tst
+++ b/math/test/testcases/directed/asin.tst
@@ -1,6 +1,6 @@
; asin.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=asin op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/asinf.tst b/math/test/testcases/directed/asinf.tst
index a85b2593768d..d5830b99b620 100644
--- a/pl/math/test/testcases/directed/asinf.tst
+++ b/math/test/testcases/directed/asinf.tst
@@ -1,6 +1,6 @@
; asinf.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=asinf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/asinh.tst b/math/test/testcases/directed/asinh.tst
index 1485dfeffecf..9b250a14f50c 100644
--- a/pl/math/test/testcases/directed/asinh.tst
+++ b/math/test/testcases/directed/asinh.tst
@@ -1,6 +1,6 @@
; asinh.tst
;
-; Copyright (c) 2022-2023, Arm Limited.
+; Copyright (c) 2022-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=asinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/asinhf.tst b/math/test/testcases/directed/asinhf.tst
index eb76a5892a70..f2410e09b03e 100644
--- a/pl/math/test/testcases/directed/asinhf.tst
+++ b/math/test/testcases/directed/asinhf.tst
@@ -1,6 +1,6 @@
; asinhf.tst
;
-; Copyright (c) 2007-2023, Arm Limited.
+; Copyright (c) 2007-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=asinhf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/atan.tst b/math/test/testcases/directed/atan.tst
index 4c670553d58f..d29b13245cd5 100644
--- a/pl/math/test/testcases/directed/atan.tst
+++ b/math/test/testcases/directed/atan.tst
@@ -1,6 +1,6 @@
; atan.tst
;
-; Copyright (c) 1999-2023, Arm Limited.
+; Copyright (c) 1999-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/atan2.tst b/math/test/testcases/directed/atan2.tst
index 647b3764072c..3e34e7641f28 100644
--- a/pl/math/test/testcases/directed/atan2.tst
+++ b/math/test/testcases/directed/atan2.tst
@@ -1,6 +1,6 @@
; atan2.tst
;
-; Copyright (c) 1999-2023, Arm Limited.
+; Copyright (c) 1999-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
diff --git a/pl/math/test/testcases/directed/atan2f.tst b/math/test/testcases/directed/atan2f.tst
index 85c5c5d47e10..e637fe0eba24 100644
--- a/pl/math/test/testcases/directed/atan2f.tst
+++ b/math/test/testcases/directed/atan2f.tst
@@ -1,6 +1,6 @@
; atan2f.tst
;
-; Copyright (c) 1999-2023, Arm Limited.
+; Copyright (c) 1999-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=atan2f op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
diff --git a/pl/math/test/testcases/directed/atanf.tst b/math/test/testcases/directed/atanf.tst
index 0a0bfc24c605..8739ea89c3a2 100644
--- a/pl/math/test/testcases/directed/atanf.tst
+++ b/math/test/testcases/directed/atanf.tst
@@ -1,6 +1,6 @@
; atanf.tst
;
-; Copyright (c) 2007-2023, Arm Limited.
+; Copyright (c) 2007-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=atanf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/atanh.tst b/math/test/testcases/directed/atanh.tst
index d96ff327fcd9..7ba297e5046c 100644
--- a/pl/math/test/testcases/directed/atanh.tst
+++ b/math/test/testcases/directed/atanh.tst
@@ -1,6 +1,6 @@
; atanh.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=atanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/atanhf.tst b/math/test/testcases/directed/atanhf.tst
index 21a68a661a11..010012831b3c 100644
--- a/pl/math/test/testcases/directed/atanhf.tst
+++ b/math/test/testcases/directed/atanhf.tst
@@ -1,6 +1,6 @@
; atanhf.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=atanhf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/cbrtf.tst b/math/test/testcases/directed/cbrtf.tst
index 0dd8d09f1d4f..98942580c7a7 100644
--- a/pl/math/test/testcases/directed/cbrtf.tst
+++ b/math/test/testcases/directed/cbrtf.tst
@@ -1,6 +1,6 @@
; cbrtf.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=cbrtf op1=7f800000 result=7f800000 errno=0
diff --git a/pl/math/test/testcases/directed/cosh.tst b/math/test/testcases/directed/cosh.tst
index c4efacb7272d..4dc6fe4846dc 100644
--- a/pl/math/test/testcases/directed/cosh.tst
+++ b/math/test/testcases/directed/cosh.tst
@@ -1,6 +1,6 @@
; cosh.tst
;
-; Copyright (c) 1999-2023, Arm Limited.
+; Copyright (c) 1999-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=cosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/coshf.tst b/math/test/testcases/directed/coshf.tst
index 2b967e78f4b4..d224baf486a5 100644
--- a/pl/math/test/testcases/directed/coshf.tst
+++ b/math/test/testcases/directed/coshf.tst
@@ -1,6 +1,6 @@
; coshf.tst
;
-; Copyright (c) 2007-2023, Arm Limited.
+; Copyright (c) 2007-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=coshf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/erfc.tst b/math/test/testcases/directed/erfc.tst
index c03fc591da47..249e7343eac2 100644
--- a/pl/math/test/testcases/directed/erfc.tst
+++ b/math/test/testcases/directed/erfc.tst
@@ -1,6 +1,6 @@
; erfc.tst - Directed test cases for erfc
;
-; Copyright (c) 2022-2023, Arm Limited.
+; Copyright (c) 2022-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=erfc op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/erfcf.tst b/math/test/testcases/directed/erfcf.tst
index 719baccb2e45..22a1a8f236d8 100644
--- a/pl/math/test/testcases/directed/erfcf.tst
+++ b/math/test/testcases/directed/erfcf.tst
@@ -1,6 +1,6 @@
; erfcf.tst - Directed test cases for erfcf
;
-; Copyright (c) 2007-2023, Arm Limited.
+; Copyright (c) 2007-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=erfcf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/expm1.tst b/math/test/testcases/directed/expm1.tst
index 609d6f479721..3d58c6b3f161 100644
--- a/pl/math/test/testcases/directed/expm1.tst
+++ b/math/test/testcases/directed/expm1.tst
@@ -1,6 +1,6 @@
; expm1.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=expm1 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/expm1f.tst b/math/test/testcases/directed/expm1f.tst
index 44c38420a617..44a15d679870 100644
--- a/pl/math/test/testcases/directed/expm1f.tst
+++ b/math/test/testcases/directed/expm1f.tst
@@ -1,6 +1,6 @@
; expm1f.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=expm1f op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/log10.tst b/math/test/testcases/directed/log10.tst
index 34831436234a..3ff252013498 100644
--- a/pl/math/test/testcases/directed/log10.tst
+++ b/math/test/testcases/directed/log10.tst
@@ -1,6 +1,6 @@
; log10.tst
;
-; Copyright (c) 2007-2023, Arm Limited.
+; Copyright (c) 2007-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=log10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/log10f.tst b/math/test/testcases/directed/log10f.tst
index d5744a66f092..5c83e3f5e9b4 100644
--- a/pl/math/test/testcases/directed/log10f.tst
+++ b/math/test/testcases/directed/log10f.tst
@@ -1,6 +1,6 @@
; log10f.tst
;
-; Copyright (c) 2007-2023, Arm Limited.
+; Copyright (c) 2007-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=log10f op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/log1p.tst b/math/test/testcases/directed/log1p.tst
index 9ee8c62fc9c0..109413a79e96 100644
--- a/pl/math/test/testcases/directed/log1p.tst
+++ b/math/test/testcases/directed/log1p.tst
@@ -1,6 +1,6 @@
; log1p.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=log1p op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/log1pf.tst b/math/test/testcases/directed/log1pf.tst
index aaa01d67c2b3..9655b9473612 100644
--- a/pl/math/test/testcases/directed/log1pf.tst
+++ b/math/test/testcases/directed/log1pf.tst
@@ -1,6 +1,6 @@
; log1pf.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=log1pf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/sinh.tst b/math/test/testcases/directed/sinh.tst
index d6a3da896693..ab0d84b84d9e 100644
--- a/pl/math/test/testcases/directed/sinh.tst
+++ b/math/test/testcases/directed/sinh.tst
@@ -1,6 +1,6 @@
; sinh.tst
;
-; Copyright (c) 1999-2023, Arm Limited.
+; Copyright (c) 1999-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=sinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/sinhf.tst b/math/test/testcases/directed/sinhf.tst
index 5f7bd1b04137..d9269c0fa405 100644
--- a/pl/math/test/testcases/directed/sinhf.tst
+++ b/math/test/testcases/directed/sinhf.tst
@@ -1,6 +1,6 @@
; sinhf.tst
;
-; Copyright (c) 2009-2023, Arm Limited.
+; Copyright (c) 2009-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=sinhf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/tanf.tst b/math/test/testcases/directed/tanf.tst
index 3161f70f4361..e38142df6e3c 100644
--- a/pl/math/test/testcases/directed/tanf.tst
+++ b/math/test/testcases/directed/tanf.tst
@@ -1,6 +1,6 @@
; tanf.tst
;
-; Copyright (c) 2022-2023, Arm Limited.
+; Copyright (c) 2022-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=tanf op1=7fc00001 result=7fc00001 errno=0
diff --git a/pl/math/test/testcases/directed/tanh.tst b/math/test/testcases/directed/tanh.tst
index 78776e6f3924..e842063c0ef7 100644
--- a/pl/math/test/testcases/directed/tanh.tst
+++ b/math/test/testcases/directed/tanh.tst
@@ -1,6 +1,6 @@
; tanh.tst
;
-; Copyright (c) 1999-2023, Arm Limited.
+; Copyright (c) 1999-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=tanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/pl/math/test/testcases/directed/tanhf.tst b/math/test/testcases/directed/tanhf.tst
index 603e3107e44f..412aa12b3621 100644
--- a/pl/math/test/testcases/directed/tanhf.tst
+++ b/math/test/testcases/directed/tanhf.tst
@@ -1,6 +1,6 @@
; tanhf.tst
;
-; Copyright (c) 2007-2023, Arm Limited.
+; Copyright (c) 2007-2024, Arm Limited.
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=tanhf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/trigpi_references.h b/math/test/trigpi_references.h
new file mode 100644
index 000000000000..3dc5a3173436
--- /dev/null
+++ b/math/test/trigpi_references.h
@@ -0,0 +1,106 @@
+/*
+ * Extended precision scalar reference functions for trigpi.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#ifndef M_PIl
+# define M_PIl 3.141592653589793238462643383279502884l
+#endif
+
+long double
+arm_math_sinpil (long double x)
+{
+ /* sinpi(inf) should return nan, as defined by C23. */
+ if (isinf (x))
+ return __math_invalid (x);
+
+ long double ax = fabsl (x);
+
+ /* Return 0 for all values above 2^64 to prevent
+ overflow when casting to uint64_t. */
+ if (ax >= 0x1p64)
+ return x < 0 ? -0.0l : 0.0l;
+
+ /* All integer cases should return 0, with unchanged sign for zero. */
+ if (x == 0.0l)
+ return x;
+ if (ax == (uint64_t) ax)
+ return x < 0 ? -0.0l : 0.0l;
+
+ return sinl (x * M_PIl);
+}
+
+long double
+arm_math_cospil (long double x)
+{
+ /* cospi(inf) should return nan, as defined by C23. */
+ if (isinf (x))
+ return __math_invalid (x);
+
+ long double ax = fabsl (x);
+
+ if (ax >= 0x1p64)
+ return 1;
+
+ uint64_t m = (uint64_t) ax;
+
+ /* Integer values of cospi(x) should return +/-1.
+ The sign depends on if x is odd or even. */
+ if (m == ax)
+ return (m & 1) ? -1 : 1;
+
+ /* Half-integer values (n + 0.5) should always return 0. */
+ if (ax - 0.5 == m || ax + 0.5 == m)
+ return 0;
+
+ return cosl (ax * M_PIl);
+}
+
+long double
+arm_math_tanpil (long double x)
+{
+ /* tanpi(inf) should return nan; above 2^54 every value is an even
+ integer, for which tanpi is a signed zero. */
+ if (fabsl (x) >= 0x1p54l)
+ {
+ if (isinf (x))
+ return __math_invalid (x);
+ return x < 0 ? -0.0l : 0.0l;
+ }
+
+ long double i = roundl (x);
+ long double f = x - i;
+ int64_t m = (int64_t) i;
+
+ if (x == 0)
+ {
+ return x;
+ }
+ else if (x == i)
+ {
+ if (x < 0)
+ {
+ return m & 1 ? 0.0l : -0.0l;
+ }
+ else
+ {
+ return m & 1 ? -0.0l : 0.0l;
+ }
+ }
+ else if (fabsl (f) == 0.5l)
+ {
+ if (x < 0)
+ {
+ return m & 1 ? -1.0l / 0.0l : 1.0l / 0.0l;
+ }
+ else
+ {
+ return m & 1 ? 1.0l / 0.0l : -1.0l / 0.0l;
+ }
+ }
+
+ return tanl (f * M_PIl);
+}
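+
+/* A minimal sanity sketch of the special cases handled above (illustrative
+ only, not compiled here):
+ arm_math_sinpil (1.0l) == 0.0l (integers give a signed zero)
+ arm_math_cospil (1.0l) == -1.0l (odd integers give -1)
+ arm_math_cospil (0.5l) == 0.0l (half-integers give 0)
+ arm_math_tanpil (0.5l) == +inf (half-integers give +/-inf)
+ arm_math_tanpil (inf) is nan (C23: trigpi of inf is invalid). */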
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 5ff29972e50e..0a75fe264630 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,10 +1,23 @@
/*
* ULP error checking tool for math functions.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#if WANT_SVE_TESTS
+# if __aarch64__ && __linux__
+# ifdef __clang__
+# pragma clang attribute push(__attribute__((target("sve"))), \
+ apply_to = any(function))
+# else
+# pragma GCC target("+sve")
+# endif
+# else
+# error "SVE not supported - please disable WANT_SVE_TESTS"
+# endif
+#endif
+
#define _GNU_SOURCE
#include <ctype.h>
#include <fenv.h>
@@ -16,6 +29,8 @@
#include <string.h>
#include "mathlib.h"
+#include "trigpi_references.h"
+
/* Don't depend on mpfr by default. */
#ifndef USE_MPFR
# define USE_MPFR 0
@@ -24,50 +39,6 @@
# include <mpfr.h>
#endif
-static inline uint64_t
-asuint64 (double f)
-{
- union
- {
- double f;
- uint64_t i;
- } u = {f};
- return u.i;
-}
-
-static inline double
-asdouble (uint64_t i)
-{
- union
- {
- uint64_t i;
- double f;
- } u = {i};
- return u.f;
-}
-
-static inline uint32_t
-asuint (float f)
-{
- union
- {
- float f;
- uint32_t i;
- } u = {f};
- return u.i;
-}
-
-static inline float
-asfloat (uint32_t i)
-{
- union
- {
- uint32_t i;
- float f;
- } u = {i};
- return u.f;
-}
-
static uint64_t seed = 0x0123456789abcdef;
static uint64_t
rand64 (void)
@@ -198,68 +169,96 @@ next_d2 (void *g)
return (struct args_d2){asdouble (x), asdouble (x2)};
}
-struct conf
-{
- int r;
- int rc;
- int quiet;
- int mpfr;
- int fenv;
- unsigned long long n;
- double softlim;
- double errlim;
- int ignore_zero_sign;
-};
-
/* A bit of a hack: call vector functions twice with the same
input in lane 0 but a different value in other lanes: once
with an in-range value and then with a special case value. */
static int secondcall;
/* Wrappers for vector functions. */
-#ifdef __vpcs
-typedef __f32x4_t v_float;
-typedef __f64x2_t v_double;
+#if __aarch64__ && __linux__
/* First element of fv and dv may be changed by -c argument. */
static float fv[2] = {1.0f, -INFINITY};
static double dv[2] = {1.0, -INFINITY};
-static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; }
-static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; }
-#if WANT_SVE_MATH
+static inline float32x4_t
+argf (float x)
+{
+ return (float32x4_t){ x, x, x, fv[secondcall] };
+}
+static inline float64x2_t
+argd (double x)
+{
+ return (float64x2_t){ x, dv[secondcall] };
+}
+#if WANT_SVE_TESTS
#include <arm_sve.h>
-typedef __SVFloat32_t sv_float;
-typedef __SVFloat64_t sv_double;
-
-static inline sv_float svargf(float x) {
- int n = svcntw();
- float base[n];
- for (int i=0; i<n; i++)
- base[i] = (float)x;
- base[n-1] = (float) fv[secondcall];
- return svld1(svptrue_b32(), base);
-}
-static inline sv_double svargd(double x) {
- int n = svcntd();
- double base[n];
- for (int i=0; i<n; i++)
- base[i] = x;
- base[n-1] = dv[secondcall];
- return svld1(svptrue_b64(), base);
-}
-static inline float svretf(sv_float vec) {
- int n = svcntw();
- float res[n];
- svst1(svptrue_b32(), res, vec);
- return res[0];
-}
-static inline double svretd(sv_double vec) {
- int n = svcntd();
- double res[n];
- svst1(svptrue_b64(), res, vec);
- return res[0];
+
+static inline svfloat32_t
+svargf (float x)
+{
+ int n = svcntw ();
+ float base[n];
+ for (int i = 0; i < n; i++)
+ base[i] = (float) x;
+ base[n - 1] = (float) fv[secondcall];
+ return svld1 (svptrue_b32 (), base);
+}
+static inline svfloat64_t
+svargd (double x)
+{
+ int n = svcntd ();
+ double base[n];
+ for (int i = 0; i < n; i++)
+ base[i] = x;
+ base[n - 1] = dv[secondcall];
+ return svld1 (svptrue_b64 (), base);
+}
+static inline float
+svretf (svfloat32_t vec, svbool_t pg)
+{
+ return svlastb_f32 (svpfirst (pg, svpfalse ()), vec);
}
+static inline double
+svretd (svfloat64_t vec, svbool_t pg)
+{
+ return svlastb_f64 (svpfirst (pg, svpfalse ()), vec);
+}
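+/* svretf/svretd return the value in the first active lane of pg: svpfirst
+ narrows the predicate to its first active lane, and svlastb then extracts
+ that lane's element. */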
+
+static inline svbool_t
+parse_pg (uint64_t p, int is_single)
+{
+ if (is_single)
+ {
+ uint32_t tmp[svcntw ()];
+ for (unsigned i = 0; i < svcntw (); i++)
+ tmp[i] = (p >> i) & 1;
+ return svcmpne (svptrue_b32 (), svld1 (svptrue_b32 (), tmp), 0);
+ }
+ else
+ {
+ uint64_t tmp[svcntd ()];
+ for (unsigned i = 0; i < svcntd (); i++)
+ tmp[i] = (p >> i) & 1;
+ return svcmpne (svptrue_b64 (), svld1 (svptrue_b64 (), tmp), 0);
+ }
+}
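+/* For example, parse_pg (0x5, 1) activates single-precision lanes 0 and 2;
+ bits beyond the vector length are ignored. */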
+# endif
#endif
+
+struct conf
+{
+ int r;
+ int rc;
+ int quiet;
+ int mpfr;
+ int fenv;
+ unsigned long long n;
+ double softlim;
+ double errlim;
+ int ignore_zero_sign;
+#if WANT_SVE_TESTS
+ svbool_t *pg;
#endif
+};
#include "test/ulp_wrappers.h"
@@ -269,12 +268,19 @@ struct fun
int arity;
int singleprec;
int twice;
+ int is_predicated;
union
{
float (*f1) (float);
float (*f2) (float, float);
double (*d1) (double);
double (*d2) (double, double);
+#if WANT_SVE_TESTS
+ float (*f1_pred) (svbool_t, float);
+ float (*f2_pred) (svbool_t, float, float);
+ double (*d1_pred) (svbool_t, double);
+ double (*d2_pred) (svbool_t, double, double);
+#endif
} fun;
union
{
@@ -294,44 +300,33 @@ struct fun
#endif
};
+// clang-format off
static const struct fun fun[] = {
#if USE_MPFR
-# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
- {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}, {.t = x_mpfr}},
+# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+ { #x, a, s, twice, 0, { .t = x_wrap }, { .t = x_long }, { .t = x_mpfr } },
+# define SVF(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+ { #x, a, s, twice, 1, { .t##_pred = x_wrap }, { .t = x_long }, { .t = x_mpfr } },
#else
-# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
- {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}},
+# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+ { #x, a, s, twice, 0, { .t = x_wrap }, { .t = x_long } },
+# define SVF(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \
+ { #x, a, s, twice, 1, { .t##_pred = x_wrap }, { .t = x_long } },
#endif
#define F1(x) F (x##f, x##f, x, mpfr_##x, 1, 1, f1, 0)
#define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0)
#define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0)
#define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0)
/* Neon routines. */
-#define VF1(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 1, 1, f1, 0)
-#define VF2(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 2, 1, f2, 0)
-#define VD1(x) F (__v_##x, v_##x, x##l, mpfr_##x, 1, 0, d1, 0)
-#define VD2(x) F (__v_##x, v_##x, x##l, mpfr_##x, 2, 0, d2, 0)
-#define VNF1(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 1, 1, f1, 0)
-#define VNF2(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 2, 1, f2, 0)
-#define VND1(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 1, 0, d1, 0)
-#define VND2(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 2, 0, d2, 0)
-#define ZVF1(x) F (_ZGVnN4v_##x##f, Z_##x##f, x, mpfr_##x, 1, 1, f1, 0)
-#define ZVF2(x) F (_ZGVnN4vv_##x##f, Z_##x##f, x, mpfr_##x, 2, 1, f2, 0)
-#define ZVD1(x) F (_ZGVnN2v_##x, Z_##x, x##l, mpfr_##x, 1, 0, d1, 0)
-#define ZVD2(x) F (_ZGVnN2vv_##x, Z_##x, x##l, mpfr_##x, 2, 0, d2, 0)
-#define ZVNF1(x) VNF1 (x) ZVF1 (x)
-#define ZVNF2(x) VNF2 (x) ZVF2 (x)
-#define ZVND1(x) VND1 (x) ZVD1 (x)
-#define ZVND2(x) VND2 (x) ZVD2 (x)
+#define ZVNF1(x) F (_ZGVnN4v_##x##f, Z_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define ZVNF2(x) F (_ZGVnN4vv_##x##f, Z_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define ZVND1(x) F (_ZGVnN2v_##x, Z_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define ZVND2(x) F (_ZGVnN2vv_##x, Z_##x, x##l, mpfr_##x, 2, 0, d2, 0)
/* SVE routines. */
-#define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
-#define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
-#define SVD1(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
-#define SVD2(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
-#define ZSVF1(x) F (_ZGVsMxv_##x##f, Z_sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
-#define ZSVF2(x) F (_ZGVsMxvv_##x##f, Z_sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
-#define ZSVD1(x) F (_ZGVsMxv_##x, Z_sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
-#define ZSVD2(x) F (_ZGVsMxvv_##x, Z_sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZSVF1(x) SVF (_ZGVsMxv_##x##f, Z_sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define ZSVF2(x) SVF (_ZGVsMxvv_##x##f, Z_sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define ZSVD1(x) SVF (_ZGVsMxv_##x, Z_sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define ZSVD2(x) SVF (_ZGVsMxvv_##x, Z_sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
#include "test/ulp_funcs.h"
@@ -340,11 +335,13 @@ static const struct fun fun[] = {
#undef F2
#undef D1
#undef D2
-#undef SVF1
-#undef SVF2
-#undef SVD1
-#undef SVD2
- {0}};
+#undef ZSVF1
+#undef ZSVF2
+#undef ZSVD1
+#undef ZSVD2
+ { 0 }
+};
+// clang-format on
/* Boilerplate for generic calls. */
@@ -365,24 +362,40 @@ ulpscale_d (double x)
return e - 0x3ff - 52;
}
static inline float
-call_f1 (const struct fun *f, struct args_f1 a)
+call_f1 (const struct fun *f, struct args_f1 a, const struct conf *conf)
{
+#if WANT_SVE_TESTS
+ if (f->is_predicated)
+ return f->fun.f1_pred (*conf->pg, a.x);
+#endif
return f->fun.f1 (a.x);
}
static inline float
-call_f2 (const struct fun *f, struct args_f2 a)
+call_f2 (const struct fun *f, struct args_f2 a, const struct conf *conf)
{
+#if WANT_SVE_TESTS
+ if (f->is_predicated)
+ return f->fun.f2_pred (*conf->pg, a.x, a.x2);
+#endif
return f->fun.f2 (a.x, a.x2);
}
static inline double
-call_d1 (const struct fun *f, struct args_d1 a)
+call_d1 (const struct fun *f, struct args_d1 a, const struct conf *conf)
{
+#if WANT_SVE_TESTS
+ if (f->is_predicated)
+ return f->fun.d1_pred (*conf->pg, a.x);
+#endif
return f->fun.d1 (a.x);
}
static inline double
-call_d2 (const struct fun *f, struct args_d2 a)
+call_d2 (const struct fun *f, struct args_d2 a, const struct conf *conf)
{
+#if WANT_SVE_TESTS
+ if (f->is_predicated)
+ return f->fun.d2_pred (*conf->pg, a.x, a.x2);
+#endif
return f->fun.d2 (a.x, a.x2);
}
static inline double
@@ -595,6 +608,11 @@ usage (void)
" This should be different from tested input in other lanes, and non-special \n"
" (i.e. should not trigger fenv exceptions). Default is 1.");
#endif
+#if WANT_SVE_TESTS
+ puts ("-p: integer input for controlling predicate passed to SVE function. "
+ "If bit N is set, lane N is activated (bits past the vector length "
+ "are ignored). Default is UINT64_MAX (ptrue).");
+#endif
puts ("-z: ignore sign of 0.");
puts ("Supported func:");
for (const struct fun *f = fun; f->name; f++)
@@ -633,9 +651,21 @@ getnum (const char *s, int singleprec)
sign = singleprec ? 1ULL << 31 : 1ULL << 63;
s++;
}
+
+ /* Sentinel value for failed parse. */
+ char *should_not_be_s = NULL;
+
/* 0xXXXX is treated as bit representation, '-' flips the sign bit. */
if (s[0] == '0' && tolower (s[1]) == 'x' && strchr (s, 'p') == 0)
- return sign ^ strtoull (s, 0, 0);
+ {
+ uint64_t out = sign ^ strtoull (s, &should_not_be_s, 0);
+ if (should_not_be_s == s)
+ {
+ printf ("ERROR: Could not parse '%s'\n", s);
+ exit (1);
+ }
+ return out;
+ }
// /* SNaN, QNaN, NaN, Inf. */
// for (i=0; s[i] && i < sizeof buf; i++)
// buf[i] = tolower(s[i]);
@@ -647,8 +677,16 @@ getnum (const char *s, int singleprec)
// if (strcmp(buf, "inf") == 0 || strcmp(buf, "infinity") == 0)
// return sign | (singleprec ? 0x7f800000 : 0x7ff0000000000000);
/* Otherwise assume it's a floating-point literal. */
- return sign
- | (singleprec ? asuint (strtof (s, 0)) : asuint64 (strtod (s, 0)));
+ uint64_t out = sign
+ | (singleprec ? asuint (strtof (s, &should_not_be_s))
+ : asuint64 (strtod (s, &should_not_be_s)));
+ if (should_not_be_s == s)
+ {
+ printf ("ERROR: Could not parse '%s'\n", s);
+ exit (1);
+ }
+
+ return out;
}
static void
@@ -720,6 +758,9 @@ main (int argc, char *argv[])
conf.softlim = 0;
conf.errlim = INFINITY;
conf.ignore_zero_sign = 0;
+#if WANT_SVE_TESTS
+ uint64_t pg_int = UINT64_MAX;
+#endif
for (;;)
{
argc--;
@@ -767,7 +808,7 @@ main (int argc, char *argv[])
case 'z':
conf.ignore_zero_sign = 1;
break;
-#ifdef __vpcs
+#if __aarch64__ && __linux__
case 'c':
argc--;
argv++;
@@ -775,6 +816,13 @@ main (int argc, char *argv[])
dv[0] = strtod(argv[0], 0);
break;
#endif
+#if WANT_SVE_TESTS
+ case 'p':
+ argc--;
+ argv++;
+ pg_int = strtoull (argv[0], 0, 0);
+ break;
+#endif
default:
usage ();
}
@@ -806,7 +854,7 @@ main (int argc, char *argv[])
if (strncmp (argv[0], "_ZGVnN", 6) == 0)
exit (0);
#endif
-#if !WANT_SVE_MATH
+#if !WANT_SVE_TESTS
if (strncmp (argv[0], "_ZGVsMxv", 8) == 0)
exit (0);
#endif
@@ -824,5 +872,13 @@ main (int argc, char *argv[])
argv++;
parsegen (&gen, argc, argv, f);
conf.n = gen.cnt;
+#if WANT_SVE_TESTS
+ svbool_t pg = parse_pg (pg_int, f->singleprec);
+ conf.pg = &pg;
+#endif
return cmp (f, &gen, &conf);
}
+
+#if __aarch64__ && __linux__ && WANT_SVE_TESTS && defined(__clang__)
+# pragma clang attribute pop
+#endif
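
A hedged invocation sketch (argument layout as consumed by t() in runulp.sh
above; routine name, error limit, and bounds are illustrative): with
WANT_SVE_TESTS enabled, ./ulp -e 1.0 -p 0x3 _ZGVsMxv_expf 0 0x1p6 10000
would check the SVE expf wrapper with only lanes 0 and 1 of the predicate
active.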
diff --git a/math/test/ulp.h b/math/test/ulp.h
index b0bc59aeef8d..de122257d3b1 100644
--- a/math/test/ulp.h
+++ b/math/test/ulp.h
@@ -1,13 +1,13 @@
/*
* Generic functions for ULP error estimation.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2019-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* For each different math function type,
T(x) should add a different suffix to x.
- RT(x) should add a return type specific suffix to x. */
+ RT(x) should add a return type specific suffix to x. */
#ifdef NEW_RT
#undef NEW_RT
@@ -47,8 +47,12 @@ static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
if (RT(asuint) (got) == RT(asuint) (want))
return 0.0;
if (isnan (got) && isnan (want))
- /* Ignore sign of NaN. */
+ /* Ignore sign of NaN, and signalling-ness for MPFR. */
+# if USE_MPFR
+ return 0;
+# else
return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY;
+# endif
if (signbit (got) != signbit (want))
{
/* Fall through to ULP calculation if ignoring sign of zero and at
@@ -80,7 +84,7 @@ static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
// TODO: incorrect when got vs want cross a powof2 boundary
/* error = got > want
? got - want - tail ulp - 0.5 ulp
- : got - want - tail ulp + 0.5 ulp; */
+ : got - want - tail ulp + 0.5 ulp. */
d = got - want;
e = d > 0 ? -p->tail - 0.5 : -p->tail + 0.5;
}
@@ -108,32 +112,34 @@ static int RT(isok_nofenv) (RT(float) ygot, RT(float) ywant)
}
#endif
-static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r,
- RT(float) * y, int *ex)
+static inline void T (call_fenv) (const struct fun *f, struct T (args) a,
+ int r, RT (float) * y, int *ex,
+ const struct conf *conf)
{
if (r != FE_TONEAREST)
fesetround (r);
feclearexcept (FE_ALL_EXCEPT);
- *y = T(call) (f, a);
+ *y = T (call) (f, a, conf);
*ex = fetestexcept (FE_ALL_EXCEPT);
if (r != FE_TONEAREST)
fesetround (FE_TONEAREST);
}
-static inline void T(call_nofenv) (const struct fun *f, struct T(args) a,
- int r, RT(float) * y, int *ex)
+static inline void T (call_nofenv) (const struct fun *f, struct T (args) a,
+ int r, RT (float) * y, int *ex,
+ const struct conf *conf)
{
if (r != FE_TONEAREST)
fesetround (r);
- *y = T(call) (f, a);
+ *y = T (call) (f, a, conf);
*ex = 0;
if (r != FE_TONEAREST)
fesetround (FE_TONEAREST);
}
-static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a,
- int r, struct RT(ret) * p,
- RT(float) ygot, int exgot)
+static inline int T (call_long_fenv) (const struct fun *f, struct T (args) a,
+ int r, struct RT (ret) * p,
+ RT (float) ygot, int exgot)
{
if (r != FE_TONEAREST)
fesetround (r);
@@ -269,6 +275,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
int r = conf->r;
int use_mpfr = conf->mpfr;
int fenv = conf->fenv;
+
for (;;)
{
struct RT(ret) want;
@@ -279,15 +286,15 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
RT(float) ygot2;
int fail = 0;
if (fenv)
- T(call_fenv) (f, a, r, &ygot, &exgot);
+ T (call_fenv) (f, a, r, &ygot, &exgot, conf);
else
- T(call_nofenv) (f, a, r, &ygot, &exgot);
+ T (call_nofenv) (f, a, r, &ygot, &exgot, conf);
if (f->twice) {
secondcall = 1;
if (fenv)
- T(call_fenv) (f, a, r, &ygot2, &exgot2);
+ T (call_fenv) (f, a, r, &ygot2, &exgot2, conf);
else
- T(call_nofenv) (f, a, r, &ygot2, &exgot2);
+ T (call_nofenv) (f, a, r, &ygot2, &exgot2, conf);
secondcall = 0;
if (RT(asuint) (ygot) != RT(asuint) (ygot2))
{
diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h
index 84f7927d3935..b58a68ff275b 100644
--- a/math/test/ulp_funcs.h
+++ b/math/test/ulp_funcs.h
@@ -1,40 +1,109 @@
/*
* Function entries for ulp.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* clang-format off */
- F1 (sin)
- F1 (cos)
F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0)
- F1 (exp)
- F1 (exp2)
- F1 (log)
- F1 (log2)
F2 (pow)
- F1 (erf)
- D1 (exp)
- D1 (exp10)
- D1 (exp2)
- D1 (log)
- D1 (log2)
D2 (pow)
- D1 (erf)
-#ifdef __vpcs
- F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
+#if __aarch64__ && __linux__
F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
- F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
+ F (_ZGVnN4v_sincosf_sin, v_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0)
+ F (_ZGVnN4v_sincosf_cos, v_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0)
+ F (_ZGVnN4v_cexpif_sin, v_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0)
+ F (_ZGVnN4v_cexpif_cos, v_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0)
+ F (_ZGVnN4vl4_modff_frac, v_modff_frac, modf_frac, modf_mpfr_frac, 1, 1, f1, 0)
+ F (_ZGVnN4vl4_modff_int, v_modff_int, modf_int, modf_mpfr_int, 1, 1, f1, 0)
+ F (_ZGVnN2v_sincos_sin, v_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+ F (_ZGVnN2v_sincos_cos, v_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0)
+ F (_ZGVnN2v_cexpi_sin, v_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+ F (_ZGVnN2v_cexpi_cos, v_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0)
+ F (_ZGVnN2vl8_modf_frac, v_modf_frac, modfl_frac, modf_mpfr_frac, 1, 0, d1, 0)
+ F (_ZGVnN2vl8_modf_int, v_modf_int, modfl_int, modf_mpfr_int, 1, 0, d1, 0)
#endif
-/* clang-format on */
+
+#if WANT_SVE_TESTS
+SVF (_ZGVsMxv_sincosf_sin, sv_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0)
+SVF (_ZGVsMxv_sincosf_cos, sv_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0)
+SVF (_ZGVsMxv_cexpif_sin, sv_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0)
+SVF (_ZGVsMxv_cexpif_cos, sv_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0)
+SVF (_ZGVsMxvl4_modff_frac, sv_modff_frac, modf_frac, modf_mpfr_frac, 1, 1, f1, 0)
+SVF (_ZGVsMxvl4_modff_int, sv_modff_int, modf_int, modf_mpfr_int, 1, 1, f1, 0)
+SVF (_ZGVsMxv_sincos_sin, sv_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+SVF (_ZGVsMxv_sincos_cos, sv_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0)
+SVF (_ZGVsMxv_cexpi_sin, sv_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+SVF (_ZGVsMxv_cexpi_cos, sv_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0)
+SVF (_ZGVsMxvl8_modf_frac, sv_modf_frac, modfl_frac, modf_mpfr_frac, 1, 0, d1, 0)
+SVF (_ZGVsMxvl8_modf_int, sv_modf_int, modfl_int, modf_mpfr_int, 1, 0, d1, 0)
+#endif
+
+#if WANT_EXPERIMENTAL_MATH
+ F (arm_math_erff, arm_math_erff, erf, mpfr_erf, 1, 1, f1, 0)
+ F (arm_math_erf, arm_math_erf, erfl, mpfr_erf, 1, 0, d1, 0)
+#endif
+
+#if WANT_TRIGPI_TESTS
+ F (arm_math_cospif, arm_math_cospif, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0)
+ F (arm_math_cospi, arm_math_cospi, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0)
+ F (arm_math_sinpif, arm_math_sinpif, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0)
+ F (arm_math_sinpi, arm_math_sinpi, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0)
+ F (arm_math_tanpif, arm_math_tanpif, arm_math_tanpi, mpfr_tanpi, 1, 1, f1, 0)
+ F (arm_math_tanpi, arm_math_tanpi, arm_math_tanpil, mpfr_tanpi, 1, 0, d1, 0)
+ F (arm_math_sincospif_sin, arm_math_sincospif_sin, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0)
+ F (arm_math_sincospif_cos, arm_math_sincospif_cos, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0)
+ F (arm_math_sincospi_sin, arm_math_sincospi_sin, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0)
+ F (arm_math_sincospi_cos, arm_math_sincospi_cos, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0)
+# if __aarch64__ && __linux__
+ F (_ZGVnN4v_cospif, Z_cospif, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0)
+ F (_ZGVnN2v_cospi, Z_cospi, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0)
+ F (_ZGVnN4v_sinpif, Z_sinpif, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0)
+ F (_ZGVnN2v_sinpi, Z_sinpi, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0)
+ F (_ZGVnN4v_tanpif, Z_tanpif, arm_math_tanpi, mpfr_tanpi, 1, 1, f1, 0)
+ F (_ZGVnN2v_tanpi, Z_tanpi, arm_math_tanpil, mpfr_tanpi, 1, 0, d1, 0)
+ F (_ZGVnN4v_sincospif_sin, v_sincospif_sin, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0)
+ F (_ZGVnN4v_sincospif_cos, v_sincospif_cos, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0)
+ F (_ZGVnN2v_sincospi_sin, v_sincospi_sin, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0)
+ F (_ZGVnN2v_sincospi_cos, v_sincospi_cos, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0)
+# endif
+# if WANT_SVE_TESTS
+ SVF (_ZGVsMxv_cospif, Z_sv_cospif, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0)
+ SVF (_ZGVsMxv_cospi, Z_sv_cospi, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0)
+ SVF (_ZGVsMxv_sinpif, Z_sv_sinpif, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0)
+ SVF (_ZGVsMxv_sinpi, Z_sv_sinpi, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0)
+ SVF (_ZGVsMxv_tanpif, Z_sv_tanpif, arm_math_tanpi, mpfr_tanpi, 1, 1, f1, 0)
+ SVF (_ZGVsMxv_tanpi, Z_sv_tanpi, arm_math_tanpil, mpfr_tanpi, 1, 0, d1, 0)
+ SVF (_ZGVsMxvl4l4_sincospif_sin, sv_sincospif_sin, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0)
+ SVF (_ZGVsMxvl4l4_sincospif_cos, sv_sincospif_cos, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0)
+ SVF (_ZGVsMxvl8l8_sincospi_sin, sv_sincospi_sin, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0)
+ SVF (_ZGVsMxvl8l8_sincospi_cos, sv_sincospi_cos, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0)
+# if WANT_EXPERIMENTAL_MATH
+  SVF (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0)
+  SVF (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0)
+# endif
+# endif
+#endif
+
+ /* clang-format on */
+
+#define _ZSF1(f) F1 (f)
+#define _ZSF2(f) F2 (f)
+#define _ZSD1(f) D1 (f)
+#define _ZSD2(f) D2 (f)
+
+#define _ZVF1(f) ZVNF1 (f)
+#define _ZVD1(f) ZVND1 (f)
+#define _ZVF2(f) ZVNF2 (f)
+#define _ZVD2(f) ZVND2 (f)
+
+#define _ZSVF1(f) ZSVF1 (f)
+#define _ZSVF2(f) ZSVF2 (f)
+#define _ZSVD1(f) ZSVD1 (f)
+#define _ZSVD2(f) ZSVD2 (f)
+
+#include "test/ulp_funcs_gen.h"
diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h
index 60dc3d6dd652..33e1e75f23ab 100644
--- a/math/test/ulp_wrappers.h
+++ b/math/test/ulp_wrappers.h
@@ -1,12 +1,18 @@
/*
* Function wrappers for ulp.
*
- * Copyright (c) 2022-2023, Arm Limited.
+ * Copyright (c) 2022-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* clang-format off */
+#if __aarch64__ && __linux__
+#include <arm_neon.h>
+#endif
+
+#include <stdbool.h>
+
/* Wrappers for sincos. */
static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
@@ -15,23 +21,409 @@ static double sincos_cos(double x) {(void)sin(x); return cos(x);}
#if USE_MPFR
static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
+static int modf_mpfr_frac(mpfr_t f, const mpfr_t x, mpfr_rnd_t r) { MPFR_DECL_INIT(i, 80); return mpfr_modf(i,f,x,r); }
+static int modf_mpfr_int(mpfr_t i, const mpfr_t x, mpfr_rnd_t r) { MPFR_DECL_INIT(f, 80); return mpfr_modf(i,f,x,r); }
+# if MPFR_VERSION < MPFR_VERSION_NUM(4, 2, 0)
+static int mpfr_tanpi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) {
+ MPFR_DECL_INIT (frd, 1080);
+ mpfr_const_pi (frd, GMP_RNDN);
+ mpfr_mul (frd, frd, arg, GMP_RNDN);
+ return mpfr_tan (ret, frd, GMP_RNDN);
+}
+static int mpfr_sinpi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) {
+ MPFR_DECL_INIT (frd, 1080);
+ mpfr_const_pi (frd, GMP_RNDN);
+ mpfr_mul (frd, frd, arg, GMP_RNDN);
+ return mpfr_sin (ret, frd, GMP_RNDN);
+}
+
+static int mpfr_cospi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) {
+ MPFR_DECL_INIT (frd, 1080);
+ mpfr_const_pi (frd, GMP_RNDN);
+ mpfr_mul (frd, frd, arg, GMP_RNDN);
+ return mpfr_cos (ret, frd, GMP_RNDN);
+}
+# endif
+# if WANT_EXPERIMENTAL_MATH
+static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) {
+ mpfr_t y2;
+ mpfr_init(y2);
+ mpfr_trunc(y2, y);
+ return mpfr_pow(ret, x, y2, rnd);
+}
+# endif
#endif
+float modff_frac(float x) { float i; return modff(x, &i); }
+float modff_int(float x) { float i; modff(x, &i); return i; }
+double modf_frac(double x) { double i; return modf(x, &i); }
+double modf_int(double x) { double i; modf(x, &i); return i; }
+long double modfl_frac(long double x) { long double i; return modfl(x, &i); }
+long double modfl_int(long double x) { long double i; modfl(x, &i); return i; }
+
/* Wrappers for vector functions. */
-#ifdef __vpcs
-static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
-static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
+#if __aarch64__ && __linux__
static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; }
-static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; }
-static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
-static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
-static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
-static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; }
-static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; }
-static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
-static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
-static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
#endif
/* clang-format on */
+
+/* No wrappers for scalar routines, but TEST_SIG will emit them. */
+#define ZSNF1_WRAP(func)
+#define ZSNF2_WRAP(func)
+#define ZSND1_WRAP(func)
+#define ZSND2_WRAP(func)
+
+#define ZVNF1_WRAP(func) \
+ static float Z_##func##f (float x) \
+ { \
+ return _ZGVnN4v_##func##f (argf (x))[0]; \
+ }
+#define ZVNF2_WRAP(func) \
+ static float Z_##func##f (float x, float y) \
+ { \
+ return _ZGVnN4vv_##func##f (argf (x), argf (y))[0]; \
+ }
+#define ZVND1_WRAP(func) \
+ static double Z_##func (double x) { return _ZGVnN2v_##func (argd (x))[0]; }
+#define ZVND2_WRAP(func) \
+ static double Z_##func (double x, double y) \
+ { \
+ return _ZGVnN2vv_##func (argd (x), argd (y))[0]; \
+ }
+
+#if WANT_TRIGPI_TESTS
+float
+arm_math_sincospif_sin (float x)
+{
+ float s, c;
+ arm_math_sincospif (x, &s, &c);
+ return s;
+}
+float
+arm_math_sincospif_cos (float x)
+{
+ float s, c;
+ arm_math_sincospif (x, &s, &c);
+ return c;
+}
+double
+arm_math_sincospi_sin (double x)
+{
+ double s, c;
+ arm_math_sincospi (x, &s, &c);
+ return s;
+}
+double
+arm_math_sincospi_cos (double x)
+{
+ double s, c;
+ arm_math_sincospi (x, &s, &c);
+ return c;
+}
+#endif
+
+#if __aarch64__ && __linux__
+
+# if WANT_TRIGPI_TESTS
+ZVNF1_WRAP (cospi)
+ZVND1_WRAP (cospi)
+ZVNF1_WRAP (sinpi)
+ZVND1_WRAP (sinpi)
+ZVNF1_WRAP (tanpi)
+ZVND1_WRAP (tanpi)
+
+double
+v_sincospi_sin (double x)
+{
+ double s[2], c[2];
+ _ZGVnN2vl8l8_sincospi (vdupq_n_f64 (x), s, c);
+ return s[0];
+}
+double
+v_sincospi_cos (double x)
+{
+ double s[2], c[2];
+ _ZGVnN2vl8l8_sincospi (vdupq_n_f64 (x), s, c);
+ return c[0];
+}
+float
+v_sincospif_sin (float x)
+{
+ float s[4], c[4];
+ _ZGVnN4vl4l4_sincospif (vdupq_n_f32 (x), s, c);
+ return s[0];
+}
+float
+v_sincospif_cos (float x)
+{
+ float s[4], c[4];
+ _ZGVnN4vl4l4_sincospif (vdupq_n_f32 (x), s, c);
+ return c[0];
+}
+# endif // WANT_TRIGPI_TESTS
+
+float
+v_sincosf_sin (float x)
+{
+ float s[4], c[4];
+ _ZGVnN4vl4l4_sincosf (vdupq_n_f32 (x), s, c);
+ return s[0];
+}
+float
+v_sincosf_cos (float x)
+{
+ float s[4], c[4];
+ _ZGVnN4vl4l4_sincosf (vdupq_n_f32 (x), s, c);
+ return c[0];
+}
+float
+v_cexpif_sin (float x)
+{
+ return _ZGVnN4v_cexpif (vdupq_n_f32 (x)).val[0][0];
+}
+float
+v_cexpif_cos (float x)
+{
+ return _ZGVnN4v_cexpif (vdupq_n_f32 (x)).val[1][0];
+}
+float
+v_modff_frac (float x)
+{
+ float y[4];
+ return _ZGVnN4vl4_modff (vdupq_n_f32 (x), y)[0];
+}
+float
+v_modff_int (float x)
+{
+ float y[4];
+ _ZGVnN4vl4_modff (vdupq_n_f32 (x), y);
+ return y[0];
+}
+double
+v_sincos_sin (double x)
+{
+ double s[2], c[2];
+ _ZGVnN2vl8l8_sincos (vdupq_n_f64 (x), s, c);
+ return s[0];
+}
+double
+v_sincos_cos (double x)
+{
+ double s[2], c[2];
+ _ZGVnN2vl8l8_sincos (vdupq_n_f64 (x), s, c);
+ return c[0];
+}
+double
+v_cexpi_sin (double x)
+{
+ return _ZGVnN2v_cexpi (vdupq_n_f64 (x)).val[0][0];
+}
+double
+v_cexpi_cos (double x)
+{
+ return _ZGVnN2v_cexpi (vdupq_n_f64 (x)).val[1][0];
+}
+double
+v_modf_frac (double x)
+{
+ double y[2];
+ return _ZGVnN2vl8_modf (vdupq_n_f64 (x), y)[0];
+}
+double
+v_modf_int (double x)
+{
+ double y[2];
+ _ZGVnN2vl8_modf (vdupq_n_f64 (x), y);
+ return y[0];
+}
+#endif // __aarch64__ && __linux__
+
+#if WANT_SVE_TESTS
+# define ZSVNF1_WRAP(func) \
+ static float Z_sv_##func##f (svbool_t pg, float x) \
+ { \
+ return svretf (_ZGVsMxv_##func##f (svargf (x), pg), pg); \
+ }
+# define ZSVNF2_WRAP(func) \
+ static float Z_sv_##func##f (svbool_t pg, float x, float y) \
+ { \
+ return svretf (_ZGVsMxvv_##func##f (svargf (x), svargf (y), pg), pg); \
+ }
+# define ZSVND1_WRAP(func) \
+ static double Z_sv_##func (svbool_t pg, double x) \
+ { \
+ return svretd (_ZGVsMxv_##func (svargd (x), pg), pg); \
+ }
+# define ZSVND2_WRAP(func) \
+ static double Z_sv_##func (svbool_t pg, double x, double y) \
+ { \
+ return svretd (_ZGVsMxvv_##func (svargd (x), svargd (y), pg), pg); \
+ }
+
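A hedged sketch of how the harness side would invoke one of these wrappers, assuming ZSVNF1_WRAP (exp) has been emitted by the generated header and <arm_sve.h> is in scope (svargf/svretf are the harness helpers that broadcast a scalar into an SVE vector and read one back):

  static float
  call_sv_expf_once (void)
  {
    svbool_t pg = svptrue_b32 ();   /* all float lanes active */
    return Z_sv_expf (pg, 0.5f);    /* compared against the mpfr reference */
  }
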
+# if WANT_TRIGPI_TESTS
+ZSVNF1_WRAP (cospi)
+ZSVND1_WRAP (cospi)
+ZSVNF1_WRAP (sinpi)
+ZSVND1_WRAP (sinpi)
+ZSVNF1_WRAP (tanpi)
+ZSVND1_WRAP (tanpi)
+double
+sv_sincospi_sin (svbool_t pg, double x)
+{
+ double s[svcntd ()], c[svcntd ()];
+ _ZGVsMxvl8l8_sincospi (svdup_f64 (x), s, c, pg);
+ return svretd (svld1 (pg, s), pg);
+}
+double
+sv_sincospi_cos (svbool_t pg, double x)
+{
+ double s[svcntd ()], c[svcntd ()];
+ _ZGVsMxvl8l8_sincospi (svdup_f64 (x), s, c, pg);
+ return svretd (svld1 (pg, c), pg);
+}
+float
+sv_sincospif_sin (svbool_t pg, float x)
+{
+ float s[svcntw ()], c[svcntw ()];
+ _ZGVsMxvl4l4_sincospif (svdup_f32 (x), s, c, pg);
+ return svretf (svld1 (pg, s), pg);
+}
+float
+sv_sincospif_cos (svbool_t pg, float x)
+{
+ float s[svcntw ()], c[svcntw ()];
+ _ZGVsMxvl4l4_sincospif (svdup_f32 (x), s, c, pg);
+ return svretf (svld1 (pg, c), pg);
+}
+# endif // WANT_TRIGPI_TESTS
+
+float
+sv_sincosf_sin (svbool_t pg, float x)
+{
+ float s[svcntw ()], c[svcntw ()];
+ _ZGVsMxvl4l4_sincosf (svdup_f32 (x), s, c, pg);
+ return svretf (svld1 (pg, s), pg);
+}
+float
+sv_sincosf_cos (svbool_t pg, float x)
+{
+ float s[svcntw ()], c[svcntw ()];
+ _ZGVsMxvl4l4_sincosf (svdup_f32 (x), s, c, pg);
+ return svretf (svld1 (pg, c), pg);
+}
+float
+sv_cexpif_sin (svbool_t pg, float x)
+{
+ return svretf (svget2 (_ZGVsMxv_cexpif (svdup_f32 (x), pg), 0), pg);
+}
+float
+sv_cexpif_cos (svbool_t pg, float x)
+{
+ return svretf (svget2 (_ZGVsMxv_cexpif (svdup_f32 (x), pg), 1), pg);
+}
+float
+sv_modff_frac (svbool_t pg, float x)
+{
+ float i[svcntw ()];
+ return svretf (_ZGVsMxvl4_modff (svdup_f32 (x), i, pg), pg);
+}
+float
+sv_modff_int (svbool_t pg, float x)
+{
+ float i[svcntw ()];
+ _ZGVsMxvl4_modff (svdup_f32 (x), i, pg);
+ return svretf (svld1 (pg, i), pg);
+}
+double
+sv_sincos_sin (svbool_t pg, double x)
+{
+ double s[svcntd ()], c[svcntd ()];
+ _ZGVsMxvl8l8_sincos (svdup_f64 (x), s, c, pg);
+ return svretd (svld1 (pg, s), pg);
+}
+double
+sv_sincos_cos (svbool_t pg, double x)
+{
+ double s[svcntd ()], c[svcntd ()];
+ _ZGVsMxvl8l8_sincos (svdup_f64 (x), s, c, pg);
+ return svretd (svld1 (pg, c), pg);
+}
+double
+sv_cexpi_sin (svbool_t pg, double x)
+{
+ return svretd (svget2 (_ZGVsMxv_cexpi (svdup_f64 (x), pg), 0), pg);
+}
+double
+sv_cexpi_cos (svbool_t pg, double x)
+{
+ return svretd (svget2 (_ZGVsMxv_cexpi (svdup_f64 (x), pg), 1), pg);
+}
+double
+sv_modf_frac (svbool_t pg, double x)
+{
+ double i[svcntd ()];
+ return svretd (_ZGVsMxvl8_modf (svdup_f64 (x), i, pg), pg);
+}
+double
+sv_modf_int (svbool_t pg, double x)
+{
+ double i[svcntd ()];
+ _ZGVsMxvl8_modf (svdup_f64 (x), i, pg);
+ return svretd (svld1 (pg, i), pg);
+}
+
+# if WANT_EXPERIMENTAL_MATH
+
+/* Our implementations of powi/powk are too imprecise to verify
+ against any established pow implementation. Instead we have the
+ following simple implementation, against which it is enough to
+ maintain bitwise reproducibility. Note the test framework expects
+ the reference impl to be of higher precision than the function
+ under test. For instance this means that the reference for
+ double-precision powi will be passed a long double, so to check
+ bitwise reproducibility we have to cast it back down to
+ double. This is fine since a round-trip to higher precision and
+ back down is correctly rounded. */
+# define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \
+ static DBL_T __attribute__ ((unused)) NAME (DBL_T in_val, DBL_T y) \
+ { \
+ INT_T n = (INT_T) round (y); \
+ FLT_T acc = 1.0; \
+ bool want_recip = n < 0; \
+ n = n < 0 ? -n : n; \
+ \
+ for (FLT_T c = in_val; n; c *= c, n >>= 1) \
+ { \
+ if (n & 0x1) \
+ { \
+ acc *= c; \
+ } \
+ } \
+ if (want_recip) \
+ { \
+ acc = 1.0 / acc; \
+ } \
+ return acc; \
+ }
+
+DECL_POW_INT_REF (ref_powif, double, float, int)
+DECL_POW_INT_REF (ref_powi, long double, double, int)
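To see why the loop in DECL_POW_INT_REF terminates with x**n, here is a standalone copy of the same square-and-multiply scheme (a sketch, not part of the harness). For n = 10 = 0b1010 it multiplies acc by c when c holds x^2 and x^8, giving x^10 after four squarings:

  static double
  pow_by_squaring (double x, unsigned n)
  {
    double acc = 1.0;
    for (double c = x; n; c *= c, n >>= 1)
      if (n & 1)
        acc *= c;   /* pick up c = x^(2^k) where bit k of n is set */
    return acc;     /* pow_by_squaring (2.0, 10) == 1024.0 */
  }

The cast-back-down argument in the comment above holds because the wider type can represent every value of the narrower one exactly, so converting the narrower result up and back down loses nothing.
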
+static float
+Z_sv_powi (svbool_t pg, float x, float y)
+{
+ return svretf (_ZGVsMxvv_powi (svargf (x), svdup_s32 ((int) round (y)), pg),
+ pg);
+}
+static double
+Z_sv_powk (svbool_t pg, double x, double y)
+{
+ return svretd (_ZGVsMxvv_powk (svargd (x), svdup_s64 ((long) round (y)), pg),
+ pg);
+}
+
+# endif // WANT_EXPERIMENTAL_MATH
+#endif // WANT_SVE_TESTS
+
+#include "test/ulp_wrappers_gen.h"
diff --git a/math/tgamma128.c b/math/tgamma128.c
index 65deacc49d99..d6049207b91f 100644
--- a/math/tgamma128.c
+++ b/math/tgamma128.c
@@ -338,6 +338,8 @@ long double tgamma128(long double x)
mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*(
1140.0L+t*(231.25L+t*(24.0L+t)))));
break;
+ default:
+ __builtin_unreachable();
}
}
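The new default case uses __builtin_unreachable (), the GCC/Clang builtin that declares a path can never execute; in a fully-covered switch this suppresses spurious maybe-uninitialized warnings and lets the compiler drop the dead fallback path. Generic pattern (a sketch, not the tgamma128 code):

  static double
  pick (unsigned k)
  {
    double y;
    switch (k & 3)  /* all four possible values are handled below */
      {
      case 0: y = 1.0; break;
      case 1: y = 2.0; break;
      case 2: y = 4.0; break;
      case 3: y = 8.0; break;
      default: __builtin_unreachable ();
      }
    return y;
  }
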
diff --git a/pl/math/tools/asin.sollya b/math/tools/asin.sollya
index 8ef861d0898b..02c4a93356c3 100644
--- a/pl/math/tools/asin.sollya
+++ b/math/tools/asin.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating asin(x)
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
f = asin(x);
diff --git a/pl/math/tools/asinf.sollya b/math/tools/asinf.sollya
index 5b627e546c73..69d1803875d1 100644
--- a/pl/math/tools/asinf.sollya
+++ b/math/tools/asinf.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating asinf(x)
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
f = asin(x);
diff --git a/pl/math/tools/asinh.sollya b/math/tools/asinh.sollya
index 663ee92f3f34..eea9b8081168 100644
--- a/pl/math/tools/asinh.sollya
+++ b/math/tools/asinh.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating asinh(x)
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
// Polynomial is used in [2^-26, 1]. However it is least accurate close to 1, so
diff --git a/pl/math/tools/asinhf.sollya b/math/tools/asinhf.sollya
index ab115b53b8dc..5f1580fce883 100644
--- a/pl/math/tools/asinhf.sollya
+++ b/math/tools/asinhf.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating asinh(x)
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 9;
diff --git a/pl/math/tools/atan.sollya b/math/tools/atan.sollya
index ad4f33b8516a..048017d8d269 100644
--- a/pl/math/tools/atan.sollya
+++ b/math/tools/atan.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating atan(x) and atan2(y, x)
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
// atan is odd, so approximate with an odd polynomial:
diff --git a/pl/math/tools/atanf.sollya b/math/tools/atanf.sollya
index ed88d0ba90f9..21c3ba2bfa1d 100644
--- a/pl/math/tools/atanf.sollya
+++ b/math/tools/atanf.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating atanf(x)
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
// Generate list of monomials:
diff --git a/pl/math/tools/cbrt.sollya b/math/tools/cbrt.sollya
index 1d43dc73d8cd..2490a69ac029 100644
--- a/pl/math/tools/cbrt.sollya
+++ b/math/tools/cbrt.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating cbrt(x) in double precision
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 3;
diff --git a/pl/math/tools/cbrtf.sollya b/math/tools/cbrtf.sollya
index 4e0cc69b46a5..1debf930e722 100644
--- a/pl/math/tools/cbrtf.sollya
+++ b/math/tools/cbrtf.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating cbrt(x) in single precision
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 3;
diff --git a/pl/math/tools/erf.sollya b/math/tools/erf.sollya
index b2fc559b511e..060e1686c835 100644
--- a/pl/math/tools/erf.sollya
+++ b/math/tools/erf.sollya
@@ -1,6 +1,6 @@
// tables and constants for approximating erf(x).
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
display = hexadecimal;
diff --git a/pl/math/tools/erfc.sollya b/math/tools/erfc.sollya
index 1e2791291ebb..1b4b00066093 100644
--- a/pl/math/tools/erfc.sollya
+++ b/math/tools/erfc.sollya
@@ -1,6 +1,6 @@
// tables and constants for approximating erfc(x).
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
display = hexadecimal;
diff --git a/pl/math/tools/erfcf.sollya b/math/tools/erfcf.sollya
index 1d7fc264d99d..a8e0409f5db5 100644
--- a/pl/math/tools/erfcf.sollya
+++ b/math/tools/erfcf.sollya
@@ -1,6 +1,6 @@
// tables and constants for approximating erfcf(x).
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
display = hexadecimal;
diff --git a/pl/math/tools/erff.sollya b/math/tools/erff.sollya
index 59b23ef021f0..c0178a2b24ad 100644
--- a/pl/math/tools/erff.sollya
+++ b/math/tools/erff.sollya
@@ -1,6 +1,6 @@
// tables and constants for approximating erff(x).
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
display = hexadecimal;
diff --git a/pl/math/tools/exp10.sollya b/math/tools/exp10.sollya
index 9f30b4018209..91f92595b96d 100644
--- a/pl/math/tools/exp10.sollya
+++ b/math/tools/exp10.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating 10^x
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
// exp10f parameters
diff --git a/pl/math/tools/expm1.sollya b/math/tools/expm1.sollya
index 7b6f324eb247..d87466a066af 100644
--- a/pl/math/tools/expm1.sollya
+++ b/math/tools/expm1.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating exp(x)-1 in double precision
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 12;
diff --git a/pl/math/tools/expm1f.sollya b/math/tools/expm1f.sollya
index efdf1bd301e0..bb9496f3f2c4 100644
--- a/pl/math/tools/expm1f.sollya
+++ b/math/tools/expm1f.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating exp(x)-1 in single precision
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 5;
diff --git a/pl/math/tools/log10.sollya b/math/tools/log10.sollya
index 85d1d15c1698..78f956b14b95 100644
--- a/pl/math/tools/log10.sollya
+++ b/math/tools/log10.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating log10(1+x)
//
-// Copyright (c) 2019-2023, Arm Limited.
+// Copyright (c) 2019-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 6; // poly degree
diff --git a/pl/math/tools/log10f.sollya b/math/tools/log10f.sollya
index 94bf32f2c449..c64a30aa8e18 100644
--- a/pl/math/tools/log10f.sollya
+++ b/math/tools/log10f.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating log10f(1+x)
//
-// Copyright (c) 2019-2023, Arm Limited.
+// Copyright (c) 2019-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
// Computation of log10f(1+x) will be carried out in double precision
diff --git a/pl/math/tools/log1p.sollya b/math/tools/log1p.sollya
index 598a36af0339..0cf72081fabb 100644
--- a/pl/math/tools/log1p.sollya
+++ b/math/tools/log1p.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating log(1+x) in double precision
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 20;
diff --git a/pl/math/tools/log1pf.sollya b/math/tools/log1pf.sollya
index cc1db10e4c0c..fc542c937111 100644
--- a/pl/math/tools/log1pf.sollya
+++ b/math/tools/log1pf.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating log(1+x) in single precision
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 10;
diff --git a/pl/math/tools/sincos.sollya b/math/tools/sincos.sollya
index 7d36266b446b..600368507f4e 100644
--- a/pl/math/tools/sincos.sollya
+++ b/math/tools/sincos.sollya
@@ -1,9 +1,9 @@
// polynomial for approximating cos(x)
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-// This script only finds the coeffs for cos - see math/aarch64/v_sin.c for sin coeffs
+// This script only finds the coeffs for cos - see math/aarch64/advsimd/sin.c for sin coeffs
deg = 14; // polynomial degree
a = -pi/4; // interval
diff --git a/pl/math/tools/sincosf.sollya b/math/tools/sincosf.sollya
index 178ee83ac196..add874e87a9a 100644
--- a/pl/math/tools/sincosf.sollya
+++ b/math/tools/sincosf.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating cos(x)
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
// This script only finds the coeffs for cos - see math/tools/sin.sollya for sin coeffs.
diff --git a/pl/math/tools/sinpi.sollya b/math/tools/sinpi.sollya
index 62cc87e7697d..9bc5b1c7fc2a 100644
--- a/pl/math/tools/sinpi.sollya
+++ b/math/tools/sinpi.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating sinpi(x)
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 19; // polynomial degree
diff --git a/pl/math/tools/tan.sollya b/math/tools/tan.sollya
index bb0bb28270e3..ca8a170bedaa 100644
--- a/pl/math/tools/tan.sollya
+++ b/math/tools/tan.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating double precision tan(x)
//
-// Copyright (c) 2023, Arm Limited.
+// Copyright (c) 2023-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 8;
diff --git a/pl/math/tools/tanf.sollya b/math/tools/tanf.sollya
index f4b49b40ae64..054d3db44046 100644
--- a/pl/math/tools/tanf.sollya
+++ b/math/tools/tanf.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating single precision tan(x)
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
dtype = single;
diff --git a/math/tools/tanpi.sollya b/math/tools/tanpi.sollya
new file mode 100644
index 000000000000..8edbc359ab8e
--- /dev/null
+++ b/math/tools/tanpi.sollya
@@ -0,0 +1,48 @@
+// polynomial for approximating tanpi/f(x)
+//
+// Copyright (c) 2024, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+// 0 for tanpi/f [0,0.25], 1 for tanpi/f [0.25,1]
+method = 0;
+dtype = double;
+
+if (dtype == single) then {
+ if (method == 0) then { deg = 5; }
+ else if (method == 1) then { deg = 3; };
+} else if (dtype == double) then {
+ if (method == 0) then { deg = 13; }
+ else if (method == 1) then { deg = 8; };
+};
+
+a = 0x1.0p-126;
+b = 1/4;
+
+if (method == 0) then {
+ g = tan(pi * x);
+ F = proc(P) { return pi * x + x^3 * P(x^2); };
+ f = (g(sqrt(x)) - pi * sqrt(x))/(x^(3/2));
+} else if (method == 1) then {
+ g = 1/tan(pi * x);
+ F = proc(P) { return 1/(pi * x) + x * P(x^2); };
+ f = (g(sqrt(x)) / sqrt(x)) - 1/(pi * x);
+};
+
+poly = fpminimax(f, deg, [|dtype ...|], [a*a;b*b]);
+
+//
+// Display coefficients in Sollya
+//
+display = hexadecimal!;
+if (dtype==double) then { prec = 53!; }
+else if (dtype==single) then { prec = 23!; };
+print("_coeffs :_ hex");
+for i from 0 to deg do coeff(poly, i);
+
+// Compute errors
+//display = hexadecimal!;
+d_rel_err = dirtyinfnorm(1-F(poly)/g(x), [a;b]);
+d_abs_err = dirtyinfnorm(g(x)-F(poly), [a;b]);
+print("dirty rel error:", d_rel_err);
+print("dirty abs error:", d_abs_err);
+print("in [",a,b,"]");
diff --git a/pl/math/tools/v_erf.sollya b/math/tools/v_erf.sollya
index 394ba377df12..5d7795842bcd 100644
--- a/pl/math/tools/v_erf.sollya
+++ b/math/tools/v_erf.sollya
@@ -2,7 +2,7 @@
// To generate coefficients for interval i (0 to 47) do:
// $ sollya v_erf.sollya $i
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
scale = 1/8;
diff --git a/pl/math/tools/v_erfc.sollya b/math/tools/v_erfc.sollya
index 3b03ba07863d..764b333d6d25 100644
--- a/pl/math/tools/v_erfc.sollya
+++ b/math/tools/v_erfc.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating erfc(x)*exp(x*x)
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 12; // poly degree
diff --git a/pl/math/tools/v_log10.sollya b/math/tools/v_log10.sollya
index e2df4364ada0..5181074f6762 100644
--- a/pl/math/tools/v_log10.sollya
+++ b/math/tools/v_log10.sollya
@@ -1,6 +1,6 @@
// polynomial used for __v_log10(x)
//
-// Copyright (c) 2019-2023, Arm Limited.
+// Copyright (c) 2019-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 6; // poly degree
diff --git a/pl/math/tools/v_log10f.sollya b/math/tools/v_log10f.sollya
index 396d5a92302b..4906cb1d2137 100644
--- a/pl/math/tools/v_log10f.sollya
+++ b/math/tools/v_log10f.sollya
@@ -1,6 +1,6 @@
// polynomial for approximating v_log10f(1+x)
//
-// Copyright (c) 2019-2023, Arm Limited.
+// Copyright (c) 2019-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 9; // poly degree
diff --git a/pl/math/tools/v_log2f.sollya b/math/tools/v_log2f.sollya
index 99e050c91b03..337d4830a2ae 100644
--- a/pl/math/tools/v_log2f.sollya
+++ b/math/tools/v_log2f.sollya
@@ -1,6 +1,6 @@
// polynomial used for __v_log2f(x)
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2022-2024, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 9; // poly degree
diff --git a/networking/Dir.mk b/networking/Dir.mk
index 2589e0a1f91c..b3ca2ff335e4 100644
--- a/networking/Dir.mk
+++ b/networking/Dir.mk
@@ -1,6 +1,6 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019-2020, Arm Limited.
+# Copyright (c) 2019-2025, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
S := $(srcdir)/networking
@@ -46,12 +46,12 @@ $(networking-objs): CFLAGS_ALL += $(networking-cflags)
build/lib/libnetworking.so: $(networking-lib-objs:%.o=%.os)
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
-build/lib/libnetworkinglib.a: $(networking-lib-objs)
+build/lib/libnetworking.a: $(networking-lib-objs)
rm -f $@
$(AR) rc $@ $^
$(RANLIB) $@
-build/bin/test/%: $(B)/test/%.o build/lib/libnetworkinglib.a
+build/bin/test/%: $(B)/test/%.o build/lib/libnetworking.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
build/include/%.h: $(S)/include/%.h
diff --git a/pl/Dir.mk b/pl/Dir.mk
deleted file mode 100644
index 2d007790d241..000000000000
--- a/pl/Dir.mk
+++ /dev/null
@@ -1,21 +0,0 @@
-# Makefile fragment - requires GNU make
-#
-# Copyright (c) 2022, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-# These targets are defined if we prescribe pl in SUBS.
-# It requires PLSUBS to be set.
-
-$(foreach sub,$(PLSUBS),$(eval include $(srcdir)/pl/$(sub)/Dir.mk))
-
-pl-files := $($(PLSUBS:%=pl/%-files))
-
-all-pl: $(PLSUBS:%=all-pl/%)
-
-check-pl: $(PLSUBS:%=check-pl/%)
-
-install-pl: $(PLSUBS:%=install-pl/%)
-
-clean-pl: $(PLSUBS:%=clean-pl/%)
-
-.PHONY: all-pl check-pl install-pl clean-pl
diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk
deleted file mode 100644
index 94b26cf3309c..000000000000
--- a/pl/math/Dir.mk
+++ /dev/null
@@ -1,216 +0,0 @@
-# Makefile fragment - requires GNU make
-#
-# Copyright (c) 2019-2024, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-PLM := $(srcdir)/pl/math
-AOR := $(srcdir)/math
-B := build/pl/math
-
-pl-lib-srcs := $(wildcard $(PLM)/*.[cS])
-
-ifeq ($(WANT_SVE_MATH), 0)
-pl-lib-srcs := $(filter-out $(PLM)/sv_%, $(pl-lib-srcs))
-endif
-
-math-test-srcs := \
- $(AOR)/test/mathtest.c \
- $(AOR)/test/mathbench.c \
- $(AOR)/test/ulp.c \
-
-math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS])
-
-pl-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h))
-pl-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h))
-
-pl-libs := \
- build/pl/lib/libmathlib.so \
- build/pl/lib/libmathlib.a \
-
-math-tools := \
- build/pl/bin/mathtest \
- build/pl/bin/mathbench \
- build/pl/bin/mathbench_libc \
- build/pl/bin/runulp.sh \
- build/pl/bin/ulp \
-
-math-host-tools := \
- build/pl/bin/rtest \
-
-pl-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(pl-lib-srcs)))
-math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs)))
-math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
-pl-target-objs := $(pl-lib-objs) $(math-test-objs)
-pl-objs := $(pl-target-objs) $(pl-target-objs:%.o=%.os) $(math-host-objs)
-
-pl/math-files := \
- $(pl-objs) \
- $(pl-libs) \
- $(math-tools) \
- $(math-host-tools) \
- $(pl-includes) \
- $(pl-test-includes) \
-
-all-pl/math: $(pl-libs) $(math-tools) $(pl-includes) $(pl-test-includes)
-
-$(pl-objs): $(pl-includes) $(pl-test-includes)
-$(pl-objs): CFLAGS_PL += $(math-cflags)
-$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno
-$(math-host-objs): CC = $(HOST_CC)
-$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS)
-
-$(B)/sv_%: CFLAGS_PL += $(math-sve-cflags)
-
-build/pl/include/test/ulp_funcs_gen.h: $(pl-lib-srcs)
- # Replace PL_SIG
- cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@
-
-build/pl/include/test/mathbench_funcs_gen.h: $(pl-lib-srcs)
- # Replace PL_SIG macros with mathbench func entries
- cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@
-
-build/pl/include/test/ulp_wrappers_gen.h: $(pl-lib-srcs)
- # Replace PL_SIG macros with ULP wrapper declarations
- cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@
-
-$(B)/test/ulp.o: $(AOR)/test/ulp.h build/pl/include/test/ulp_funcs_gen.h build/pl/include/test/ulp_wrappers_gen.h
-$(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test
-
-$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h
-$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test
-
-build/pl/lib/libmathlib.so: $(pl-lib-objs:%.o=%.os)
- $(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^
-
-build/pl/lib/libmathlib.a: $(pl-lib-objs)
- rm -f $@
- $(AR) rc $@ $^
- $(RANLIB) $@
-
-$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
-$(math-tools): LDLIBS += $(math-ldlibs) -lm
-# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
-$(math-tools): CFLAGS_PL += $(math-sve-cflags)
-
-# Some targets to build pl/math/test from math/test sources
-build/pl/math/test/%.o: $(srcdir)/math/test/%.S
- $(CC) $(CFLAGS_PL) -c -o $@ $<
-
-build/pl/math/test/%.o: $(srcdir)/math/test/%.c
- $(CC) $(CFLAGS_PL) -c -o $@ $<
-
-build/pl/math/test/%.os: $(srcdir)/math/test/%.S
- $(CC) $(CFLAGS_PL) -c -o $@ $<
-
-build/pl/math/test/%.os: $(srcdir)/math/test/%.c
- $(CC) $(CFLAGS_PL) -c -o $@ $<
-
-# Some targets to build pl/ sources using appropriate flags
-build/pl/%.o: $(srcdir)/pl/%.S
- $(CC) $(CFLAGS_PL) -c -o $@ $<
-
-build/pl/%.o: $(srcdir)/pl/%.c
- $(CC) $(CFLAGS_PL) -c -o $@ $<
-
-build/pl/%.os: $(srcdir)/pl/%.S
- $(CC) $(CFLAGS_PL) -c -o $@ $<
-
-build/pl/%.os: $(srcdir)/pl/%.c
- $(CC) $(CFLAGS_PL) -c -o $@ $<
-
-build/pl/bin/rtest: $(math-host-objs)
- $(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
-
-build/pl/bin/mathtest: $(B)/test/mathtest.o build/pl/lib/libmathlib.a
- $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
-
-build/pl/bin/mathbench: $(B)/test/mathbench.o build/pl/lib/libmathlib.a
- $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
-
-# This is not ideal, but allows custom symbols in mathbench to get resolved.
-build/pl/bin/mathbench_libc: $(B)/test/mathbench.o build/pl/lib/libmathlib.a
- $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/pl/lib/libmathlib.a -lm
-
-build/pl/bin/ulp: $(B)/test/ulp.o build/pl/lib/libmathlib.a
- $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
-
-build/pl/include/%.h: $(PLM)/include/%.h
- cp $< $@
-
-build/pl/include/test/%.h: $(PLM)/test/%.h
- cp $< $@
-
-build/pl/bin/%.sh: $(PLM)/test/%.sh
- cp $< $@
-
-pl-math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst)
-pl-math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst)
-
-check-pl/math-test: $(math-tools)
- cat $(pl-math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags)
-
-check-pl/math-rtest: $(math-host-tools) $(math-tools)
- cat $(pl-math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags)
-
-ulp-input-dir=$(B)/test/inputs
-
-math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(pl-lib-srcs)))
-math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(pl-lib-srcs)))
-math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(pl-lib-srcs)))
-
-ulp-inputs = $(math-lib-lims) $(math-lib-fenvs) $(math-lib-itvs)
-
-$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags)
-
-$(ulp-input-dir)/%.ulp: $(PLM)/%.c
- mkdir -p $(@D)
- $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@
-
-$(ulp-input-dir)/%.fenv: $(PLM)/%.c
- mkdir -p $(@D)
- $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@
-
-$(ulp-input-dir)/%.itv: $(PLM)/%.c
- mkdir -p $(dir $@)
- $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_INTERVAL " || true; } | sed "s/ PL_TEST_INTERVAL/\nPL_TEST_INTERVAL/g" > $@
-
-ulp-lims := $(ulp-input-dir)/limits
-$(ulp-lims): $(math-lib-lims)
- cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@
-
-fenv-exps := $(ulp-input-dir)/fenv
-$(fenv-exps): $(math-lib-fenvs)
- cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@
-
-ulp-itvs := $(ulp-input-dir)/intervals
-$(ulp-itvs): $(math-lib-itvs)
- cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@
-
-check-pl/math-ulp: $(math-tools) $(ulp-lims) $(fenv-exps) $(ulp-itvs)
- WANT_SVE_MATH=$(WANT_SVE_MATH) \
- ULPFLAGS="$(math-ulpflags)" \
- LIMITS=../../../$(ulp-lims) \
- INTERVALS=../../../$(ulp-itvs) \
- FENV=../../../$(fenv-exps) \
- FUNC=$(func) \
- build/pl/bin/runulp.sh $(EMULATOR)
-
-check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp
-
-$(DESTDIR)$(libdir)/pl/%.so: build/pl/lib/%.so
- $(INSTALL) -D $< $@
-
-$(DESTDIR)$(libdir)/pl/%: build/pl/lib/%
- $(INSTALL) -m 644 -D $< $@
-
-$(DESTDIR)$(includedir)/pl/%: build/pl/include/%
- $(INSTALL) -m 644 -D $< $@
-
-install-pl/math: \
- $(pl-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
- $(pl-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
-
-clean-pl/math:
- rm -f $(pl/math-files)
-
-.PHONY: all-pl/math check-pl/math-test check-pl/math-rtest check-pl/math-ulp check-pl/math install-pl/math clean-pl/math
diff --git a/pl/math/asinhf_data.c b/pl/math/asinhf_data.c
deleted file mode 100644
index cd1ef16b3b6a..000000000000
--- a/pl/math/asinhf_data.c
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Coefficients for single-precision asinh(x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Approximate asinhf(x) directly in [2^-12, 1]. See tools/asinhf.sollya for
-   how these coeffs were generated. */
-const struct asinhf_data __asinhf_data
- = {.coeffs
- = {-0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f, 0x1.3a81dcp-4f,
- 0x1.65bbaap-10f, -0x1.057f1p-4f, 0x1.6c1d46p-5f, -0x1.4cafe8p-7f}};
diff --git a/pl/math/atan_data.c b/pl/math/atan_data.c
deleted file mode 100644
index 91d0f61d2eaf..000000000000
--- a/pl/math/atan_data.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x).
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-const struct atan_poly_data __atan_poly_data = {
- .poly = {/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
- [2**-1022, 1.0]. See atan.sollya for details of how these were
- generated. */
- -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
- 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
- -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
- 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
- -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
- 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
- -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16}};
diff --git a/pl/math/atanf_data.c b/pl/math/atanf_data.c
deleted file mode 100644
index c4cba2378cea..000000000000
--- a/pl/math/atanf_data.c
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x).
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0].
- */
-const struct atanf_poly_data __atanf_poly_data = {
- .poly = {/* See atanf.sollya for details of how these were generated. */
- -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
- -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f}};
diff --git a/pl/math/exp_data.c b/pl/math/exp_data.c
deleted file mode 100644
index 2354be76cfab..000000000000
--- a/pl/math/exp_data.c
+++ /dev/null
@@ -1,1120 +0,0 @@
-/*
- * Shared data between exp, exp2 and pow.
- *
- * Copyright (c) 2018-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-#define N (1 << EXP_TABLE_BITS)
-
-const struct exp_data __exp_data = {
-// N/ln2
-.invln2N = 0x1.71547652b82fep0 * N,
-// -ln2/N
-#if N == 64
-.negln2hiN = -0x1.62e42fefa0000p-7,
-.negln2loN = -0x1.cf79abc9e3b3ap-46,
-#elif N == 128
-.negln2hiN = -0x1.62e42fefa0000p-8,
-.negln2loN = -0x1.cf79abc9e3b3ap-47,
-#elif N == 256
-.negln2hiN = -0x1.62e42fefc0000p-9,
-.negln2loN = 0x1.c610ca86c3899p-45,
-#elif N == 512
-.negln2hiN = -0x1.62e42fef80000p-10,
-.negln2loN = -0x1.1cf79abc9e3b4p-45,
-#endif
-// Used for rounding when !TOINT_INTRINSICS
-#if EXP_USE_TOINT_NARROW
-.shift = 0x1800000000.8p0,
-#else
-.shift = 0x1.8p52,
-#endif
-// exp polynomial coefficients.
-.poly = {
-#if N == 64 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE
-// abs error: 1.5543*2^-60
-// ulp error: 0.529 (0.533 without fma)
-// if |x| < ln2/128+eps
-// abs error if |x| < ln2/64: 1.7157*2^-50
-0x1.fffffffffdbcdp-2,
-0x1.555555555444cp-3,
-0x1.555573c6a9f7dp-5,
-0x1.1111266d28935p-7,
-#elif N == 64 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE
-// abs error: 1.6735*2^-64
-// ulp error: 0.518 (0.522 without fma)
-// if |x| < ln2/64
-0x1.5555555548f9ap-3,
-0x1.555555554bf5dp-5,
-0x1.11115b75f0f4dp-7,
-0x1.6c171a6b6303ep-10,
-#elif N == 128 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE
-// abs error: 1.555*2^-66
-// ulp error: 0.509 (0.511 without fma)
-// if |x| < ln2/256+eps
-// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65
-// abs error if |x| < ln2/128: 1.7145*2^-56
-0x1.ffffffffffdbdp-2,
-0x1.555555555543cp-3,
-0x1.55555cf172b91p-5,
-0x1.1111167a4d017p-7,
-#elif N == 128 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE
-// abs error: 1.5542*2^-60
-// ulp error: 0.521 (0.523 without fma)
-// if |x| < ln2/128
-0x1.fffffffffdbcep-2,
-0x1.55555555543c2p-3,
-0x1.555573c64f2e3p-5,
-0x1.111126b4eff73p-7,
-#elif N == 128 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE
-// abs error: 1.6861*2^-71
-// ulp error: 0.509 (0.511 without fma)
-// if |x| < ln2/128
-0x1.55555555548fdp-3,
-0x1.555555555658fp-5,
-0x1.111123a859bb6p-7,
-0x1.6c16ba6920cabp-10,
-#elif N == 256 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE
-// abs error: 1.43*2^-58
-// ulp error: 0.549 (0.550 without fma)
-// if |x| < ln2/512
-0x1p0, // unused
-0x1.fffffffffffd4p-2,
-0x1.5555571d6ef9p-3,
-0x1.5555576a5adcep-5,
-#elif N == 256 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE
-// abs error: 1.5547*2^-66
-// ulp error: 0.505 (0.506 without fma)
-// if |x| < ln2/256
-0x1.ffffffffffdbdp-2,
-0x1.555555555543cp-3,
-0x1.55555cf16e1edp-5,
-0x1.1111167a4b553p-7,
-#elif N == 512 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE
-// abs error: 1.4300*2^-63
-// ulp error: 0.504
-// if |x| < ln2/1024
-// abs error if |x| < ln2/512: 1.0689*2^-55
-0x1p0, // unused
-0x1.ffffffffffffdp-2,
-0x1.555555c75bb6p-3,
-0x1.555555dec04a8p-5,
-#endif
-},
-.exp2_shift = 0x1.8p52 / N,
-// exp2 polynomial coefficients.
-.exp2_poly = {
-#if N == 64 && EXP2_POLY_ORDER == 6 && EXP2_POLY_WIDE
-// abs error: 1.3054*2^-63
-// ulp error: 0.515
-// if |x| < 1/64
-0x1.62e42fefa39efp-1,
-0x1.ebfbdff82c58fp-3,
-0x1.c6b08d7045cf1p-5,
-0x1.3b2ab6fb8fd0ep-7,
-0x1.5d884afec48d7p-10,
-0x1.43097dc684ae1p-13,
-#elif N == 128 && EXP2_POLY_ORDER == 5 && !EXP2_POLY_WIDE
-// abs error: 1.2195*2^-65
-// ulp error: 0.507 (0.511 without fma)
-// if |x| < 1/256
-// abs error if |x| < 1/128: 1.9941*2^-56
-0x1.62e42fefa39efp-1,
-0x1.ebfbdff82c424p-3,
-0x1.c6b08d70cf4b5p-5,
-0x1.3b2abd24650ccp-7,
-0x1.5d7e09b4e3a84p-10,
-#elif N == 256 && EXP2_POLY_ORDER == 5 && EXP2_POLY_WIDE
-// abs error: 1.2195*2^-65
-// ulp error: 0.504 (0.508 without fma)
-// if |x| < 1/256
-0x1.62e42fefa39efp-1,
-0x1.ebfbdff82c424p-3,
-0x1.c6b08d70cf4b5p-5,
-0x1.3b2abd24650ccp-7,
-0x1.5d7e09b4e3a84p-10,
-#elif N == 512 && EXP2_POLY_ORDER == 4 && !EXP2_POLY_WIDE
-// abs error: 1.4411*2^-64
-// ulp error: 0.5024 (0.5063 without fma)
-// if |x| < 1/1024
-// abs error if |x| < 1/512: 1.9430*2^-56
-0x1.62e42fefa39ecp-1,
-0x1.ebfbdff82c58bp-3,
-0x1.c6b08e46de41fp-5,
-0x1.3b2ab786ee1dap-7,
-#endif
-},
-// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
-// tab[2*k] = asuint64(T[k])
-// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
-.tab = {
-#if N == 64
-0x0, 0x3ff0000000000000,
-0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
-0x3c8cd2523567f613, 0x3fefd9b0d3158574,
-0x3c60f74e61e6c861, 0x3fefc74518759bc8,
-0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
-0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
-0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
-0xbc91c923b9d5f416, 0x3fef829aaea92de0,
-0xbc801b15eaa59348, 0x3fef72b83c7d517b,
-0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
-0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
-0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
-0x3c968efde3a8a894, 0x3fef387a6e756238,
-0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
-0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
-0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
-0x3c834d754db0abb6, 0x3fef06fe0a31b715,
-0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
-0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
-0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
-0x3c859f48a72a4c6d, 0x3feedea64c123422,
-0xbc58a78f4817895b, 0x3feed60a21f72e2a,
-0x3c4363ed60c2ac11, 0x3feece086061892d,
-0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
-0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
-0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
-0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
-0x3c93350518fdd78e, 0x3feeaf4736b527da,
-0x3c9063e1e21c5409, 0x3feeab07dd485429,
-0x3c9432e62b64c035, 0x3feea76f15ad2148,
-0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
-0xbc93cedd78565858, 0x3feea23882552225,
-0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
-0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
-0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
-0xbc8619321e55e68a, 0x3fee9feb564267c9,
-0xbc7b32dcb94da51d, 0x3feea11473eb0187,
-0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
-0xbc9369b6f13b3734, 0x3feea589994cce13,
-0xbc94d450d872576e, 0x3feea8d99b4492ed,
-0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
-0x3c7bf68359f35f44, 0x3feeb1ae99157736,
-0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
-0xbc92434322f4f9aa, 0x3feebd829fde4e50,
-0x3c71affc2b91ce27, 0x3feec49182a3f090,
-0xbc87c50422622263, 0x3feecc667b5de565,
-0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
-0x3c8469846e735ab3, 0x3feede6b5579fdbf,
-0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
-0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
-0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
-0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
-0x3c736eae30af0cb3, 0x3fef199bdd85529c,
-0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
-0x3c676b2c6c921968, 0x3fef3720dcef9069,
-0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
-0x3c74a385a63d07a7, 0x3fef5818dcfba487,
-0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
-0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
-0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
-0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
-0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
-0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
-0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
-#elif N == 128
-0x0, 0x3ff0000000000000,
-0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
-0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
-0xbc905e7a108766d1, 0x3fefe315e86e7f85,
-0x3c8cd2523567f613, 0x3fefd9b0d3158574,
-0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
-0x3c60f74e61e6c861, 0x3fefc74518759bc8,
-0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
-0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
-0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
-0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
-0xbc6a033489906e0b, 0x3fef9b66affed31b,
-0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
-0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
-0xbc91c923b9d5f416, 0x3fef829aaea92de0,
-0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
-0xbc801b15eaa59348, 0x3fef72b83c7d517b,
-0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
-0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
-0xbc96d99c7611eb26, 0x3fef5be084045cd4,
-0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
-0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
-0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
-0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
-0x3c968efde3a8a894, 0x3fef387a6e756238,
-0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
-0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
-0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
-0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
-0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
-0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
-0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
-0x3c834d754db0abb6, 0x3fef06fe0a31b715,
-0x3c864201e2ac744c, 0x3fef0170fc4cd831,
-0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
-0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
-0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
-0xbc9907f81b512d8e, 0x3feeecae6d05d866,
-0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
-0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
-0x3c859f48a72a4c6d, 0x3feedea64c123422,
-0xbc9312607a28698a, 0x3feeda4504ac801c,
-0xbc58a78f4817895b, 0x3feed60a21f72e2a,
-0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
-0x3c4363ed60c2ac11, 0x3feece086061892d,
-0x3c9666093b0664ef, 0x3feeca41ed1d0057,
-0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
-0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
-0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
-0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
-0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
-0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
-0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
-0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
-0x3c93350518fdd78e, 0x3feeaf4736b527da,
-0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
-0x3c9063e1e21c5409, 0x3feeab07dd485429,
-0x3c34c7855019c6ea, 0x3feea9268a5946b7,
-0x3c9432e62b64c035, 0x3feea76f15ad2148,
-0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
-0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
-0xbc845378892be9ae, 0x3feea34634ccc320,
-0xbc93cedd78565858, 0x3feea23882552225,
-0x3c5710aa807e1964, 0x3feea155d44ca973,
-0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
-0xbc6a12ad8734b982, 0x3feea012750bdabf,
-0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
-0xbc80dc3d54e08851, 0x3fee9f7df9519484,
-0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
-0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
-0xbc8619321e55e68a, 0x3fee9feb564267c9,
-0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
-0xbc7b32dcb94da51d, 0x3feea11473eb0187,
-0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
-0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
-0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
-0xbc9369b6f13b3734, 0x3feea589994cce13,
-0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
-0xbc94d450d872576e, 0x3feea8d99b4492ed,
-0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
-0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
-0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
-0x3c7bf68359f35f44, 0x3feeb1ae99157736,
-0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
-0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
-0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
-0xbc92434322f4f9aa, 0x3feebd829fde4e50,
-0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
-0x3c71affc2b91ce27, 0x3feec49182a3f090,
-0x3c6dd235e10a73bb, 0x3feec86319e32323,
-0xbc87c50422622263, 0x3feecc667b5de565,
-0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
-0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
-0x3c90cc319cee31d2, 0x3feed99e1330b358,
-0x3c8469846e735ab3, 0x3feede6b5579fdbf,
-0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
-0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
-0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
-0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
-0xbc90a40e3da6f640, 0x3feef9728de5593a,
-0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
-0xbc91eee26b588a35, 0x3fef05b030a1064a,
-0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
-0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
-0x3c736eae30af0cb3, 0x3fef199bdd85529c,
-0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
-0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
-0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
-0x3c676b2c6c921968, 0x3fef3720dcef9069,
-0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
-0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
-0xbc900dae3875a949, 0x3fef4f87080d89f2,
-0x3c74a385a63d07a7, 0x3fef5818dcfba487,
-0xbc82919e2040220f, 0x3fef60e316c98398,
-0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
-0x3c843a59ac016b4b, 0x3fef7321f301b460,
-0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
-0xbc892ab93b470dc9, 0x3fef864614f5a129,
-0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
-0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
-0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
-0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
-0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
-0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
-0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
-0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
-0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
-0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
-#elif N == 256
-0x0, 0x3ff0000000000000,
-0xbc84e82fc61851ac, 0x3feffb1afa5abcbf,
-0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
-0xbc82985dd8521d32, 0x3feff168143b0281,
-0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
-0x3c651e617061bfbd, 0x3fefe7d42e11bbcc,
-0xbc905e7a108766d1, 0x3fefe315e86e7f85,
-0x3c845fad437fa426, 0x3fefde5f72f654b1,
-0x3c8cd2523567f613, 0x3fefd9b0d3158574,
-0xbc954529642b232f, 0x3fefd50a0e3c1f89,
-0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
-0x3c8293708ef5c32e, 0x3fefcbd42b72a836,
-0x3c60f74e61e6c861, 0x3fefc74518759bc8,
-0xbc95b9280905b2a4, 0x3fefc2bdf66607e0,
-0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
-0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919,
-0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
-0x3c9407fb30d06420, 0x3fefb0f145e46c85,
-0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
-0xbc9a5d04b3b9911b, 0x3fefa83b23395dec,
-0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
-0xbc937a01f0739546, 0x3fef9fa55fdfa9c5,
-0xbc6a033489906e0b, 0x3fef9b66affed31b,
-0x3c8b8268b04ef0a5, 0x3fef973028d7233e,
-0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
-0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6,
-0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
-0xbc65704e90c9f860, 0x3fef86a814f204ab,
-0xbc91c923b9d5f416, 0x3fef829aaea92de0,
-0xbc897cea57e46280, 0x3fef7e95934f312e,
-0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
-0x3c56f01429e2b9d2, 0x3fef76a45471c3c2,
-0xbc801b15eaa59348, 0x3fef72b83c7d517b,
-0x3c6e653b2459034b, 0x3fef6ed48695bbc0,
-0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
-0x3c92cc7ea345b7dc, 0x3fef672658375d2f,
-0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
-0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c,
-0xbc96d99c7611eb26, 0x3fef5be084045cd4,
-0x3c8cdc1873af2155, 0x3fef582f95281c6b,
-0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
-0xbc9493684653a131, 0x3fef50e75eb44027,
-0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
-0xbc98e2899077520a, 0x3fef49c18438ce4d,
-0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
-0x3c9120fcd4f59273, 0x3fef42be3578a819,
-0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
-0x3c89b788c188c9b8, 0x3fef3bdda27912d1,
-0x3c968efde3a8a894, 0x3fef387a6e756238,
-0x3c877afbca90ef84, 0x3fef351ffb82140a,
-0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
-0x3c91512f082876ee, 0x3fef2e85711ece75,
-0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
-0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29,
-0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
-0xbc803297e78260bf, 0x3fef21ba7591bb70,
-0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
-0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13,
-0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
-0xbc91e75c40b4251e, 0x3fef157e39771b2f,
-0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
-0x3c98a911f1f7785a, 0x3fef0f961f641589,
-0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
-0xbc61e7c998db7dbb, 0x3fef09d24abd886b,
-0x3c834d754db0abb6, 0x3fef06fe0a31b715,
-0x3c85425c11faadf4, 0x3fef0432edeeb2fd,
-0x3c864201e2ac744c, 0x3fef0170fc4cd831,
-0xbc979517a03e2847, 0x3feefeb83ba8ea32,
-0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
-0xbc800e2a46da4bee, 0x3feef96266e3fa2d,
-0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
-0xbc87430803972b34, 0x3feef431a2de883b,
-0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
-0xbc954de30ae02d94, 0x3feeef26231e754a,
-0xbc9907f81b512d8e, 0x3feeecae6d05d866,
-0xbc94f2487e1c03ec, 0x3feeea401b7140ef,
-0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
-0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4,
-0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
-0x3c79c3bba5562a2f, 0x3feee0e544ede173,
-0x3c859f48a72a4c6d, 0x3feedea64c123422,
-0xbc85a71612e21658, 0x3feedc70df1c5175,
-0xbc9312607a28698a, 0x3feeda4504ac801c,
-0x3c86421f6f1d24d6, 0x3feed822c367a024,
-0xbc58a78f4817895b, 0x3feed60a21f72e2a,
-0xbc9348a6815fce65, 0x3feed3fb2709468a,
-0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
-0x3c835c43984d9871, 0x3feecffa3f84b9d4,
-0x3c4363ed60c2ac11, 0x3feece086061892d,
-0xbc632afc8d9473a0, 0x3feecc2042a7d232,
-0x3c9666093b0664ef, 0x3feeca41ed1d0057,
-0xbc95fc5e44de020e, 0x3feec86d668b3237,
-0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
-0xbc7ea0148327c42f, 0x3feec4e1e192aed2,
-0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
-0xbc7a843ad1a88022, 0x3feec17dea6db7d7,
-0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
-0x3c892ca3bf144e63, 0x3feebe41b817c114,
-0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
-0xbc902c99b04aa8b0, 0x3feebb2d81d8abff,
-0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
-0x3c73e34f67e67118, 0x3feeb8417f4531ee,
-0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
-0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef,
-0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
-0x3c81bd2888075068, 0x3feeb2e2f4f6ad27,
-0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
-0xbc896be8ae89ef8f, 0x3feeb070dde910d2,
-0x3c93350518fdd78e, 0x3feeaf4736b527da,
-0xbc88e6ac90348602, 0x3feeae27dbe2c4cf,
-0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
-0xbc91af7f1365c3ac, 0x3feeac0827ff07cc,
-0x3c9063e1e21c5409, 0x3feeab07dd485429,
-0xbc943a3540d1898a, 0x3feeaa11fba87a03,
-0x3c34c7855019c6ea, 0x3feea9268a5946b7,
-0xbc951f58ddaa8090, 0x3feea84590998b93,
-0x3c9432e62b64c035, 0x3feea76f15ad2148,
-0xbc82e1648e50a17c, 0x3feea6a320dceb71,
-0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
-0x3c95f30eda98a575, 0x3feea52ae6cdf6f4,
-0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
-0x3c917ecda8a72159, 0x3feea3dd1d1929fd,
-0xbc845378892be9ae, 0x3feea34634ccc320,
-0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7,
-0xbc93cedd78565858, 0x3feea23882552225,
-0xbc85c33fdf910406, 0x3feea1c1c70833f6,
-0x3c5710aa807e1964, 0x3feea155d44ca973,
-0x3c81079ab5789604, 0x3feea0f4b19e9538,
-0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
-0x3c727df161cd7778, 0x3feea052fa75173e,
-0xbc6a12ad8734b982, 0x3feea012750bdabf,
-0x3c93f9924a05b767, 0x3fee9fdcddd47645,
-0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
-0xbc87557939a8b5ef, 0x3fee9f9298593ae5,
-0xbc80dc3d54e08851, 0x3fee9f7df9519484,
-0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87,
-0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
-0xbc88e67a9006c909, 0x3fee9f8286ead08a,
-0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
-0x3c86597566977ac8, 0x3fee9fbd35d7cbfd,
-0xbc8619321e55e68a, 0x3fee9feb564267c9,
-0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09,
-0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
-0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6,
-0xbc7b32dcb94da51d, 0x3feea11473eb0187,
-0xbc92dad3519d7b5b, 0x3feea17b0976cfdb,
-0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
-0x3c87d51410fd15c2, 0x3feea26a62ff86f0,
-0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
-0xbc760a3629969871, 0x3feea3878491c491,
-0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
-0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9,
-0xbc9369b6f13b3734, 0x3feea589994cce13,
-0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7,
-0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
-0xbc522cea4f3afa1e, 0x3feea7f4179f5b21,
-0xbc94d450d872576e, 0x3feea8d99b4492ed,
-0x3c7c88549b958471, 0x3feea9cad931a436,
-0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
-0x3c931143962f7877, 0x3feeabd0a478580f,
-0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
-0x3c93e9e96f112479, 0x3feeae05bad61778,
-0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
-0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9,
-0x3c7bf68359f35f44, 0x3feeb1ae99157736,
-0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a,
-0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
-0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2,
-0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
-0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5,
-0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
-0xbc51669428996971, 0x3feebbdd9a7670b3,
-0xbc92434322f4f9aa, 0x3feebd829fde4e50,
-0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2,
-0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
-0xbc9294f304f166b6, 0x3feec2bb4d53fe0d,
-0x3c71affc2b91ce27, 0x3feec49182a3f090,
-0xbc8a1e58414c07d3, 0x3feec674194bb8d5,
-0x3c6dd235e10a73bb, 0x3feec86319e32323,
-0xbc79740b58a20091, 0x3feeca5e8d07f29e,
-0xbc87c50422622263, 0x3feecc667b5de565,
-0x3c9165830a2b96c2, 0x3feece7aed8eb8bb,
-0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
-0xbc903d5cbe27874b, 0x3feed2c980460ad8,
-0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
-0x3c5986178980fce0, 0x3feed74a8af46052,
-0x3c90cc319cee31d2, 0x3feed99e1330b358,
-0xbc89472975b1f2a5, 0x3feedbfe53c12e59,
-0x3c8469846e735ab3, 0x3feede6b5579fdbf,
-0x3c7d8157a34b7e7f, 0x3feee0e521356eba,
-0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
-0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774,
-0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
-0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff,
-0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
-0x3c889c2ea41433c7, 0x3feef0ce6c9a8952,
-0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
-0xbc7274aedac8ff80, 0x3feef68415b749b1,
-0xbc90a40e3da6f640, 0x3feef9728de5593a,
-0x3c85c620ce76df06, 0x3feefc6e29f1c52a,
-0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
-0xbc8fda52e1b51e41, 0x3fef028cf22749e4,
-0xbc91eee26b588a35, 0x3fef05b030a1064a,
-0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f,
-0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
-0xbc302899507554e5, 0x3fef0f69c3f3a207,
-0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
-0xbc80dda2d4c0010c, 0x3fef16286141b33d,
-0x3c736eae30af0cb3, 0x3fef199bdd85529c,
-0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c,
-0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
-0x3c836909391181d3, 0x3fef244778fafb22,
-0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
-0xbc811cd7dbdf9547, 0x3fef2ba88988c933,
-0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
-0xbc7ac28b7bef6621, 0x3fef33405751c4db,
-0x3c676b2c6c921968, 0x3fef3720dcef9069,
-0xbc7030587207b9e1, 0x3fef3b0f2e6d1675,
-0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
-0xbc8cc734592af7fc, 0x3fef43155b5bab74,
-0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
-0x3c87752a44f587e8, 0x3fef4b532b08c968,
-0xbc900dae3875a949, 0x3fef4f87080d89f2,
-0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6,
-0x3c74a385a63d07a7, 0x3fef5818dcfba487,
-0x3c5159d9d908a96e, 0x3fef5c76e862e6d3,
-0xbc82919e2040220f, 0x3fef60e316c98398,
-0x3c8c254d16117a68, 0x3fef655d71ff6075,
-0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
-0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315,
-0x3c843a59ac016b4b, 0x3fef7321f301b460,
-0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658,
-0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
-0xbc63e8e3eab2cbb4, 0x3fef81676b197d17,
-0xbc892ab93b470dc9, 0x3fef864614f5a129,
-0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12,
-0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
-0xbc776caa4c2ff1cf, 0x3fef953924676d76,
-0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
-0xbc81d5fc525d9940, 0x3fef9f7977cdb740,
-0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
-0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e,
-0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
-0x3c8269947c2bed4a, 0x3fefb4aaa2188510,
-0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
-0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a,
-0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
-0xbc69fa74878ba7c7, 0x3fefcac948dd7274,
-0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
-0x3c901f3a75ee0efe, 0x3fefd632798844f8,
-0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
-0xbc516a9ce6ed84fa, 0x3fefe1d802243c89,
-0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
-0xbc699c7db2effc76, 0x3fefedba3692d514,
-0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
-0x3c64b458677f9840, 0x3feff9d96b2a23d9,
-#elif N == 512
-0x0, 0x3ff0000000000000,
-0xbc75d87ade1f60d5, 0x3feffd8c86da1c0a,
-0xbc84e82fc61851ac, 0x3feffb1afa5abcbf,
-0x3c9bffdaa7ac4bac, 0x3feff8ab5b2cbd11,
-0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
-0x3c75c18e5ae0563a, 0x3feff3d1e77170b4,
-0xbc82985dd8521d32, 0x3feff168143b0281,
-0xbc705b1125cf49a5, 0x3fefef003103b10e,
-0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
-0x3c9f879abbff3f87, 0x3fefea363d42b027,
-0x3c651e617061bfbd, 0x3fefe7d42e11bbcc,
-0x3c9b14003824712a, 0x3fefe57411915a8a,
-0xbc905e7a108766d1, 0x3fefe315e86e7f85,
-0x3c61cbf0f38af658, 0x3fefe0b9b35659d8,
-0x3c845fad437fa426, 0x3fefde5f72f654b1,
-0xbc9a3316383dcbc5, 0x3fefdc0727fc1762,
-0x3c8cd2523567f613, 0x3fefd9b0d3158574,
-0x3c9901c9e0e797fd, 0x3fefd75c74f0bec2,
-0xbc954529642b232f, 0x3fefd50a0e3c1f89,
-0xbc89b3236d111646, 0x3fefd2b99fa6407c,
-0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
-0xbc8cb191be99b1b0, 0x3fefce1ead925493,
-0x3c8293708ef5c32e, 0x3fefcbd42b72a836,
-0xbc9acb71e83765b7, 0x3fefc98ba42e7d30,
-0x3c60f74e61e6c861, 0x3fefc74518759bc8,
-0x3c5cd3e58b03697e, 0x3fefc50088f8093f,
-0xbc95b9280905b2a4, 0x3fefc2bdf66607e0,
-0xbc8bfb07d4755452, 0x3fefc07d61701716,
-0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
-0x3c8aedeb3e7b14cd, 0x3fefbc02331b9715,
-0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919,
-0x3c9a8eb1f3d914b4, 0x3fefb78f03834e52,
-0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
-0xbc85b9eb0402507b, 0x3fefb323d833d93f,
-0x3c9407fb30d06420, 0x3fefb0f145e46c85,
-0xbc93f0f225bbf3ee, 0x3fefaec0b6bdae53,
-0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
-0xbc9c3fe7282d1784, 0x3fefaa65a4b520ba,
-0xbc9a5d04b3b9911b, 0x3fefa83b23395dec,
-0x3c9c8be44bf4cde8, 0x3fefa612a7b26300,
-0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
-0x3c820c5444c93c44, 0x3fefa1c7c55189c6,
-0xbc937a01f0739546, 0x3fef9fa55fdfa9c5,
-0xbc84c6baeb580d7a, 0x3fef9d8503328e6d,
-0xbc6a033489906e0b, 0x3fef9b66affed31b,
-0x3c8657aa1b0d9f83, 0x3fef994a66f951ce,
-0x3c8b8268b04ef0a5, 0x3fef973028d7233e,
-0x3c62f2c7fd6ee145, 0x3fef9517f64d9ef1,
-0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
-0xbc6b0b2789925e90, 0x3fef90edb6db2dc1,
-0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6,
-0xbc93aad17d197fae, 0x3fef8ccbae51a5c8,
-0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
-0xbc989c464a07ad70, 0x3fef88b1e264a0e9,
-0xbc65704e90c9f860, 0x3fef86a814f204ab,
-0xbc72c338fce197f4, 0x3fef84a058cbae1e,
-0xbc91c923b9d5f416, 0x3fef829aaea92de0,
-0xbc6dca724cea0eb6, 0x3fef809717425438,
-0xbc897cea57e46280, 0x3fef7e95934f312e,
-0x3c464770b955d34d, 0x3fef7c962388149e,
-0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
-0xbc962811c114424f, 0x3fef789d83606e12,
-0x3c56f01429e2b9d2, 0x3fef76a45471c3c2,
-0x3c8ec58e74904dd4, 0x3fef74ad3c92df73,
-0xbc801b15eaa59348, 0x3fef72b83c7d517b,
-0x3c8d63b0ab2d5bbf, 0x3fef70c554eaea89,
-0x3c6e653b2459034b, 0x3fef6ed48695bbc0,
-0xbc9ca9effbeeac92, 0x3fef6ce5d23816c9,
-0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
-0x3c8bda920de0f6e2, 0x3fef690eba4df41f,
-0x3c92cc7ea345b7dc, 0x3fef672658375d2f,
-0xbc9a597f9a5ff71c, 0x3fef654013041dc2,
-0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
-0x3c50835b125aa573, 0x3fef6179e2363cf8,
-0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c,
-0x3c8aaa13d61aec1f, 0x3fef5dbc2dc40bf0,
-0xbc96d99c7611eb26, 0x3fef5be084045cd4,
-0x3c8a4f81aa7110bd, 0x3fef5a06fb91588f,
-0x3c8cdc1873af2155, 0x3fef582f95281c6b,
-0xbc6817fd6a313e3e, 0x3fef565a51860746,
-0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
-0xbc96236af85fd26a, 0x3fef52b6358e15e8,
-0xbc9493684653a131, 0x3fef50e75eb44027,
-0x3c7795eb4523abe7, 0x3fef4f1aad999e82,
-0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
-0x3c8fe58b91b40095, 0x3fef4b87bf9cda38,
-0xbc98e2899077520a, 0x3fef49c18438ce4d,
-0x3c91ecaa860c614a, 0x3fef47fd7190241e,
-0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
-0xbc3e45c83ba0bbcb, 0x3fef447bc96ffc18,
-0x3c9120fcd4f59273, 0x3fef42be3578a819,
-0xbc29fd3bea07b4ee, 0x3fef4102cd3d09b9,
-0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
-0x3c87f1c7350e256d, 0x3fef3d9282fc1f27,
-0x3c89b788c188c9b8, 0x3fef3bdda27912d1,
-0x3c420dac6c124f4f, 0x3fef3a2af0b63bff,
-0x3c968efde3a8a894, 0x3fef387a6e756238,
-0xbc99501d09bc09fd, 0x3fef36cc1c78903a,
-0x3c877afbca90ef84, 0x3fef351ffb82140a,
-0x3c73baf864dc8675, 0x3fef33760c547f15,
-0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
-0x3c91b0575c1eaf54, 0x3fef3028c65fa1ff,
-0x3c91512f082876ee, 0x3fef2e85711ece75,
-0xbc90364bc9ce33ab, 0x3fef2ce450b3cb82,
-0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
-0xbc7548165d85ed32, 0x3fef29a8b16f0a30,
-0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29,
-0x3c7c3b977a68e32c, 0x3fef2675eeb3ab98,
-0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
-0xbc93a255f697ecfe, 0x3fef234c0ea83f36,
-0xbc803297e78260bf, 0x3fef21ba7591bb70,
-0x3c8d2d19edc1e550, 0x3fef202b17779965,
-0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
-0xbc76b2173113dd8c, 0x3fef1d130f50d65c,
-0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13,
-0x3c811aa5f853590b, 0x3fef1a03fc675d1f,
-0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
-0x3c61d61a34c8aa02, 0x3fef16fde4f2e280,
-0xbc91e75c40b4251e, 0x3fef157e39771b2f,
-0xbc91f892bf6b286d, 0x3fef1400cf2f6c18,
-0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
-0x3c7590c65c20e680, 0x3fef110cc15d5346,
-0x3c98a911f1f7785a, 0x3fef0f961f641589,
-0x3c86fe320b5c1e9d, 0x3fef0e21c1c14833,
-0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
-0xbc903cd8b2f25790, 0x3fef0b3fd6a454d2,
-0xbc61e7c998db7dbb, 0x3fef09d24abd886b,
-0x3c7b3bf786a54a87, 0x3fef08670653dfe4,
-0x3c834d754db0abb6, 0x3fef06fe0a31b715,
-0x3c74bb6c41732885, 0x3fef05975721b004,
-0x3c85425c11faadf4, 0x3fef0432edeeb2fd,
-0xbc99d7399abb9a8b, 0x3fef02d0cf63eeac,
-0x3c864201e2ac744c, 0x3fef0170fc4cd831,
-0xbc5451d60c6ac9eb, 0x3fef001375752b40,
-0xbc979517a03e2847, 0x3feefeb83ba8ea32,
-0x3c8787a210ceafd9, 0x3feefd5f4fb45e20,
-0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
-0xbc888d1e4629943d, 0x3feefab46484ebb4,
-0xbc800e2a46da4bee, 0x3feef96266e3fa2d,
-0xbc93369c544088b6, 0x3feef812ba4ea77d,
-0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
-0x3c85373ce4eb6dfb, 0x3feef57a577dd72b,
-0xbc87430803972b34, 0x3feef431a2de883b,
-0x3c83adec8265a67f, 0x3feef2eb428335b4,
-0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
-0xbc835388bcac6bc5, 0x3feef06581d3f669,
-0xbc954de30ae02d94, 0x3feeef26231e754a,
-0x3c727cdb4e4b6640, 0x3feeede91be9c811,
-0xbc9907f81b512d8e, 0x3feeecae6d05d866,
-0x3c86c2696a26af35, 0x3feeeb761742d808,
-0xbc94f2487e1c03ec, 0x3feeea401b7140ef,
-0x3c888f6ff06b979a, 0x3feee90c7a61d55b,
-0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
-0xbc89d5efaabc2030, 0x3feee6ac4bcdf3ea,
-0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4,
-0xbc76b8867f91c9d6, 0x3feee4559212ef89,
-0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
-0x3c94c9c0b5157fe6, 0x3feee20853c10f28,
-0x3c79c3bba5562a2f, 0x3feee0e544ede173,
-0xbc62455345b51c8e, 0x3feedfc4976d27fa,
-0x3c859f48a72a4c6d, 0x3feedea64c123422,
-0xbc93331de45477d0, 0x3feedd8a63b0a09b,
-0xbc85a71612e21658, 0x3feedc70df1c5175,
-0xbc95f84d39b39b16, 0x3feedb59bf29743f,
-0xbc9312607a28698a, 0x3feeda4504ac801c,
-0xbc72ba4dc7c4d562, 0x3feed932b07a35df,
-0x3c86421f6f1d24d6, 0x3feed822c367a024,
-0xbc844f25dc02691f, 0x3feed7153e4a136a,
-0xbc58a78f4817895b, 0x3feed60a21f72e2a,
-0xbc888d328eb9b501, 0x3feed5016f44d8f5,
-0xbc9348a6815fce65, 0x3feed3fb2709468a,
-0x3c7f0bec42ddb15a, 0x3feed2f74a1af3f1,
-0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
-0xbc615f0a2b9cd452, 0x3feed0f6d5817663,
-0x3c835c43984d9871, 0x3feecffa3f84b9d4,
-0xbc8c2e465a919e1d, 0x3feecf0018321a1a,
-0x3c4363ed60c2ac11, 0x3feece086061892d,
-0xbc865dfd02bd08f1, 0x3feecd1318eb43ec,
-0xbc632afc8d9473a0, 0x3feecc2042a7d232,
-0xbc8e68cec89b1762, 0x3feecb2fde7006f4,
-0x3c9666093b0664ef, 0x3feeca41ed1d0057,
-0xbc48ae858eb682ca, 0x3feec9566f8827d0,
-0xbc95fc5e44de020e, 0x3feec86d668b3237,
-0x3c5dd71277c0915f, 0x3feec786d3001fe5,
-0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
-0x3c92001325ecd7fb, 0x3feec5c10fa920a1,
-0xbc7ea0148327c42f, 0x3feec4e1e192aed2,
-0x3c65ace6e2870332, 0x3feec4052c5916c4,
-0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
-0xbc9595c55690ffaf, 0x3feec2532feaada6,
-0xbc7a843ad1a88022, 0x3feec17dea6db7d7,
-0xbc8b401ba9fb5199, 0x3feec0ab213d5283,
-0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
-0x3c6df82bf324cc57, 0x3feebf0d073537ca,
-0x3c892ca3bf144e63, 0x3feebe41b817c114,
-0x3c97cae38641c7bb, 0x3feebd78e8bb586b,
-0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
-0x3c62d80c5c4a2b67, 0x3feebbeeccbd7b2a,
-0xbc902c99b04aa8b0, 0x3feebb2d81d8abff,
-0x3c8f39c10d12eaf0, 0x3feeba6eba2e35f0,
-0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
-0xbc80b582d74a55d9, 0x3feeb8f8b804f127,
-0x3c73e34f67e67118, 0x3feeb8417f4531ee,
-0xbc6b4e327ff434ca, 0x3feeb78ccd3deb0d,
-0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
-0xbc592dca38593e20, 0x3feeb62b00da3b14,
-0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef,
-0xbc85daca9994833e, 0x3feeb4d359dfd53d,
-0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
-0xbc980b4321bc6dae, 0x3feeb385df598d78,
-0x3c81bd2888075068, 0x3feeb2e2f4f6ad27,
-0xbc8390afec5241c5, 0x3feeb24298571b06,
-0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
-0x3c8f15cdafe7d586, 0x3feeb1098bed1bdf,
-0xbc896be8ae89ef8f, 0x3feeb070dde910d2,
-0xbc910aa91ae9b67f, 0x3feeafdac1351819,
-0x3c93350518fdd78e, 0x3feeaf4736b527da,
-0x3c957e1b67462375, 0x3feeaeb63f4d854c,
-0xbc88e6ac90348602, 0x3feeae27dbe2c4cf,
-0x3c8124d5051552a7, 0x3feead9c0d59ca07,
-0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
-0xbc3ca103952ecf1f, 0x3feeac8c32824135,
-0xbc91af7f1365c3ac, 0x3feeac0827ff07cc,
-0x3c773345c02a4fd6, 0x3feeab86b5f43d92,
-0x3c9063e1e21c5409, 0x3feeab07dd485429,
-0xbc909d2a0fce20f2, 0x3feeaa8b9ee20d1e,
-0xbc943a3540d1898a, 0x3feeaa11fba87a03,
-0xbc924f2cb4f81746, 0x3feea99af482fc8f,
-0x3c34c7855019c6ea, 0x3feea9268a5946b7,
-0xbc943592a0a9846b, 0x3feea8b4be135acc,
-0xbc951f58ddaa8090, 0x3feea84590998b93,
-0xbc956bc85d444f4f, 0x3feea7d902d47c65,
-0x3c9432e62b64c035, 0x3feea76f15ad2148,
-0x3c914d1e4218319f, 0x3feea707ca0cbf0f,
-0xbc82e1648e50a17c, 0x3feea6a320dceb71,
-0x3c971c93709313f4, 0x3feea6411b078d26,
-0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
-0x3c7f88303b60d222, 0x3feea584fd15612a,
-0x3c95f30eda98a575, 0x3feea52ae6cdf6f4,
-0x3c70125ca18d4b5b, 0x3feea4d3778bc944,
-0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
-0x3c9592ea73798b11, 0x3feea42c91c56acd,
-0x3c917ecda8a72159, 0x3feea3dd1d1929fd,
-0xbc9371d6d7d75739, 0x3feea390532205d8,
-0xbc845378892be9ae, 0x3feea34634ccc320,
-0xbc8ac05fd996f807, 0x3feea2fec30678b7,
-0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7,
-0xbc91f5067d03653a, 0x3feea277e8dcc390,
-0xbc93cedd78565858, 0x3feea23882552225,
-0x3c917339c86ce3ad, 0x3feea1fbcc140be7,
-0xbc85c33fdf910406, 0x3feea1c1c70833f6,
-0xbc77e66065ba2500, 0x3feea18a7420a036,
-0x3c5710aa807e1964, 0x3feea155d44ca973,
-0x3c964c827ee6b49a, 0x3feea123e87bfb7a,
-0x3c81079ab5789604, 0x3feea0f4b19e9538,
-0xbc928311a3c73480, 0x3feea0c830a4c8d4,
-0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
-0x3c882c79e185e981, 0x3feea077541ee718,
-0x3c727df161cd7778, 0x3feea052fa75173e,
-0xbc8b48cea80b043b, 0x3feea0315a736c75,
-0xbc6a12ad8734b982, 0x3feea012750bdabf,
-0xbc4f4863bc8e5180, 0x3fee9ff64b30aa09,
-0x3c93f9924a05b767, 0x3fee9fdcddd47645,
-0x3c954835dd4b7548, 0x3fee9fc62dea2f8a,
-0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
-0xbc8bf41f59b59f8a, 0x3fee9fa10a38cee8,
-0xbc87557939a8b5ef, 0x3fee9f9298593ae5,
-0xbc8f652fde52775c, 0x3fee9f86e7ba9fef,
-0xbc80dc3d54e08851, 0x3fee9f7df9519484,
-0xbc7b0300defbcf98, 0x3fee9f77ce1303f6,
-0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87,
-0xbc89dab646035dc0, 0x3fee9f73c4eaa988,
-0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
-0xbc91f0c230588dde, 0x3fee9f7ad3ef9011,
-0xbc88e67a9006c909, 0x3fee9f8286ead08a,
-0x3c9106450507a28c, 0x3fee9f8d02d50b8f,
-0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
-0xbc9129729a10f3a0, 0x3fee9faa5953c849,
-0x3c86597566977ac8, 0x3fee9fbd35d7cbfd,
-0x3c781a70a5124f67, 0x3fee9fd2df29ce7c,
-0xbc8619321e55e68a, 0x3fee9feb564267c9,
-0x3c941626ea62646d, 0x3feea0069c1a861d,
-0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09,
-0xbc940b9f54365b7c, 0x3feea04597eeba8f,
-0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
-0x3c873455e0e826c1, 0x3feea08fda749e5d,
-0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6,
-0x3c94f006ad874e3e, 0x3feea0e56b7fcf03,
-0xbc7b32dcb94da51d, 0x3feea11473eb0187,
-0xbc8f6d693d0973bb, 0x3feea14652e958aa,
-0xbc92dad3519d7b5b, 0x3feea17b0976cfdb,
-0x3c58c5ee2b7e7848, 0x3feea1b2988fb9ec,
-0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
-0xbc88b25e045d207b, 0x3feea22a4456e7a3,
-0x3c87d51410fd15c2, 0x3feea26a62ff86f0,
-0xbc69cb3314060ca7, 0x3feea2ad5e2850ac,
-0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
-0x3c87a0b15d19e0bb, 0x3feea33bedf2e1b9,
-0xbc760a3629969871, 0x3feea3878491c491,
-0x3c94aa7212bfa73c, 0x3feea3d5fbab091f,
-0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
-0xbc81e688272a8a12, 0x3feea47b8f4abaa9,
-0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9,
-0x3c4ab7b7112ec9d5, 0x3feea52cb0d1736a,
-0xbc9369b6f13b3734, 0x3feea589994cce13,
-0x3c8a1e274eed4476, 0x3feea5e968443d9a,
-0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7,
-0x3c94a533a59324da, 0x3feea6b1bdadb46d,
-0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
-0x3c7a56d2760d087d, 0x3feea785b91e07f1,
-0xbc522cea4f3afa1e, 0x3feea7f4179f5b21,
-0x3c91682c1c6e8b05, 0x3feea86562ab00ec,
-0xbc94d450d872576e, 0x3feea8d99b4492ed,
-0x3c89ea99cf7a9591, 0x3feea950c27004c2,
-0x3c7c88549b958471, 0x3feea9cad931a436,
-0xbc59e57d8f92ff8e, 0x3feeaa47e08e1957,
-0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
-0x3c909b176e05a9cd, 0x3feeab4ac52be8f7,
-0x3c931143962f7877, 0x3feeabd0a478580f,
-0x3c711607f1952c95, 0x3feeac597875c644,
-0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
-0x3c869608f0f86431, 0x3feead74029db01e,
-0x3c93e9e96f112479, 0x3feeae05bad61778,
-0xbc7f1ced15c5c5c0, 0x3feeae9a6bdb5598,
-0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
-0x3c614b97be3f7b4e, 0x3feeafccbc6c19e6,
-0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9,
-0x3c81c1701c359530, 0x3feeb10afc931857,
-0x3c7bf68359f35f44, 0x3feeb1ae99157736,
-0xbc8edb1bf6809287, 0x3feeb2553499284b,
-0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a,
-0xbc8ba58ce7a736d3, 0x3feeb3ab6ccce12c,
-0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
-0xbc93fc025e1db9ce, 0x3feeb50dad829e70,
-0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2,
-0xbc8d737c7d71382e, 0x3feeb67bff148396,
-0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
-0x3c6ae88c43905293, 0x3feeb7f669e2802b,
-0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5,
-0xbc93d1f7661fe51b, 0x3feeb97cf65253d1,
-0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
-0x3c651b68797ffc1c, 0x3feebb0faccf9243,
-0xbc51669428996971, 0x3feebbdd9a7670b3,
-0x3c54579c5ceed70b, 0x3feebcae95cba768,
-0xbc92434322f4f9aa, 0x3feebd829fde4e50,
-0x3c87298413381667, 0x3feebe59b9bddb5b,
-0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2,
-0xbc905000be64e965, 0x3feec01121235681,
-0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
-0xbc89fb12e3454b73, 0x3feec1d4d47f2598,
-0xbc9294f304f166b6, 0x3feec2bb4d53fe0d,
-0x3c7be2a03697693b, 0x3feec3a4dc5a3dd3,
-0x3c71affc2b91ce27, 0x3feec49182a3f090,
-0x3c90622b15810eea, 0x3feec581414380f2,
-0xbc8a1e58414c07d3, 0x3feec674194bb8d5,
-0x3be9a5ecc875d327, 0x3feec76a0bcfc15e,
-0x3c6dd235e10a73bb, 0x3feec86319e32323,
-0x3c88ea486a3350ef, 0x3feec95f4499c647,
-0xbc79740b58a20091, 0x3feeca5e8d07f29e,
-0xbc7a2ee551d4c40f, 0x3feecb60f4424fcb,
-0xbc87c50422622263, 0x3feecc667b5de565,
-0x3c89c31f7e38028b, 0x3feecd6f23701b15,
-0x3c9165830a2b96c2, 0x3feece7aed8eb8bb,
-0xbc5fac13f4e005a3, 0x3feecf89dacfe68c,
-0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
-0x3c7d8aced7162e89, 0x3feed1b1231475f7,
-0xbc903d5cbe27874b, 0x3feed2c980460ad8,
-0xbc848f50cea7269f, 0x3feed3e504f696b1,
-0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
-0x3c821eb9a08a0542, 0x3feed625893523d4,
-0x3c5986178980fce0, 0x3feed74a8af46052,
-0xbc6133a953131cfd, 0x3feed872b8950a73,
-0x3c90cc319cee31d2, 0x3feed99e1330b358,
-0x3c89e95e6f4a0ae4, 0x3feedacc9be14dca,
-0xbc89472975b1f2a5, 0x3feedbfe53c12e59,
-0xbc90260cf07cb311, 0x3feedd333beb0b7e,
-0x3c8469846e735ab3, 0x3feede6b5579fdbf,
-0x3c1bca400a7b939d, 0x3feedfa6a1897fd2,
-0x3c7d8157a34b7e7f, 0x3feee0e521356eba,
-0x3c9140bc34dfc19f, 0x3feee226d59a09ee,
-0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
-0xbc8c9b1da461ab87, 0x3feee4b3e100301e,
-0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774,
-0x3c8c115f23ebea8e, 0x3feee74dcca5a413,
-0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
-0xbc6dcab99f23f84e, 0x3feee9f4a17a4735,
-0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff,
-0x3c60a43e8b7e4bfe, 0x3feeeca868742ee4,
-0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
-0x3c915b1397075f04, 0x3feeef692a8fa8cd,
-0x3c889c2ea41433c7, 0x3feef0ce6c9a8952,
-0xbc839f7a1f04d2b0, 0x3feef236f0cf3f3a,
-0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
-0xbc86a510f31e13e6, 0x3feef511c43bbd62,
-0xbc7274aedac8ff80, 0x3feef68415b749b1,
-0xbc92887ea88e7340, 0x3feef7f9ade433c6,
-0xbc90a40e3da6f640, 0x3feef9728de5593a,
-0xbc6e57ac604759ba, 0x3feefaeeb6ddfc87,
-0x3c85c620ce76df06, 0x3feefc6e29f1c52a,
-0x3c8e6c6db4f83226, 0x3feefdf0e844bfc6,
-0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
-0xbc8d1bf10460dba0, 0x3fef01004b3a7804,
-0xbc8fda52e1b51e41, 0x3fef028cf22749e4,
-0x3c8e5d80813dddfc, 0x3fef041ce8e77680,
-0xbc91eee26b588a35, 0x3fef05b030a1064a,
-0x3c8caff9640f2dcb, 0x3fef0746ca7a67a7,
-0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f,
-0x3c7a77557fd62db3, 0x3fef0a7df9285775,
-0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
-0xbc651ba6128db749, 0x3fef0dc27e2cb5e5,
-0xbc302899507554e5, 0x3fef0f69c3f3a207,
-0xbc7c0ffefdc5e251, 0x3fef111462c95b60,
-0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
-0xbc8b6cd058bfd6fa, 0x3fef1473b0468d30,
-0xbc80dda2d4c0010c, 0x3fef16286141b33d,
-0x3c923759b8aca76d, 0x3fef17e06ff301f4,
-0x3c736eae30af0cb3, 0x3fef199bdd85529c,
-0xbc895498a73dac7d, 0x3fef1b5aab23e61e,
-0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c,
-0x3c851de924583108, 0x3fef1ee26b34e065,
-0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
-0xbc8c5fe4051ba06c, 0x3fef2277b9881650,
-0x3c836909391181d3, 0x3fef244778fafb22,
-0xbc6d1816c0a9ac07, 0x3fef261a9f8630ad,
-0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
-0xbc7af5c67c4e8235, 0x3fef29cb269e601f,
-0xbc811cd7dbdf9547, 0x3fef2ba88988c933,
-0xbc8304ef0045d575, 0x3fef2d89584661a1,
-0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
-0x3c8725f94f910375, 0x3fef31553dfa8313,
-0xbc7ac28b7bef6621, 0x3fef33405751c4db,
-0x3c7b53e99f9191e8, 0x3fef352ee13da7cb,
-0x3c676b2c6c921968, 0x3fef3720dcef9069,
-0xbc810a79e6d7e2b8, 0x3fef39164b994d23,
-0xbc7030587207b9e1, 0x3fef3b0f2e6d1675,
-0x3c840635f6d2a9c0, 0x3fef3d0b869d8f0f,
-0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
-0x3c549eeef9ec910c, 0x3fef410e9be12cb9,
-0xbc8cc734592af7fc, 0x3fef43155b5bab74,
-0xbc8335827ffb9dce, 0x3fef451f95018d17,
-0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
-0x3c645563980ef762, 0x3fef493e7ba2c38c,
-0x3c87752a44f587e8, 0x3fef4b532b08c968,
-0xbc8cd0205eb2aab2, 0x3fef4d6b596f948c,
-0xbc900dae3875a949, 0x3fef4f87080d89f2,
-0xbc8aab80ceab2b4a, 0x3fef51a638197a3c,
-0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6,
-0xbc8f870f40a8ba1b, 0x3fef55ef2158a91f,
-0x3c74a385a63d07a7, 0x3fef5818dcfba487,
-0x3c83c119f18464c5, 0x3fef5a461eec14be,
-0x3c5159d9d908a96e, 0x3fef5c76e862e6d3,
-0xbc5a628c2be4e7c7, 0x3fef5eab3a99745b,
-0xbc82919e2040220f, 0x3fef60e316c98398,
-0xbc72550d76be719a, 0x3fef631e7e2d479d,
-0x3c8c254d16117a68, 0x3fef655d71ff6075,
-0xbc82090274667d12, 0x3fef679ff37adb4a,
-0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
-0x3c75f7d28150cac4, 0x3fef6c2fa45c4dfd,
-0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315,
-0x3c890de9296f4cd1, 0x3fef70cd9ab294e4,
-0x3c843a59ac016b4b, 0x3fef7321f301b460,
-0x3c832ff9978b34bc, 0x3fef7579e065807d,
-0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658,
-0xbc7303b63dda1980, 0x3fef7a347f63c159,
-0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
-0xbc81f2ba385f2f95, 0x3fef7efd81a2ece1,
-0xbc63e8e3eab2cbb4, 0x3fef81676b197d17,
-0x3c768d9144ae12fc, 0x3fef83d4f11f8220,
-0xbc892ab93b470dc9, 0x3fef864614f5a129,
-0x3c853687f542403b, 0x3fef88bad7dcee90,
-0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12,
-0xbc736ed2de40b407, 0x3fef8daf3fe592e8,
-0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
-0xbc614ef56c770f3b, 0x3fef92b2334ac7ee,
-0xbc776caa4c2ff1cf, 0x3fef953924676d76,
-0x3c8df7d1353d8e88, 0x3fef97c3bc24e350,
-0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
-0xbc850bed64091b8a, 0x3fef9ce3e4933c7e,
-0xbc81d5fc525d9940, 0x3fef9f7977cdb740,
-0x3c89d852381c317f, 0x3fefa212b6bc3181,
-0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
-0x3c68a00e3cca04c4, 0x3fefa7503ccd2be5,
-0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e,
-0xbc5a1f25ce94cae7, 0x3fefac9c80faa594,
-0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
-0xbc6fb5f3ee307976, 0x3fefb1f78d802dc2,
-0x3c8269947c2bed4a, 0x3fefb4aaa2188510,
-0x3c737e8ae802b851, 0x3fefb7616ca06dd6,
-0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
-0x3c875119560e34af, 0x3fefbcda28a52e59,
-0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a,
-0xbc7431c3840929c6, 0x3fefc261cbdf5be7,
-0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
-0xbc8cb472d2e86b99, 0x3fefc7f860a70c22,
-0xbc69fa74878ba7c7, 0x3fefcac948dd7274,
-0x3c83f5df2fde16a8, 0x3fefcd9df15b82ac,
-0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
-0x3c8eef18336b62e3, 0x3fefd35288633625,
-0x3c901f3a75ee0efe, 0x3fefd632798844f8,
-0x3c80d23f87b50a2a, 0x3fefd916302bd526,
-0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
-0x3c8302dee657c8e6, 0x3fefdee8f32a4b45,
-0xbc516a9ce6ed84fa, 0x3fefe1d802243c89,
-0xbc7b0caa080df170, 0x3fefe4cadbdac61d,
-0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
-0x3c7617a9f2fd24e5, 0x3fefeabbf4c0ba54,
-0xbc699c7db2effc76, 0x3fefedba3692d514,
-0x3c75f103b8fd5ca7, 0x3feff0bc4866e8ad,
-0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
-0x3c8e70b094fa075a, 0x3feff6cbe15f6314,
-0x3c64b458677f9840, 0x3feff9d96b2a23d9,
-0xbc72ec9a3e5d680a, 0x3feffceaca4391b6,
-#endif
-},
-};
diff --git a/pl/math/expf.c b/pl/math/expf.c
deleted file mode 100644
index cd3cfa925c64..000000000000
--- a/pl/math/expf.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Single-precision e^x function.
- *
- * Copyright (c) 2017-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include <math.h>
-#include <stdint.h>
-#include "math_config.h"
-
-/*
-EXPF_TABLE_BITS = 5
-EXPF_POLY_ORDER = 3
-
-ULP error: 0.502 (nearest rounding.)
-Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.)
-Wrong count: 170635 (all nearest rounding wrong results with fma.)
-Non-nearest ULP error: 1 (rounded ULP error)
-*/
-
-#define N (1 << EXPF_TABLE_BITS)
-#define InvLn2N __expf_data.invln2_scaled
-#define T __expf_data.tab
-#define C __expf_data.poly_scaled
-
-static inline uint32_t
-top12 (float x)
-{
- return asuint (x) >> 20;
-}
-
-float
-optr_aor_exp_f32 (float x)
-{
- uint32_t abstop;
- uint64_t ki, t;
- /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */
- double_t kd, xd, z, r, r2, y, s;
-
- xd = (double_t) x;
- abstop = top12 (x) & 0x7ff;
- if (unlikely (abstop >= top12 (88.0f)))
- {
- /* |x| >= 88 or x is nan. */
- if (asuint (x) == asuint (-INFINITY))
- return 0.0f;
- if (abstop >= top12 (INFINITY))
- return x + x;
- if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */
- return __math_oflowf (0);
- if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */
- return __math_uflowf (0);
- }
-
- /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */
- z = InvLn2N * xd;
-
- /* Round and convert z to int; the result is in [-150*N, 128*N].
-    Ideally nearest-int rounding is used; otherwise the magnitude of r
-    can be bigger, which gives a larger approximation error. */
- kd = round (z);
- ki = lround (z);
- r = z - kd;
-
- /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
- t = T[ki % N];
- t += ki << (52 - EXPF_TABLE_BITS);
- s = asdouble (t);
- z = C[0] * r + C[1];
- r2 = r * r;
- y = C[2] * r + 1;
- y = z * r2 + y;
- y = y * s;
- return eval_as_float (y);
-}
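
The exponent trick in the deleted expf above is compact: since ki = k*N + i, adding ki << (52 - EXPF_TABLE_BITS) to the looked-up table bits restores the fractional index i and folds the integer part k straight into the double's exponent field, so s = 2^(k + i/N) falls out of a single integer add. Below is a minimal standalone sketch of the same idea; the names TBITS/TN, the exp2-built table, and the driver are illustrative assumptions, not library code.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TBITS 5 /* matches EXPF_TABLE_BITS above */
#define TN (1 << TBITS)

static uint64_t tab[TN];

static uint64_t as_u64 (double d) { uint64_t u; memcpy (&u, &d, sizeof u); return u; }
static double as_f64 (uint64_t u) { double d; memcpy (&d, &u, sizeof d); return d; }

int main (void)
{
  /* tab[i] holds the bits of 2^(i/N) with i pre-subtracted from the
     exponent field, so a later add of ki << (52 - TBITS) restores i and
     contributes k = ki / N to the exponent.  */
  for (uint64_t i = 0; i < TN; i++)
    tab[i] = as_u64 (exp2 ((double) i / TN)) - (i << (52 - TBITS));

  double x = 1.2345;
  double z = x * TN / log (2.0); /* x*N/ln2 = k + r */
  double kd = round (z);
  uint64_t ki = (uint64_t) (int64_t) kd;
  double s = as_f64 (tab[ki % TN] + (ki << (52 - TBITS)));
  /* s reconstructs 2^(kd/N); compare against libm's exp2.  */
  printf ("s = %a, exp2(k/N) = %a\n", s, exp2 (kd / TN));
  return 0;
}
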
diff --git a/pl/math/expm1_data.c b/pl/math/expm1_data.c
deleted file mode 100644
index ff7426b90135..000000000000
--- a/pl/math/expm1_data.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Coefficients for double-precision e^x - 1 function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Generated using fpminimax; see tools/expm1.sollya for details. */
-const double __expm1_poly[] = {0x1p-1,
- 0x1.5555555555559p-3,
- 0x1.555555555554bp-5,
- 0x1.111111110f663p-7,
- 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13,
- 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19,
- 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26,
- 0x1.1f143d060a28ap-29};
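
For orientation, the deleted coefficients follow the Taylor pattern 1/2, 1/6, 1/24, ...: they approximate P in expm1(x) ~= x + x^2 * P(x) on a reduced interval. A minimal sketch of the evaluation follows; the helper name and the driver are assumptions, and the library's own expm1 routines add range reduction and special-case handling on top of this.

#include <math.h>
#include <stdio.h>

static const double poly[11]
    = { 0x1p-1,		   0x1.5555555555559p-3,  0x1.555555555554bp-5,
	0x1.111111110f663p-7,  0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
	0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
	0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29 };

/* Horner evaluation of expm1(x) ~= x + x^2 * P(x); only meaningful for
   small |x| (the library reduces to roughly [-ln2/2, ln2/2] first).  */
static double expm1_poly (double x)
{
  double p = poly[10];
  for (int i = 9; i >= 0; i--)
    p = p * x + poly[i];
  return x + x * x * p;
}

int main (void)
{
  double x = 0.1;
  printf ("poly: %.17g  libm: %.17g\n", expm1_poly (x), expm1 (x));
  return 0;
}
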
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
deleted file mode 100644
index f886e7f8c07a..000000000000
--- a/pl/math/include/mathlib.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Public API.
- *
- * Copyright (c) 2015-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef _MATHLIB_H
-#define _MATHLIB_H
-
-float acosf (float);
-float acoshf (float);
-float asinf (float);
-float asinhf (float);
-float atan2f (float, float);
-float atanf (float);
-float atanhf (float);
-float cbrtf (float);
-float coshf (float);
-float cospif (float);
-float erfcf (float);
-float erff (float);
-float erfinvf (float);
-float exp10f (float);
-float expm1f (float);
-float log10f (float);
-float log1pf (float);
-float sinhf (float);
-float sinpif (float);
-float tanf (float);
-float tanhf (float);
-
-double acos (double);
-double acosh (double);
-double asin (double);
-double asinh (double);
-double atan (double);
-double atan2 (double, double);
-double atanh (double);
-double cbrt (double);
-double cosh (double);
-double cospi (double);
-double erfc (double);
-double erfinv (double);
-double exp10 (double);
-double expm1 (double);
-double log10 (double);
-double log1p (double);
-double sinh (double);
-double sinpi (double);
-double tanh (double);
-
-long double cospil (long double);
-long double erfinvl (long double);
-long double exp10l (long double);
-long double sinpil (long double);
-
-#if __aarch64__
-# if __GNUC__ >= 5
-typedef __Float32x4_t __f32x4_t;
-typedef __Float64x2_t __f64x2_t;
-# elif __clang_major__ * 100 + __clang_minor__ >= 305
-typedef __attribute__ ((__neon_vector_type__ (4))) float __f32x4_t;
-typedef __attribute__ ((__neon_vector_type__ (2))) double __f64x2_t;
-# else
-# error Unsupported compiler
-# endif
-
-# if __GNUC__ >= 9 || __clang_major__ >= 8
-# define __vpcs __attribute__ ((__aarch64_vector_pcs__))
-
-typedef struct __f32x4x2_t
-{
- __f32x4_t val[2];
-} __f32x4x2_t;
-
-typedef struct __f64x2x2_t
-{
- __f64x2_t val[2];
-} __f64x2x2_t;
-
-/* Vector functions following the vector PCS using ABI names. */
-__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
-__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t);
-__vpcs __f32x4x2_t _ZGVnN4v_cexpif (__f32x4_t);
-__vpcs __f64x2x2_t _ZGVnN2v_cexpi (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_cospif (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_cospi (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_erfinvf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_erfinv (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
-__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
-__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
-__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_sinpif (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_sinpi (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
-__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t);
-__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t);
-__vpcs void _ZGVnN4vl4l4_sincosf (__f32x4_t, __f32x4_t *, __f32x4_t *);
-__vpcs void _ZGVnN2vl8l8_sincos (__f64x2_t, __f64x2_t *, __f64x2_t *);
-
-# endif
-
-# if WANT_SVE_MATH
-# include <arm_sve.h>
-svfloat32_t _ZGVsMxv_acoshf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_acosh (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_acosf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_acos (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_asinhf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_asinh (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_asinf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_asin (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_atanhf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_atanh (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t);
-svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t);
-svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_cbrtf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_cbrt (svfloat64_t, svbool_t);
-svfloat32x2_t _ZGVsMxv_cexpif (svfloat32_t, svbool_t);
-svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_coshf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_cosh (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
-svfloat32_t _ZGVsMxv_cospif (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t);
-svfloat64_t _ZGVsMxv_cospi (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t);
-svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_erfcf (svfloat32_t, svbool_t);
-svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_exp (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_exp10f (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_exp10 (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_exp2f (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_exp2 (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_expm1f (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_expm1 (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxvv_hypotf (svfloat32_t, svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxvv_hypot (svfloat64_t, svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_log1pf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_log1p (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxvv_powi (svfloat32_t, svint32_t, svbool_t);
-svfloat64_t _ZGVsMxvv_powk (svfloat64_t, svint64_t, svbool_t);
-svfloat32_t _ZGVsMxvv_powf (svfloat32_t, svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxvv_pow (svfloat64_t, svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_sinhf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_sinh (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t);
-svfloat32_t _ZGVsMxv_sinpif (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t);
-svfloat64_t _ZGVsMxv_sinpi (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_tanhf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_tanh (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t);
-svfloat64_t _ZGVsMxv_tan (svfloat64_t, svbool_t);
-void _ZGVsMxvl4l4_sincosf (svfloat32_t, float *, float *, svbool_t);
-void _ZGVsMxvl8l8_sincos (svfloat64_t, double *, double *, svbool_t);
-# endif
-
-#endif
-
-#endif
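
The _ZGV* declarations in the deleted header follow the AArch64 vector function ABI: in _ZGVnN4v_log10f, 'n' selects AdvSIMD, 'N4' means unmasked with four lanes, and each 'v' is one vector argument. A hedged example of calling one such routine directly is below; it assumes linking against a library that provides the symbol, and the driver itself is illustrative only.

#if __aarch64__
#include <arm_neon.h>
#include <stdio.h>

__attribute__ ((__aarch64_vector_pcs__)) float32x4_t _ZGVnN4v_log10f (float32x4_t);

int main (void)
{
  float in[4] = { 1.0f, 10.0f, 100.0f, 1000.0f };
  float out[4];
  /* One call computes all four lanes under the vector PCS.  */
  vst1q_f32 (out, _ZGVnN4v_log10f (vld1q_f32 (in)));
  for (int i = 0; i < 4; i++)
    printf ("log10f(%g) = %g\n", in[i], out[i]);
  return 0;
}
#endif
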
diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h
deleted file mode 100644
index 3a3407e337b8..000000000000
--- a/pl/math/include/pl_test.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * PL macros to aid testing. This version of the file is used when building
- * the routines, not the tests; separate definitions, which emit test
- * parameters, are found in test/pl_test.h.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-/* Emit max ULP threshold - silenced for building the routine. */
-#define PL_TEST_ULP(f, l)
-
-/* Emit the routine name if e == 1 and f is expected to correctly trigger fenv
-   exceptions. e allows the declaration to be emitted conditionally on certain
-   build flags; defer expansion by one pass so that those flags themselves are
-   expanded properly. */
-#define PL_TEST_EXPECT_FENV(f, e)
-#define PL_TEST_EXPECT_FENV_ALWAYS(f)
-
-#define PL_TEST_INTERVAL(f, lo, hi, n)
-#define PL_TEST_SYM_INTERVAL(f, lo, hi, n)
-#define PL_TEST_INTERVAL_C(f, lo, hi, n, c)
-#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c)
-#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n)
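
Assumed usage, inferred from the macro shapes in the deleted header: a routine source annotates itself with lines like the ones below (after including this header or test/pl_test.h). With this header the annotations compile to nothing; building against test/pl_test.h instead turns the same lines into emitted test parameters. The routine name and bounds here are illustrative only.

PL_TEST_ULP (optr_aor_log_f32, 0.82)
PL_TEST_EXPECT_FENV_ALWAYS (optr_aor_log_f32)
PL_TEST_INTERVAL (optr_aor_log_f32, 0, 0xffff0000, 10000)
PL_TEST_INTERVAL (optr_aor_log_f32, 0x1p-4, 0x1p4, 500000)
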
diff --git a/pl/math/log.c b/pl/math/log.c
deleted file mode 100644
index 40b0441d981d..000000000000
--- a/pl/math/log.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Double-precision log(x) function.
- *
- * Copyright (c) 2018-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include <float.h>
-#include <math.h>
-#include <stdint.h>
-#include "math_config.h"
-
-#define T __log_data.tab
-#define T2 __log_data.tab2
-#define B __log_data.poly1
-#define A __log_data.poly
-#define Ln2hi __log_data.ln2hi
-#define Ln2lo __log_data.ln2lo
-#define N (1 << LOG_TABLE_BITS)
-#define OFF 0x3fe6000000000000
-
-/* Top 16 bits of a double. */
-static inline uint32_t
-top16 (double x)
-{
- return asuint64 (x) >> 48;
-}
-
-double
-optr_aor_log_f64 (double x)
-{
- /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */
- double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo;
- uint64_t ix, iz, tmp;
- uint32_t top;
- int k, i;
-
- ix = asuint64 (x);
- top = top16 (x);
-
-#if LOG_POLY1_ORDER == 10 || LOG_POLY1_ORDER == 11
-#define LO asuint64 (1.0 - 0x1p-5)
-#define HI asuint64 (1.0 + 0x1.1p-5)
-#elif LOG_POLY1_ORDER == 12
-#define LO asuint64 (1.0 - 0x1p-4)
-#define HI asuint64 (1.0 + 0x1.09p-4)
-#endif
- if (unlikely (ix - LO < HI - LO))
- {
- /* Handle close to 1.0 inputs separately. */
- /* Fix sign of zero with downward rounding when x==1. */
- if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0)))
- return 0;
- r = x - 1.0;
- r2 = r * r;
- r3 = r * r2;
-#if LOG_POLY1_ORDER == 10
- /* Worst-case error is around 0.516 ULP. */
- y = r3
- * (B[1] + r * B[2] + r2 * B[3]
- + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8])));
- w = B[0] * r2; /* B[0] == -0.5. */
- hi = r + w;
- y += r - hi + w;
- y += hi;
-#elif LOG_POLY1_ORDER == 11
- /* Worst-case error is around 0.516 ULP. */
- y = r3
- * (B[1] + r * B[2]
- + r2
- * (B[3] + r * B[4] + r2 * B[5]
- + r3 * (B[6] + r * B[7] + r2 * B[8] + r3 * B[9])));
- w = B[0] * r2; /* B[0] == -0.5. */
- hi = r + w;
- y += r - hi + w;
- y += hi;
-#elif LOG_POLY1_ORDER == 12
- y = r3
- * (B[1] + r * B[2] + r2 * B[3]
- + r3
- * (B[4] + r * B[5] + r2 * B[6]
- + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10])));
-#if N <= 64
- /* Worst-case error is around 0.532 ULP. */
- w = B[0] * r2; /* B[0] == -0.5. */
- hi = r + w;
- y += r - hi + w;
- y += hi;
-#else
- /* Worst-case error is around 0.507 ULP. */
- w = r * 0x1p27;
- double_t rhi = r + w - w;
- double_t rlo = r - rhi;
- w = rhi * rhi * B[0]; /* B[0] == -0.5. */
- hi = r + w;
- lo = r - hi + w;
- lo += B[0] * rlo * (rhi + r);
- y += lo;
- y += hi;
-#endif
-#endif
- return eval_as_double (y);
- }
- if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010))
- {
- /* x < 0x1p-1022 or inf or nan. */
- if (ix * 2 == 0)
- return __math_divzero (1);
- if (ix == asuint64 (INFINITY)) /* log(inf) == inf. */
- return x;
- if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0)
- return __math_invalid (x);
- /* x is subnormal, normalize it. */
- ix = asuint64 (x * 0x1p52);
- ix -= 52ULL << 52;
- }
-
- /* x = 2^k z, where z is in range [OFF, 2*OFF) and exact.
-    The range is split into N subintervals.
-    The ith subinterval contains z, and c is near its center. */
- tmp = ix - OFF;
- i = (tmp >> (52 - LOG_TABLE_BITS)) % N;
- k = (int64_t) tmp >> 52; /* arithmetic shift */
- iz = ix - (tmp & 0xfffULL << 52);
- invc = T[i].invc;
- logc = T[i].logc;
- z = asdouble (iz);
-
- /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- /* r ~= z/c - 1, |r| < 1/(2*N). */
-#if HAVE_FAST_FMA
- /* rounding error: 0x1p-55/N. */
- r = fma (z, invc, -1.0);
-#else
- /* rounding error: 0x1p-55/N + 0x1p-66. */
- r = (z - T2[i].chi - T2[i].clo) * invc;
-#endif
- kd = (double_t) k;
-
- /* hi + lo = r + log(c) + k*Ln2. */
- w = kd * Ln2hi + logc;
- hi = w + r;
- lo = w - hi + r + kd * Ln2lo;
-
- /* log(x) = lo + (log1p(r) - r) + hi. */
- r2 = r * r; /* rounding error: 0x1p-54/N^2. */
- /* Worst case error if |y| > 0x1p-5:
- 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma)
- Worst case error if |y| > 0x1p-4:
- 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma). */
-#if LOG_POLY_ORDER == 6
- y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi;
-#elif LOG_POLY_ORDER == 7
- y = lo
- + r2
- * (A[0] + r * A[1] + r2 * (A[2] + r * A[3])
- + r2 * r2 * (A[4] + r * A[5]))
- + hi;
-#endif
- return eval_as_double (y);
-}
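
The bit-level reduction in the deleted log above ("x = 2^k z ... split into N subintervals") is worth seeing in isolation. Below is a standalone sketch of just that step, assuming LOG_TABLE_BITS == 7 as in the N == 128 configuration of the data file; the constant names and the driver are illustrative assumptions.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TABLE_BITS 7 /* assumed LOG_TABLE_BITS */
#define TN (1 << TABLE_BITS)
#define OFF 0x3fe6000000000000ULL

int main (void)
{
  double x = 12.5;
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint64_t tmp = ix - OFF;
  int i = (tmp >> (52 - TABLE_BITS)) % TN;     /* subinterval index */
  int k = (int64_t) tmp >> 52;                 /* exponent, arithmetic shift */
  uint64_t iz = ix - (tmp & (0xfffULL << 52)); /* bits of z = x * 2^-k */
  double z;
  memcpy (&z, &iz, sizeof z);
  /* z lands in [0x1.6p-1, 0x1.6p0) and 2^k * z recovers x exactly.  */
  printf ("x = %g -> k = %d, i = %d, z = %g, 2^k*z = %g\n",
	  x, k, i, z, ldexp (z, k));
  return 0;
}
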
diff --git a/pl/math/log1p_data.c b/pl/math/log1p_data.c
deleted file mode 100644
index 6168a0c9a214..000000000000
--- a/pl/math/log1p_data.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Data used in double-precision log(1+x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Polynomial coefficients generated using the Remez algorithm; see
-   log1p.sollya for details. */
-const struct log1p_data __log1p_data = {
- .coeffs = {-0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
- 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
- -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
- 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
- -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
- 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
- -0x1.cfa7385bdb37ep-6}};
diff --git a/pl/math/log_data.c b/pl/math/log_data.c
deleted file mode 100644
index 34715e5036a3..000000000000
--- a/pl/math/log_data.c
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Data for log.
- *
- * Copyright (c) 2018-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-#define N (1 << LOG_TABLE_BITS)
-
-const struct log_data __log_data = {
-.ln2hi = 0x1.62e42fefa3800p-1,
-.ln2lo = 0x1.ef35793c76730p-45,
-.poly1 = {
-#if LOG_POLY1_ORDER == 10
-// relative error: 0x1.32eccc6p-62
-// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval)
--0x1p-1,
-0x1.55555555554e5p-2,
--0x1.0000000000af2p-2,
-0x1.9999999bbe436p-3,
--0x1.55555537f9cdep-3,
-0x1.24922fc8127cfp-3,
--0x1.0000b7d6bb612p-3,
-0x1.c806ee1ddbcafp-4,
--0x1.972335a9c2d6ep-4,
-#elif LOG_POLY1_ORDER == 11
-// relative error: 0x1.52c8b708p-68
-// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval)
--0x1p-1,
-0x1.5555555555555p-2,
--0x1.ffffffffffea9p-3,
-0x1.999999999c4d4p-3,
--0x1.55555557f5541p-3,
-0x1.249248fbe33e4p-3,
--0x1.ffffc9a3c825bp-4,
-0x1.c71e1f204435dp-4,
--0x1.9a7f26377d06ep-4,
-0x1.71c30cf8f7364p-4,
-#elif LOG_POLY1_ORDER == 12
-// relative error: 0x1.c04d76cp-63
-// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval)
--0x1p-1,
-0x1.5555555555577p-2,
--0x1.ffffffffffdcbp-3,
-0x1.999999995dd0cp-3,
--0x1.55555556745a7p-3,
-0x1.24924a344de3p-3,
--0x1.fffffa4423d65p-4,
-0x1.c7184282ad6cap-4,
--0x1.999eb43b068ffp-4,
-0x1.78182f7afd085p-4,
--0x1.5521375d145cdp-4,
-#endif
-},
-.poly = {
-#if N == 64 && LOG_POLY_ORDER == 7
-// relative error: 0x1.906eb8ap-58
-// abs error: 0x1.d2cad5a8p-67
-// in -0x1.fp-8 0x1.fp-8
--0x1.0000000000027p-1,
-0x1.555555555556ap-2,
--0x1.fffffff0440bap-3,
-0x1.99999991906c3p-3,
--0x1.555c8d7e8201ep-3,
-0x1.24978c59151fap-3,
-#elif N == 128 && LOG_POLY_ORDER == 6
-// relative error: 0x1.926199e8p-56
-// abs error: 0x1.882ff33p-65
-// in -0x1.fp-9 0x1.fp-9
--0x1.0000000000001p-1,
-0x1.555555551305bp-2,
--0x1.fffffffeb459p-3,
-0x1.999b324f10111p-3,
--0x1.55575e506c89fp-3,
-#elif N == 128 && LOG_POLY_ORDER == 7
-// relative error: 0x1.649fc4bp-64
-// abs error: 0x1.c3b5769p-74
-// in -0x1.fp-9 0x1.fp-9
--0x1.0000000000001p-1,
-0x1.5555555555556p-2,
--0x1.fffffffea1a8p-3,
-0x1.99999998e9139p-3,
--0x1.555776801b968p-3,
-0x1.2493c29331a5cp-3,
-#endif
-},
-/* Algorithm:
-
- x = 2^k z
- log(x) = k ln2 + log(c) + log(z/c)
- log(z/c) = poly(z/c - 1)
-
-where z is in [1.6p-1; 1.6p0], an interval that is split into N subintervals.
-If z falls into the ith one, the table entries are computed as
-
- tab[i].invc = 1/c
- tab[i].logc = (double)log(c)
- tab2[i].chi = (double)c
- tab2[i].clo = (double)(c - (double)c)
-
-where c is near the center of the subinterval and is chosen by trying +-2^29
-floating point invc candidates around 1/center and selecting one for which
-
- 1) the rounding error in 0x1.8p9 + logc is 0,
- 2) the rounding error in z - chi - clo is < 0x1p-66 and
- 3) the rounding error in (double)log(c) is minimized (< 0x1p-66).
-
-Note: 1) ensures that k*ln2hi + logc can be computed without rounding error,
-2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to
-a single rounding error when there is no fast fma for z*invc - 1, 3) ensures
-that logc + poly(z/c - 1) has small error; however, near x == 1, when
-|log(x)| < 0x1p-4, this is not enough, so that case is handled separately. */
-.tab = {
-#if N == 64
-{0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2},
-{0x1.6e1f769340dc9p+0, -0x1.6e60ee0ecb000p-2},
-{0x1.6a13ccc8f195cp+0, -0x1.63002fdbf6000p-2},
-{0x1.661ec72e86f3ap+0, -0x1.57bf76c597000p-2},
-{0x1.623fa6c447b16p+0, -0x1.4c9e07f0d2000p-2},
-{0x1.5e75bbca31702p+0, -0x1.419b42f027000p-2},
-{0x1.5ac05655adb10p+0, -0x1.36b67660e6000p-2},
-{0x1.571ed3e940191p+0, -0x1.2bef0839e4800p-2},
-{0x1.539094ac0fbbfp+0, -0x1.21445727cb000p-2},
-{0x1.5015007e7fc42p+0, -0x1.16b5ca3c3d000p-2},
-{0x1.4cab877c31cf9p+0, -0x1.0c42d3805f800p-2},
-{0x1.49539e76a88d3p+0, -0x1.01eae61b60800p-2},
-{0x1.460cbc12211dap+0, -0x1.ef5adb9fb0000p-3},
-{0x1.42d6624debe3ap+0, -0x1.db13daab99000p-3},
-{0x1.3fb0144f0d462p+0, -0x1.c6ffbe896e000p-3},
-{0x1.3c995a1f9a9b4p+0, -0x1.b31d84722d000p-3},
-{0x1.3991c23952500p+0, -0x1.9f6c3cf6eb000p-3},
-{0x1.3698df35eaa14p+0, -0x1.8beafe7f13000p-3},
-{0x1.33ae463091760p+0, -0x1.7898db878d000p-3},
-{0x1.30d190aae3d72p+0, -0x1.6574efe4ec000p-3},
-{0x1.2e025c9203c89p+0, -0x1.527e620845000p-3},
-{0x1.2b404a7244988p+0, -0x1.3fb457d798000p-3},
-{0x1.288b01dc19544p+0, -0x1.2d1615a077000p-3},
-{0x1.25e2268085f69p+0, -0x1.1aa2b431e5000p-3},
-{0x1.23456812abb74p+0, -0x1.08598f1d2b000p-3},
-{0x1.20b4703174157p+0, -0x1.ec738fee40000p-4},
-{0x1.1e2ef308b4e9bp+0, -0x1.c885768862000p-4},
-{0x1.1bb4a36b70a3fp+0, -0x1.a4e75b6a46000p-4},
-{0x1.194538e960658p+0, -0x1.8197efba9a000p-4},
-{0x1.16e0692a10ac8p+0, -0x1.5e95ad734e000p-4},
-{0x1.1485f1ba1568bp+0, -0x1.3bdf67117c000p-4},
-{0x1.12358e123ed6fp+0, -0x1.1973b744f0000p-4},
-{0x1.0fef01de37c8dp+0, -0x1.eea33446bc000p-5},
-{0x1.0db20b82be414p+0, -0x1.aaef4ab304000p-5},
-{0x1.0b7e6f67f69b3p+0, -0x1.67c962fd2c000p-5},
-{0x1.0953f342fc108p+0, -0x1.252f29acf8000p-5},
-{0x1.0732604ec956bp+0, -0x1.c63d19e9c0000p-6},
-{0x1.051980117f9b0p+0, -0x1.432ab6a388000p-6},
-{0x1.03091aa6810f1p+0, -0x1.8244357f50000p-7},
-{0x1.01010152cf066p+0, -0x1.0080a711c0000p-8},
-{0x1.fc07ef6b6e30bp-1, 0x1.fe03018e80000p-8},
-{0x1.f4465aa1024afp-1, 0x1.7b91986450000p-6},
-{0x1.ecc07a8fd3f5ep-1, 0x1.39e88608c8000p-5},
-{0x1.e573ad856b537p-1, 0x1.b42dc6e624000p-5},
-{0x1.de5d6dc7b8057p-1, 0x1.165372ec20000p-4},
-{0x1.d77b6498bddf7p-1, 0x1.51b07a0170000p-4},
-{0x1.d0cb580315c0fp-1, 0x1.8c3465c7ea000p-4},
-{0x1.ca4b30d1cf449p-1, 0x1.c5e544a290000p-4},
-{0x1.c3f8ef4810d8ep-1, 0x1.fec91aa0a6000p-4},
-{0x1.bdd2b8b311f44p-1, 0x1.1b72acdc5c000p-3},
-{0x1.b7d6c2eeac054p-1, 0x1.371fc65a98000p-3},
-{0x1.b20363474c8f5p-1, 0x1.526e61c1aa000p-3},
-{0x1.ac570165eeab1p-1, 0x1.6d60ffc240000p-3},
-{0x1.a6d019f331df4p-1, 0x1.87fa08a013000p-3},
-{0x1.a16d3ebc9e3c3p-1, 0x1.a23bc630c3000p-3},
-{0x1.9c2d14567ef45p-1, 0x1.bc286a3512000p-3},
-{0x1.970e4efae9169p-1, 0x1.d5c2195697000p-3},
-{0x1.920fb3bd0b802p-1, 0x1.ef0ae132d3000p-3},
-{0x1.8d3018b58699ap-1, 0x1.040259974e000p-2},
-{0x1.886e5ff170ee6p-1, 0x1.1058bd40e2000p-2},
-{0x1.83c977ad35d27p-1, 0x1.1c898c1137800p-2},
-{0x1.7f405ed16c520p-1, 0x1.2895a3e65b000p-2},
-{0x1.7ad220d0335c4p-1, 0x1.347dd8f6bd000p-2},
-{0x1.767dce53474fdp-1, 0x1.4043083cb3800p-2},
-#elif N == 128
-{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2},
-{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2},
-{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2},
-{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2},
-{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2},
-{0x1.69147332f0cbap+0, -0x1.602d076180000p-2},
-{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2},
-{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2},
-{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2},
-{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2},
-{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2},
-{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2},
-{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2},
-{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2},
-{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2},
-{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2},
-{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2},
-{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2},
-{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2},
-{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2},
-{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2},
-{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2},
-{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2},
-{0x1.4880524d48434p+0, -0x1.feb224586f000p-3},
-{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3},
-{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3},
-{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3},
-{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3},
-{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3},
-{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3},
-{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3},
-{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3},
-{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3},
-{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3},
-{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3},
-{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3},
-{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3},
-{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3},
-{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3},
-{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3},
-{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3},
-{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3},
-{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3},
-{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3},
-{0x1.293726014b530p+0, -0x1.31b996b490000p-3},
-{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3},
-{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3},
-{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3},
-{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3},
-{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3},
-{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4},
-{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4},
-{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4},
-{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4},
-{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4},
-{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4},
-{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4},
-{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4},
-{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4},
-{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4},
-{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4},
-{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4},
-{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4},
-{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4},
-{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5},
-{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5},
-{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5},
-{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5},
-{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5},
-{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5},
-{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5},
-{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5},
-{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6},
-{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6},
-{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6},
-{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6},
-{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7},
-{0x1.02865137932a9p+0, -0x1.419355daa0000p-7},
-{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8},
-{0x1.008040614b195p+0, -0x1.0040979240000p-9},
-{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9},
-{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7},
-{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6},
-{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6},
-{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5},
-{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5},
-{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5},
-{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5},
-{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4},
-{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4},
-{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4},
-{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4},
-{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4},
-{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4},
-{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4},
-{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4},
-{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4},
-{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3},
-{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3},
-{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3},
-{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3},
-{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3},
-{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3},
-{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3},
-{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3},
-{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3},
-{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3},
-{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3},
-{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3},
-{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3},
-{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3},
-{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3},
-{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3},
-{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3},
-{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3},
-{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3},
-{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2},
-{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2},
-{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2},
-{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2},
-{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2},
-{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2},
-{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2},
-{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2},
-{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2},
-{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2},
-{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2},
-{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2},
-#endif
-},
-#if !HAVE_FAST_FMA
-.tab2 = {
-#if N == 64
-{0x1.61ffff94c4fecp-1, -0x1.9fe4fc998f325p-56},
-{0x1.66000020377ddp-1, 0x1.e804c7a9519f2p-55},
-{0x1.6a00004c41678p-1, 0x1.902c675d9ecfep-55},
-{0x1.6dffff7384f87p-1, -0x1.2fd6b95e55043p-56},
-{0x1.720000b37216ep-1, 0x1.802bc8d437043p-55},
-{0x1.75ffffbeb3c9dp-1, 0x1.6047ad0a0d4e4p-57},
-{0x1.7a0000628daep-1, -0x1.e00434b49313dp-56},
-{0x1.7dffffd7abd1ap-1, -0x1.6015f8a083576p-56},
-{0x1.81ffffdf40c54p-1, 0x1.7f54bf76a42c9p-57},
-{0x1.860000f334e11p-1, 0x1.60054cb5344d7p-56},
-{0x1.8a0001238aca7p-1, 0x1.c03c9bd132f55p-57},
-{0x1.8dffffb81d212p-1, -0x1.001e519f2764fp-55},
-{0x1.92000086adc7cp-1, 0x1.1fe40f88f49c6p-55},
-{0x1.960000135d8eap-1, -0x1.f832268dc3095p-55},
-{0x1.99ffff9435acp-1, 0x1.7031d8b835edcp-56},
-{0x1.9e00003478565p-1, -0x1.0030b221ce3eep-58},
-{0x1.a20000b592948p-1, 0x1.8fd2f1dbd4639p-55},
-{0x1.a600000ad0bcfp-1, 0x1.901d6a974e6bep-55},
-{0x1.a9ffff55953a5p-1, 0x1.a07556192db98p-57},
-{0x1.adffff29ce03dp-1, -0x1.fff0717ec71c2p-56},
-{0x1.b1ffff34f3ac8p-1, 0x1.8005573de89d1p-57},
-{0x1.b60000894c55bp-1, -0x1.ff2fb51b044c7p-57},
-{0x1.b9fffef45ec7dp-1, -0x1.9ff7c4e8730fp-56},
-{0x1.be0000cda7b2ap-1, 0x1.57d058dbf3c1dp-55},
-{0x1.c1ffff2c57917p-1, 0x1.7e66d7e48dbc9p-58},
-{0x1.c60000ea5b82ap-1, -0x1.47f5e132ed4bep-55},
-{0x1.ca0001121ae98p-1, -0x1.40958c8d5e00ap-58},
-{0x1.ce0000f9241cbp-1, -0x1.7da063caa81c8p-59},
-{0x1.d1fffe8be95a4p-1, -0x1.82e3a411afcd9p-59},
-{0x1.d5ffff035932bp-1, -0x1.00f901b3fe87dp-58},
-{0x1.d9fffe8b54ba7p-1, 0x1.ffef55d6e3a4p-55},
-{0x1.de0000ad95d19p-1, 0x1.5feb2efd4c7c7p-55},
-{0x1.e1fffe925ce47p-1, 0x1.c8085484eaf08p-55},
-{0x1.e5fffe3ddf853p-1, -0x1.fd5ed02c5cadp-60},
-{0x1.e9fffed0a0e5fp-1, -0x1.a80aaef411586p-55},
-{0x1.ee00008f82eep-1, -0x1.b000aeaf97276p-55},
-{0x1.f20000a22d2f4p-1, -0x1.8f8906e13eba3p-56},
-{0x1.f5fffee35b57dp-1, 0x1.1fdd33b2d3714p-57},
-{0x1.fa00014eec3a6p-1, -0x1.3ee0b7a18c1a5p-58},
-{0x1.fdffff5daa89fp-1, -0x1.c1e24c8e3b503p-58},
-{0x1.0200005b93349p+0, -0x1.50197fe6bedcap-54},
-{0x1.05ffff9d597acp+0, 0x1.20160d062d0dcp-55},
-{0x1.0a00005687a63p+0, -0x1.27f3f9307696ep-54},
-{0x1.0dffff779164ep+0, 0x1.b7eb40bb9c4f4p-54},
-{0x1.12000044a0aa8p+0, 0x1.efbc914d512c4p-55},
-{0x1.16000069685bcp+0, -0x1.c0bea3eb2d82cp-57},
-{0x1.1a000093f0d78p+0, 0x1.1fecbf1e8c52p-54},
-{0x1.1dffffb2b1457p+0, -0x1.3fc91365637d6p-55},
-{0x1.2200008824a1p+0, -0x1.dff7e9feb578ap-54},
-{0x1.25ffffeef953p+0, -0x1.b00a61ec912f7p-55},
-{0x1.2a0000a1e7783p+0, 0x1.60048318b0483p-56},
-{0x1.2e0000853d4c7p+0, -0x1.77fbedf2c8cf3p-54},
-{0x1.320000324c55bp+0, 0x1.f81983997354fp-54},
-{0x1.360000594f796p+0, -0x1.cfe4beff900a9p-54},
-{0x1.3a0000a4c1c0fp+0, 0x1.07dbb2e268d0ep-54},
-{0x1.3e0000751c61bp+0, 0x1.80583ed1c566ep-56},
-{0x1.42000069e8a9fp+0, 0x1.f01f1edf82045p-54},
-{0x1.460000b5a1e34p+0, -0x1.dfdf0cf45c14ap-55},
-{0x1.4a0000187e513p+0, 0x1.401306b83a98dp-55},
-{0x1.4dffff3ba420bp+0, 0x1.9fc6539a6454ep-56},
-{0x1.51fffffe391c9p+0, -0x1.601ef3353ac83p-54},
-{0x1.560000e342455p+0, 0x1.3fb7fac8ac151p-55},
-{0x1.59ffffc39676fp+0, 0x1.4fe7dd6659cc2p-55},
-{0x1.5dfffff10ef42p+0, -0x1.48154cb592bcbp-54},
-#elif N == 128
-{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56},
-{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55},
-{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55},
-{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57},
-{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56},
-{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55},
-{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55},
-{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56},
-{0x1.710000e86978p-1, 0x1.bff6671097952p-56},
-{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55},
-{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57},
-{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57},
-{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55},
-{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56},
-{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55},
-{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55},
-{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55},
-{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55},
-{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55},
-{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55},
-{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55},
-{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56},
-{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55},
-{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55},
-{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55},
-{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56},
-{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55},
-{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56},
-{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55},
-{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55},
-{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60},
-{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55},
-{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56},
-{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55},
-{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55},
-{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55},
-{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55},
-{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57},
-{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55},
-{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57},
-{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58},
-{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56},
-{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56},
-{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55},
-{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56},
-{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57},
-{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57},
-{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55},
-{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55},
-{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57},
-{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55},
-{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55},
-{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56},
-{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57},
-{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55},
-{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55},
-{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56},
-{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55},
-{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58},
-{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56},
-{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56},
-{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55},
-{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55},
-{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57},
-{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56},
-{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56},
-{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56},
-{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58},
-{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55},
-{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56},
-{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58},
-{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55},
-{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59},
-{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55},
-{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55},
-{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57},
-{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56},
-{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57},
-{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56},
-{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57},
-{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55},
-{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54},
-{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54},
-{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55},
-{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57},
-{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54},
-{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55},
-{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56},
-{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55},
-{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54},
-{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54},
-{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55},
-{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54},
-{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54},
-{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57},
-{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54},
-{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54},
-{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54},
-{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56},
-{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56},
-{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56},
-{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54},
-{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55},
-{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55},
-{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55},
-{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54},
-{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54},
-{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55},
-{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54},
-{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55},
-{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56},
-{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54},
-{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57},
-{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55},
-{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55},
-{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54},
-{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54},
-{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54},
-{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54},
-{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54},
-{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57},
-{0x1.530001605277ap+0, -0x1.6bfcece233209p-54},
-{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55},
-{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54},
-{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55},
-{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54},
-{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54},
-{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54},
-#endif
-},
-#endif /* !HAVE_FAST_FMA */
-};
diff --git a/pl/math/logf.c b/pl/math/logf.c
deleted file mode 100644
index 17a74ed6d28f..000000000000
--- a/pl/math/logf.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Single-precision log function.
- *
- * Copyright (c) 2017-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include <math.h>
-#include <stdint.h>
-#include "math_config.h"
-
-/*
-LOGF_TABLE_BITS = 4
-LOGF_POLY_ORDER = 4
-
-ULP error: 0.818 (nearest rounding.)
-Relative error: 1.957 * 2^-26 (before rounding.)
-*/
-
-#define T __logf_data.tab
-#define A __logf_data.poly
-#define Ln2 __logf_data.ln2
-#define N (1 << LOGF_TABLE_BITS)
-#define OFF 0x3f330000
-
-float
-optr_aor_log_f32 (float x)
-{
- /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */
- double_t z, r, r2, y, y0, invc, logc;
- uint32_t ix, iz, tmp;
- int k, i;
-
- ix = asuint (x);
-#if WANT_ROUNDING
- /* Fix sign of zero with downward rounding when x==1. */
- if (unlikely (ix == 0x3f800000))
- return 0;
-#endif
- if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000))
- {
- /* x < 0x1p-126 or inf or nan. */
- if (ix * 2 == 0)
- return __math_divzerof (1);
- if (ix == 0x7f800000) /* log(inf) == inf. */
- return x;
- if ((ix & 0x80000000) || ix * 2 >= 0xff000000)
- return __math_invalidf (x);
- /* x is subnormal, normalize it. */
- ix = asuint (x * 0x1p23f);
- ix -= 23 << 23;
- }
-
- /* x = 2^k z; where z is in range [OFF,2*OFF] and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- tmp = ix - OFF;
- i = (tmp >> (23 - LOGF_TABLE_BITS)) % N;
- k = (int32_t) tmp >> 23; /* arithmetic shift */
- iz = ix - (tmp & 0x1ff << 23);
- invc = T[i].invc;
- logc = T[i].logc;
- z = (double_t) asfloat (iz);
-
- /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */
- r = z * invc - 1;
- y0 = logc + (double_t) k * Ln2;
-
- /* Pipelined polynomial evaluation to approximate log1p(r). */
- r2 = r * r;
- y = A[1] * r + A[2];
- y = A[0] * r2 + y;
- y = y * r2 + (y0 + r);
- return eval_as_float (y);
-}
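
A minimal standalone sketch of the range reduction used above, assuming
IEEE-754 binary32; the helper names are illustrative and not part of the
library. It splits x into 2^k * z with z in [0x1.66p-1, 0x1.66p+0) and
derives the table index i from the top mantissa bits:

    #include <stdint.h>
    #include <string.h>

    #define TABLE_BITS 4
    #define OFF 0x3f330000 /* bit pattern of 0x1.66p-1. */

    static void
    reduce (float x, int *k, int *i)
    {
      uint32_t ix;
      memcpy (&ix, &x, sizeof ix); /* portable asuint. */
      uint32_t tmp = ix - OFF;
      *i = (tmp >> (23 - TABLE_BITS)) % (1 << TABLE_BITS);
      *k = (int32_t) tmp >> 23; /* arithmetic shift recovers the exponent. */
    }
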
diff --git a/pl/math/logf_data.c b/pl/math/logf_data.c
deleted file mode 100644
index 97d9eb8d0097..000000000000
--- a/pl/math/logf_data.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Data definition for logf and log10f.
- *
- * Copyright (c) 2017-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-const struct logf_data __logf_data = {
- .tab =
- {
- {0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2},
- {0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2},
- {0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2},
- {0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3},
- {0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3},
- {0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3},
- {0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4},
- {0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4},
- {0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5},
- {0x1p+0, 0x0p+0},
- {0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5},
- {0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4},
- {0x1.b2036576afce6p-1, 0x1.526e57720db08p-3},
- {0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3},
- {0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2},
- {0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2},
- },
- .ln2 = 0x1.62e42fefa39efp-1,
- .invln10 = 0x1.bcb7b1526e50ep-2,
- .poly = {
- -0x1.00ea348b88334p-2,
- 0x1.5575b0be00b6ap-2,
- -0x1.ffffef20a4123p-2,
- }};
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
deleted file mode 100644
index c3dd8f2db8c7..000000000000
--- a/pl/math/math_config.h
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- * Configuration for math routines.
- *
- * Copyright (c) 2017-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef _MATH_CONFIG_H
-#define _MATH_CONFIG_H
-
-#include <math.h>
-#include <stdint.h>
-
-#ifndef WANT_ROUNDING
-/* If defined to 1, return correct results for special cases in non-nearest
- rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than
- -0.0f). This may be set to 0 if there is no fenv support or if math
- functions only get called in round to nearest mode. */
-# define WANT_ROUNDING 1
-#endif
-#ifndef WANT_ERRNO
-/* If defined to 1, set errno in math functions according to ISO C. Many math
- libraries do not set errno, so this is 0 by default. It may need to be
- set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */
-# define WANT_ERRNO 0
-#endif
-#ifndef WANT_SIMD_EXCEPT
-/* If defined to 1, trigger fp exceptions in vector routines, consistently with
- behaviour expected from the corresponding scalar routine. */
-# define WANT_SIMD_EXCEPT 0
-#endif
-
-/* Compiler can inline round as a single instruction. */
-#ifndef HAVE_FAST_ROUND
-# if __aarch64__
-# define HAVE_FAST_ROUND 1
-# else
-# define HAVE_FAST_ROUND 0
-# endif
-#endif
-
-/* Compiler can inline lround, but not (long)round(x). */
-#ifndef HAVE_FAST_LROUND
-# if __aarch64__ && (100 * __GNUC__ + __GNUC_MINOR__) >= 408 \
- && __NO_MATH_ERRNO__
-# define HAVE_FAST_LROUND 1
-# else
-# define HAVE_FAST_LROUND 0
-# endif
-#endif
-
-/* Compiler can inline fma as a single instruction. */
-#ifndef HAVE_FAST_FMA
-# if defined FP_FAST_FMA || __aarch64__
-# define HAVE_FAST_FMA 1
-# else
-# define HAVE_FAST_FMA 0
-# endif
-#endif
-
-/* Provide *_finite symbols and some of the glibc hidden symbols
- so libmathlib can be used with binaries compiled against glibc
- to interpose math functions with both static and dynamic linking. */
-#ifndef USE_GLIBC_ABI
-# if __GNUC__
-# define USE_GLIBC_ABI 1
-# else
-# define USE_GLIBC_ABI 0
-# endif
-#endif
-
-/* Optionally used extensions. */
-#ifdef __GNUC__
-# define HIDDEN __attribute__ ((__visibility__ ("hidden")))
-# define NOINLINE __attribute__ ((noinline))
-# define UNUSED __attribute__ ((unused))
-# define likely(x) __builtin_expect (!!(x), 1)
-# define unlikely(x) __builtin_expect (x, 0)
-# if __GNUC__ >= 9
-# define attribute_copy(f) __attribute__ ((copy (f)))
-# else
-# define attribute_copy(f)
-# endif
-# define strong_alias(f, a) \
- extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f);
-# define hidden_alias(f, a) \
- extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \
- attribute_copy (f);
-#else
-# define HIDDEN
-# define NOINLINE
-# define UNUSED
-# define likely(x) (x)
-# define unlikely(x) (x)
-#endif
-
-/* Return ptr but hide its value from the compiler so accesses through it
- cannot be optimized based on the contents. */
-#define ptr_barrier(ptr) \
- ({ \
- __typeof (ptr) __ptr = (ptr); \
- __asm("" : "+r"(__ptr)); \
- __ptr; \
- })
-
-/* Symbol renames to avoid libc conflicts. */
-#define __math_oflowf arm_math_oflowf
-#define __math_uflowf arm_math_uflowf
-#define __math_may_uflowf arm_math_may_uflowf
-#define __math_divzerof arm_math_divzerof
-#define __math_oflow arm_math_oflow
-#define __math_uflow arm_math_uflow
-#define __math_may_uflow arm_math_may_uflow
-#define __math_divzero arm_math_divzero
-#define __math_invalidf arm_math_invalidf
-#define __math_invalid arm_math_invalid
-#define __math_check_oflow arm_math_check_oflow
-#define __math_check_uflow arm_math_check_uflow
-#define __math_check_oflowf arm_math_check_oflowf
-#define __math_check_uflowf arm_math_check_uflowf
-
-#if HAVE_FAST_ROUND
-/* When set, the roundtoint and converttoint functions are provided with
- the semantics documented below. */
-# define TOINT_INTRINSICS 1
-
-/* Round x to nearest int in all rounding modes, ties have to be rounded
- consistently with converttoint so the results match. If the result
- would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */
-static inline double_t
-roundtoint (double_t x)
-{
- return round (x);
-}
-
-/* Convert x to nearest int in all rounding modes, ties have to be rounded
-   consistently with roundtoint. If the result is not representable in an
- int32_t then the semantics is unspecified. */
-static inline int32_t
-converttoint (double_t x)
-{
-# if HAVE_FAST_LROUND
- return lround (x);
-# else
- return (long) round (x);
-# endif
-}
-#endif
-
-static inline uint32_t
-asuint (float f)
-{
- union
- {
- float f;
- uint32_t i;
- } u = { f };
- return u.i;
-}
-
-static inline float
-asfloat (uint32_t i)
-{
- union
- {
- uint32_t i;
- float f;
- } u = { i };
- return u.f;
-}
-
-static inline uint64_t
-asuint64 (double f)
-{
- union
- {
- double f;
- uint64_t i;
- } u = { f };
- return u.i;
-}
-
-static inline double
-asdouble (uint64_t i)
-{
- union
- {
- uint64_t i;
- double f;
- } u = { i };
- return u.f;
-}
-
-#ifndef IEEE_754_2008_SNAN
-# define IEEE_754_2008_SNAN 1
-#endif
-static inline int
-issignalingf_inline (float x)
-{
- uint32_t ix = asuint (x);
- if (!IEEE_754_2008_SNAN)
- return (ix & 0x7fc00000) == 0x7fc00000;
- return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000;
-}
-
-static inline int
-issignaling_inline (double x)
-{
- uint64_t ix = asuint64 (x);
- if (!IEEE_754_2008_SNAN)
- return (ix & 0x7ff8000000000000) == 0x7ff8000000000000;
- return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL;
-}
-
-#if __aarch64__ && __GNUC__
-/* Prevent the optimization of a floating-point expression. */
-static inline float
-opt_barrier_float (float x)
-{
- __asm__ __volatile__ ("" : "+w" (x));
- return x;
-}
-static inline double
-opt_barrier_double (double x)
-{
- __asm__ __volatile__ ("" : "+w" (x));
- return x;
-}
-/* Force the evaluation of a floating-point expression for its side-effect. */
-static inline void
-force_eval_float (float x)
-{
- __asm__ __volatile__ ("" : "+w" (x));
-}
-static inline void
-force_eval_double (double x)
-{
- __asm__ __volatile__ ("" : "+w" (x));
-}
-#else
-static inline float
-opt_barrier_float (float x)
-{
- volatile float y = x;
- return y;
-}
-static inline double
-opt_barrier_double (double x)
-{
- volatile double y = x;
- return y;
-}
-static inline void
-force_eval_float (float x)
-{
- volatile float y UNUSED = x;
-}
-static inline void
-force_eval_double (double x)
-{
- volatile double y UNUSED = x;
-}
-#endif
-
-/* Evaluate an expression as the specified type, normally a type
- cast should be enough, but compilers implement non-standard
- excess-precision handling, so when FLT_EVAL_METHOD != 0 then
- these functions may need to be customized. */
-static inline float
-eval_as_float (float x)
-{
- return x;
-}
-static inline double
-eval_as_double (double x)
-{
- return x;
-}
-
-/* Error handling tail calls for special cases, with a sign argument.
- The sign of the return value is set if the argument is non-zero. */
-
-/* The result overflows. */
-HIDDEN float __math_oflowf (uint32_t);
-/* The result underflows to 0 in nearest rounding mode. */
-HIDDEN float __math_uflowf (uint32_t);
-/* The result underflows to 0 in some directed rounding mode only. */
-HIDDEN float __math_may_uflowf (uint32_t);
-/* Division by zero. */
-HIDDEN float __math_divzerof (uint32_t);
-/* The result overflows. */
-HIDDEN double __math_oflow (uint32_t);
-/* The result underflows to 0 in nearest rounding mode. */
-HIDDEN double __math_uflow (uint32_t);
-/* The result underflows to 0 in some directed rounding mode only. */
-HIDDEN double __math_may_uflow (uint32_t);
-/* Division by zero. */
-HIDDEN double __math_divzero (uint32_t);
-
-/* Error handling using input checking. */
-
-/* Invalid input unless it is a quiet NaN. */
-HIDDEN float __math_invalidf (float);
-/* Invalid input unless it is a quiet NaN. */
-HIDDEN double __math_invalid (double);
-
-/* Error handling using output checking, only for errno setting. */
-
-/* Check if the result overflowed to infinity. */
-HIDDEN double __math_check_oflow (double);
-/* Check if the result underflowed to 0. */
-HIDDEN double __math_check_uflow (double);
-
-/* Check if the result overflowed to infinity. */
-static inline double
-check_oflow (double x)
-{
- return WANT_ERRNO ? __math_check_oflow (x) : x;
-}
-
-/* Check if the result underflowed to 0. */
-static inline double
-check_uflow (double x)
-{
- return WANT_ERRNO ? __math_check_uflow (x) : x;
-}
-
-/* Check if the result overflowed to infinity. */
-HIDDEN float __math_check_oflowf (float);
-/* Check if the result underflowed to 0. */
-HIDDEN float __math_check_uflowf (float);
-
-/* Check if the result overflowed to infinity. */
-static inline float
-check_oflowf (float x)
-{
- return WANT_ERRNO ? __math_check_oflowf (x) : x;
-}
-
-/* Check if the result underflowed to 0. */
-static inline float
-check_uflowf (float x)
-{
- return WANT_ERRNO ? __math_check_uflowf (x) : x;
-}
-
-extern const struct erff_data
-{
- struct
- {
- float erf, scale;
- } tab[513];
-} __erff_data HIDDEN;
-
-extern const struct sv_erff_data
-{
- float erf[513];
- float scale[513];
-} __sv_erff_data HIDDEN;
-
-extern const struct erfcf_data
-{
- struct
- {
- float erfc, scale;
- } tab[645];
-} __erfcf_data HIDDEN;
-
-/* Data for logf and log10f. */
-#define LOGF_TABLE_BITS 4
-#define LOGF_POLY_ORDER 4
-extern const struct logf_data
-{
- struct
- {
- double invc, logc;
- } tab[1 << LOGF_TABLE_BITS];
- double ln2;
- double invln10;
- double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */
-} __logf_data HIDDEN;
-
-/* Data for low accuracy log10 (with 1/ln(10) included in coefficients). */
-#define LOG10_TABLE_BITS 7
-#define LOG10_POLY_ORDER 6
-#define LOG10_POLY1_ORDER 12
-extern const struct log10_data
-{
- double ln2hi;
- double ln2lo;
- double invln10;
- double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). */
- double poly1[LOG10_POLY1_ORDER - 1];
- struct
- {
- double invc, logc;
- } tab[1 << LOG10_TABLE_BITS];
-#if !HAVE_FAST_FMA
- struct
- {
- double chi, clo;
- } tab2[1 << LOG10_TABLE_BITS];
-#endif
-} __log10_data HIDDEN;
-
-#define EXP_TABLE_BITS 7
-#define EXP_POLY_ORDER 5
-/* Use polynomial that is optimized for a wider input range. This may be
- needed for good precision in non-nearest rounding and !TOINT_INTRINSICS. */
-#define EXP_POLY_WIDE 0
-/* Use close to nearest rounding toint when !TOINT_INTRINSICS. This may be
-   needed for good precision in non-nearest rounding and !EXP_POLY_WIDE. */
-#define EXP_USE_TOINT_NARROW 0
-#define EXP2_POLY_ORDER 5
-#define EXP2_POLY_WIDE 0
-extern const struct exp_data
-{
- double invln2N;
- double shift;
- double negln2hiN;
- double negln2loN;
- double poly[4]; /* Last four coefficients. */
- double exp2_shift;
- double exp2_poly[EXP2_POLY_ORDER];
- uint64_t tab[2 * (1 << EXP_TABLE_BITS)];
-} __exp_data HIDDEN;
-
-/* Copied from math/v_exp.h for use in vector exp_tail. */
-#define V_EXP_TAIL_TABLE_BITS 8
-extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN;
-
-/* Copied from math/v_exp.h for use in vector exp2. */
-#define V_EXP_TABLE_BITS 7
-extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
-
-extern const struct erf_data
-{
- struct
- {
- double erf, scale;
- } tab[769];
-} __erf_data HIDDEN;
-
-extern const struct sv_erf_data
-{
- double erf[769];
- double scale[769];
-} __sv_erf_data HIDDEN;
-
-extern const struct erfc_data
-{
- struct
- {
- double erfc, scale;
- } tab[3488];
-} __erfc_data HIDDEN;
-
-#define ATAN_POLY_NCOEFFS 20
-extern const struct atan_poly_data
-{
- double poly[ATAN_POLY_NCOEFFS];
-} __atan_poly_data HIDDEN;
-
-#define ATANF_POLY_NCOEFFS 8
-extern const struct atanf_poly_data
-{
- float poly[ATANF_POLY_NCOEFFS];
-} __atanf_poly_data HIDDEN;
-
-#define ASINHF_NCOEFFS 8
-extern const struct asinhf_data
-{
- float coeffs[ASINHF_NCOEFFS];
-} __asinhf_data HIDDEN;
-
-#define LOG_TABLE_BITS 7
-#define LOG_POLY_ORDER 6
-#define LOG_POLY1_ORDER 12
-extern const struct log_data
-{
- double ln2hi;
- double ln2lo;
- double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1. */
- double poly1[LOG_POLY1_ORDER - 1];
- struct
- {
- double invc, logc;
- } tab[1 << LOG_TABLE_BITS];
-#if !HAVE_FAST_FMA
- struct
- {
- double chi, clo;
- } tab2[1 << LOG_TABLE_BITS];
-#endif
-} __log_data HIDDEN;
-
-#define ASINH_NCOEFFS 18
-extern const struct asinh_data
-{
- double poly[ASINH_NCOEFFS];
-} __asinh_data HIDDEN;
-
-#define LOG1P_NCOEFFS 19
-extern const struct log1p_data
-{
- double coeffs[LOG1P_NCOEFFS];
-} __log1p_data HIDDEN;
-
-#define LOG1PF_2U5
-#define LOG1PF_NCOEFFS 9
-extern const struct log1pf_data
-{
- float coeffs[LOG1PF_NCOEFFS];
-} __log1pf_data HIDDEN;
-
-#define TANF_P_POLY_NCOEFFS 6
-/* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. */
-#define TANF_Q_POLY_NCOEFFS 4
-extern const struct tanf_poly_data
-{
- float poly_tan[TANF_P_POLY_NCOEFFS];
- float poly_cotan[TANF_Q_POLY_NCOEFFS];
-} __tanf_poly_data HIDDEN;
-
-#define V_LOG2_TABLE_BITS 7
-extern const struct v_log2_data
-{
- double poly[5];
- double invln2;
- struct
- {
- double invc, log2c;
- } table[1 << V_LOG2_TABLE_BITS];
-} __v_log2_data HIDDEN;
-
-#define V_LOG10_TABLE_BITS 7
-extern const struct v_log10_data
-{
- double poly[5];
- double invln10, log10_2;
- struct
- {
- double invc, log10c;
- } table[1 << V_LOG10_TABLE_BITS];
-} __v_log10_data HIDDEN;
-
-/* Some data for SVE powf's internal exp and log. */
-#define V_POWF_EXP2_TABLE_BITS 5
-#define V_POWF_EXP2_N (1 << V_POWF_EXP2_TABLE_BITS)
-#define V_POWF_LOG2_TABLE_BITS 5
-#define V_POWF_LOG2_N (1 << V_POWF_LOG2_TABLE_BITS)
-extern const struct v_powf_data
-{
- double invc[V_POWF_LOG2_N];
- double logc[V_POWF_LOG2_N];
- uint64_t scale[V_POWF_EXP2_N];
-} __v_powf_data HIDDEN;
-
-#define V_LOG_POLY_ORDER 6
-#define V_LOG_TABLE_BITS 7
-extern const struct v_log_data
-{
- /* Shared data for vector log and log-derived routines (e.g. asinh). */
- double poly[V_LOG_POLY_ORDER - 1];
- double ln2;
- struct
- {
- double invc, logc;
- } table[1 << V_LOG_TABLE_BITS];
-} __v_log_data HIDDEN;
-
-#define EXPM1F_POLY_ORDER 5
-extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN;
-
-#define EXPF_TABLE_BITS 5
-#define EXPF_POLY_ORDER 3
-extern const struct expf_data
-{
- uint64_t tab[1 << EXPF_TABLE_BITS];
- double invln2_scaled;
- double poly_scaled[EXPF_POLY_ORDER];
-} __expf_data HIDDEN;
-
-#define EXPM1_POLY_ORDER 11
-extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN;
-
-extern const struct cbrtf_data
-{
- float poly[4];
- float table[5];
-} __cbrtf_data HIDDEN;
-
-extern const struct cbrt_data
-{
- double poly[4];
- double table[5];
-} __cbrt_data HIDDEN;
-
-#define ASINF_POLY_ORDER 4
-extern const float __asinf_poly[ASINF_POLY_ORDER + 1] HIDDEN;
-
-#define ASIN_POLY_ORDER 11
-extern const double __asin_poly[ASIN_POLY_ORDER + 1] HIDDEN;
-
-/* Some data for AdvSIMD and SVE pow's internal exp and log. */
-#define V_POW_EXP_TABLE_BITS 8
-extern const struct v_pow_exp_data
-{
- double poly[3];
- double n_over_ln2, ln2_over_n_hi, ln2_over_n_lo, shift;
- uint64_t sbits[1 << V_POW_EXP_TABLE_BITS];
-} __v_pow_exp_data HIDDEN;
-
-#define V_POW_LOG_TABLE_BITS 7
-extern const struct v_pow_log_data
-{
- double poly[7]; /* First coefficient is 1. */
- double ln2_hi, ln2_lo;
- double invc[1 << V_POW_LOG_TABLE_BITS];
- double logc[1 << V_POW_LOG_TABLE_BITS];
- double logctail[1 << V_POW_LOG_TABLE_BITS];
-} __v_pow_log_data HIDDEN;
-
-#endif
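
A quick sanity sketch of the union-based bit punning used by asuint and
friends (assumes IEEE-754 binary32): reading a different union member
round-trips the bit pattern without the strict-aliasing hazards of a
pointer cast.

    #include <assert.h>
    #include <stdint.h>

    static uint32_t
    my_asuint (float f)
    {
      union { float f; uint32_t i; } u = { f };
      return u.i;
    }

    int
    main (void)
    {
      assert (my_asuint (1.0f) == 0x3f800000); /* sign 0, exp 127, mantissa 0. */
      return 0;
    }
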
diff --git a/pl/math/math_err.c b/pl/math/math_err.c
deleted file mode 100644
index 74db54a5b2cd..000000000000
--- a/pl/math/math_err.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Double-precision math error handling.
- *
- * Copyright (c) 2018-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-#if WANT_ERRNO
-# include <errno.h>
-/* NOINLINE reduces code size and avoids making math functions non-leaf,
-   which inlining the error handling would do. */
-NOINLINE static double
-with_errno (double y, int e)
-{
- errno = e;
- return y;
-}
-#else
-# define with_errno(x, e) (x)
-#endif
-
-/* NOINLINE reduces code size. */
-NOINLINE static double
-xflow (uint32_t sign, double y)
-{
- y = eval_as_double (opt_barrier_double (sign ? -y : y) * y);
- return with_errno (y, ERANGE);
-}
-
-HIDDEN double
-__math_uflow (uint32_t sign)
-{
- return xflow (sign, 0x1p-767);
-}
-
-/* Underflows to zero in some non-nearest rounding mode; setting errno
-   is valid even if the result is non-zero but in the subnormal range. */
-HIDDEN double
-__math_may_uflow (uint32_t sign)
-{
- return xflow (sign, 0x1.8p-538);
-}
-
-HIDDEN double
-__math_oflow (uint32_t sign)
-{
- return xflow (sign, 0x1p769);
-}
-
-HIDDEN double
-__math_divzero (uint32_t sign)
-{
- double y = opt_barrier_double (sign ? -1.0 : 1.0) / 0.0;
- return with_errno (y, ERANGE);
-}
-
-HIDDEN double
-__math_invalid (double x)
-{
- double y = (x - x) / (x - x);
- return isnan (x) ? y : with_errno (y, EDOM);
-}
-
-/* Check result and set errno if necessary. */
-
-HIDDEN double
-__math_check_uflow (double y)
-{
- return y == 0.0 ? with_errno (y, ERANGE) : y;
-}
-
-HIDDEN double
-__math_check_oflow (double y)
-{
- return isinf (y) ? with_errno (y, ERANGE) : y;
-}
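
A hedged illustration of the xflow idiom above: squaring 0x1p769 yields
2^1538, far above the largest double (~2^1024), so the product is +/-inf
with FE_OVERFLOW raised; the barrier (stood in for by volatile here) keeps
the compiler from folding the multiply away at build time.

    #include <fenv.h>
    #include <stdio.h>

    int
    main (void)
    {
      volatile double big = 0x1p769;
      double y = big * big; /* overflows: y == inf, FE_OVERFLOW set. */
      printf ("y=%g overflow=%d\n", y, fetestexcept (FE_OVERFLOW) != 0);
      return 0;
    }
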
diff --git a/pl/math/math_errf.c b/pl/math/math_errf.c
deleted file mode 100644
index 2b8c6bd25753..000000000000
--- a/pl/math/math_errf.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Single-precision math error handling.
- *
- * Copyright (c) 2017-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-#if WANT_ERRNO
-# include <errno.h>
-/* NOINLINE reduces code size and avoids making math functions non-leaf,
-   which inlining the error handling would do. */
-NOINLINE static float
-with_errnof (float y, int e)
-{
- errno = e;
- return y;
-}
-#else
-# define with_errnof(x, e) (x)
-#endif
-
-/* NOINLINE reduces code size. */
-NOINLINE static float
-xflowf (uint32_t sign, float y)
-{
- y = eval_as_float (opt_barrier_float (sign ? -y : y) * y);
- return with_errnof (y, ERANGE);
-}
-
-HIDDEN float
-__math_uflowf (uint32_t sign)
-{
- return xflowf (sign, 0x1p-95f);
-}
-
-/* Underflows to zero in some non-nearest rounding mode; setting errno
-   is valid even if the result is non-zero but in the subnormal range. */
-HIDDEN float
-__math_may_uflowf (uint32_t sign)
-{
- return xflowf (sign, 0x1.4p-75f);
-}
-
-HIDDEN float
-__math_oflowf (uint32_t sign)
-{
- return xflowf (sign, 0x1p97f);
-}
-
-HIDDEN float
-__math_divzerof (uint32_t sign)
-{
- float y = opt_barrier_float (sign ? -1.0f : 1.0f) / 0.0f;
- return with_errnof (y, ERANGE);
-}
-
-HIDDEN float
-__math_invalidf (float x)
-{
- float y = (x - x) / (x - x);
- return isnan (x) ? y : with_errnof (y, EDOM);
-}
-
-/* Check result and set errno if necessary. */
-
-HIDDEN float
-__math_check_uflowf (float y)
-{
- return y == 0.0f ? with_errnof (y, ERANGE) : y;
-}
-
-HIDDEN float
-__math_check_oflowf (float y)
-{
- return isinf (y) ? with_errnof (y, ERANGE) : y;
-}
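
The single-precision constants are chosen the same way; a worked check
(illustrative, round-to-nearest assumed): (2^-95)^2 = 2^-190 is below the
smallest subnormal 2^-149 and underflows to 0, while (2^97)^2 = 2^194 is
above FLT_MAX (~2^128) and overflows to inf.

    #include <stdio.h>

    int
    main (void)
    {
      volatile float tiny = 0x1p-95f, huge = 0x1p97f;
      printf ("%g %g\n", tiny * tiny, huge * huge); /* expect: 0 inf. */
      return 0;
    }
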
diff --git a/pl/math/pl_sig.h b/pl/math/pl_sig.h
deleted file mode 100644
index 52d988f0e1ce..000000000000
--- a/pl/math/pl_sig.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * PL macros for emitting various ulp/bench entries based on function signature
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception.
- */
-
-#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
-#define V_NAME_D1(fun) _ZGVnN2v_##fun
-#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
-#define V_NAME_D2(fun) _ZGVnN2vv_##fun
-
-#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f
-#define SV_NAME_D1(fun) _ZGVsMxv_##fun
-#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f
-#define SV_NAME_D2(fun) _ZGVsMxvv_##fun
-
-#define PL_DECL_SF1(fun) float fun##f (float);
-#define PL_DECL_SF2(fun) float fun##f (float, float);
-#define PL_DECL_SD1(fun) double fun (double);
-#define PL_DECL_SD2(fun) double fun (double, double);
-
-#if WANT_VMATH
-# define PL_DECL_VF1(fun) \
- VPCS_ATTR float32x4_t V_NAME_F1 (fun##f) (float32x4_t);
-# define PL_DECL_VF2(fun) \
- VPCS_ATTR float32x4_t V_NAME_F2 (fun##f) (float32x4_t, float32x4_t);
-# define PL_DECL_VD1(fun) VPCS_ATTR float64x2_t V_NAME_D1 (fun) (float64x2_t);
-# define PL_DECL_VD2(fun) \
- VPCS_ATTR float64x2_t V_NAME_D2 (fun) (float64x2_t, float64x2_t);
-#else
-# define PL_DECL_VF1(fun)
-# define PL_DECL_VF2(fun)
-# define PL_DECL_VD1(fun)
-# define PL_DECL_VD2(fun)
-#endif
-
-#if WANT_SVE_MATH
-# define PL_DECL_SVF1(fun) \
- svfloat32_t SV_NAME_F1 (fun) (svfloat32_t, svbool_t);
-# define PL_DECL_SVF2(fun) \
- svfloat32_t SV_NAME_F2 (fun) (svfloat32_t, svfloat32_t, svbool_t);
-# define PL_DECL_SVD1(fun) \
- svfloat64_t SV_NAME_D1 (fun) (svfloat64_t, svbool_t);
-# define PL_DECL_SVD2(fun) \
- svfloat64_t SV_NAME_D2 (fun) (svfloat64_t, svfloat64_t, svbool_t);
-#else
-# define PL_DECL_SVF1(fun)
-# define PL_DECL_SVF2(fun)
-# define PL_DECL_SVD1(fun)
-# define PL_DECL_SVD2(fun)
-#endif
-
-/* For building the routines, emit function prototype from PL_SIG. This
- ensures that the correct signature has been chosen (wrong one will be a
- compile error). PL_SIG is defined differently by various components of the
- build system to emit entries in the wrappers and entries for mathbench and
- ulp. */
-#define PL_SIG(v, t, a, f, ...) PL_DECL_##v##t##a (f)
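
A hedged trace of one expansion, using only the macros above: the line
PL_SIG (SV, D, 1, acosh, 1.0, 10.0) pastes to PL_DECL_SVD1 (acosh), which
(with WANT_SVE_MATH set) emits the prototype

    svfloat64_t _ZGVsMxv_acosh (svfloat64_t, svbool_t);

so defining the routine with any other signature is a compile error, as the
comment above describes.
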
diff --git a/pl/math/sv_acosh_3u5.c b/pl/math/sv_acosh_3u5.c
deleted file mode 100644
index faf351331464..000000000000
--- a/pl/math/sv_acosh_3u5.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Double-precision SVE acosh(x) function.
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define WANT_SV_LOG1P_K0_SHORTCUT 1
-#include "sv_log1p_inline.h"
-
-#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */
-#define OneTop 0x3ff
-
-static NOINLINE svfloat64_t
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
-{
- return sv_call_f64 (acosh, x, y, special);
-}
-
-/* SVE approximation for double-precision acosh, based on log1p.
- The largest observed error is 3.19 ULP in the region where the
- argument to log1p falls in the k=0 interval, i.e. x close to 1:
- SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
- want 0x1.ed23399f51373p-2. */
-svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
-{
- svuint64_t itop = svlsr_x (pg, svreinterpret_u64 (x), 52);
- /* (itop - OneTop) >= (BigBoundTop - OneTop). */
- svbool_t special = svcmpge (pg, svsub_x (pg, itop, OneTop), sv_u64 (0x1ff));
-
- svfloat64_t xm1 = svsub_x (pg, x, 1);
- svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1));
- svfloat64_t y = sv_log1p_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
-
- /* Fall back to scalar routine for special lanes. */
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
-
- return y;
-}
-
-PL_SIG (SV, D, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (SV_NAME_D1 (acosh), 2.69)
-PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 1, 0x1p511, 90000)
-PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0x1p511, inf, 10000)
-PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0, 1, 1000)
-PL_TEST_INTERVAL (SV_NAME_D1 (acosh), -0, -inf, 10000)
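
A scalar sketch of the identity the vector code evaluates per lane, assuming
x >= 1 (other lanes go to the scalar acosh fallback). Writing the argument
in log1p form keeps accuracy near x = 1, where acosh is most sensitive:

    #include <math.h>

    /* acosh(x) = log(x + sqrt(x^2 - 1))
                = log1p((x - 1) + sqrt((x - 1) * (x + 1))). */
    static double
    acosh_sketch (double x)
    {
      double xm1 = x - 1.0;
      return log1p (xm1 + sqrt (xm1 * (x + 1.0)));
    }
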
diff --git a/pl/math/sv_acoshf_2u8.c b/pl/math/sv_acoshf_2u8.c
deleted file mode 100644
index f527083af40a..000000000000
--- a/pl/math/sv_acoshf_2u8.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Single-precision SVE acosh(x) function.
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define One 0x3f800000
-#define Thres 0x20000000 /* asuint(0x1p64) - One. */
-
-#include "sv_log1pf_inline.h"
-
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (acoshf, x, y, special);
-}
-
-/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
- vector acoshf and log1p.
-
- Maximum error is 2.78 ULPs:
- SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
- want 0x1.f45b3cp-4. */
-svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
-{
- svuint32_t ix = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
-
- svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
- svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
- svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
-
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
- return y;
-}
-
-PL_SIG (SV, F, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (SV_NAME_F1 (acosh), 2.29)
-PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0, 1, 500)
-PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 1, 0x1p64, 100000)
-PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0x1p64, inf, 1000)
-PL_TEST_INTERVAL (SV_NAME_F1 (acosh), -0, -inf, 1000)
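
A sketch of the single-compare special-case filter above: with One =
asuint (1.0f) and Thres = asuint (0x1p64) - One, the unsigned subtraction
wraps around for any ix < One, so one comparison rejects x < 1, negative x,
NaNs and x >= 0x1p64 at once (scalar and illustrative only):

    #include <stdint.h>
    #include <string.h>

    static int
    is_special (float x)
    {
      uint32_t ix;
      memcpy (&ix, &x, sizeof ix);
      return ix - 0x3f800000u >= 0x20000000u; /* wraps when ix < One. */
    }
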
diff --git a/pl/math/sv_asinh_3u0.c b/pl/math/sv_asinh_3u0.c
deleted file mode 100644
index 711f0dfdbedc..000000000000
--- a/pl/math/sv_asinh_3u0.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Double-precision SVE asinh(x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "poly_sve_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define OneTop sv_u64 (0x3ff) /* top12(asuint64(1.0)). */
-#define HugeBound sv_u64 (0x5fe) /* top12(asuint64(0x1p511)). */
-#define TinyBound (0x3e5) /* top12(asuint64(0x1p-26)). */
-#define SignMask (0x8000000000000000)
-
-/* Constants & data for log. */
-#define A(i) __v_log_data.poly[i]
-#define Ln2 (0x1.62e42fefa39efp-1)
-#define N (1 << V_LOG_TABLE_BITS)
-#define OFF (0x3fe6900900000000)
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
-{
- return sv_call_f64 (asinh, x, y, special);
-}
-
-static inline svfloat64_t
-__sv_log_inline (svfloat64_t x, const svbool_t pg)
-{
- /* Double-precision SVE log, copied from pl/math/sv_log_2u5.c with some
- cosmetic modification and special-cases removed. See that file for details
- of the algorithm used. */
- svuint64_t ix = svreinterpret_u64 (x);
- svuint64_t tmp = svsub_x (pg, ix, OFF);
- svuint64_t i
- = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
- svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
- svfloat64_t z = svreinterpret_f64 (iz);
- svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
- svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
- svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
- svfloat64_t kd = svcvt_f64_x (pg, k);
- svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, Ln2);
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = svmla_x (pg, sv_f64 (A (2)), r, A (3));
- svfloat64_t p = svmla_x (pg, sv_f64 (A (0)), r, A (1));
- y = svmla_x (pg, y, r2, A (4));
- y = svmla_x (pg, p, r2, y);
- y = svmla_x (pg, hi, r2, y);
- return y;
-}
-
-/* Double-precision implementation of SVE asinh(x).
- asinh is very sensitive around 1, so it is impractical to devise a single
- low-cost algorithm which is sufficiently accurate on a wide range of input.
- Instead we use two different algorithms:
-   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
- = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
- where log(x) is an optimized log approximation, and P(x) is a polynomial
-   shared with the scalar routine. The greatest observed error is 2.51 ULP, in
- |x| >= 1:
- _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1
- want 0x1.e3181c43b0f39p-1. */
-svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
-{
- svuint64_t ix = svreinterpret_u64 (x);
- svuint64_t iax = svbic_x (pg, ix, SignMask);
- svuint64_t sign = svand_x (pg, ix, SignMask);
- svfloat64_t ax = svreinterpret_f64 (iax);
- svuint64_t top12 = svlsr_x (pg, iax, 52);
-
- svbool_t ge1 = svcmpge (pg, top12, OneTop);
- svbool_t special = svcmpge (pg, top12, HugeBound);
-
- /* Option 1: |x| >= 1.
-     Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). */
- svfloat64_t option_1 = sv_f64 (0);
- if (likely (svptest_any (pg, ge1)))
- {
- svfloat64_t axax = svmul_x (pg, ax, ax);
- option_1 = __sv_log_inline (
- svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, axax, 1))), pg);
- }
-
- /* Option 2: |x| < 1.
- Compute asinh(x) using a polynomial.
- The largest observed error in this region is 1.51 ULPs:
- _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
- want 0x1.c1e649ee2681dp-1. */
- svfloat64_t option_2 = sv_f64 (0);
- if (likely (svptest_any (pg, svnot_z (pg, ge1))))
- {
- svfloat64_t x2 = svmul_x (pg, ax, ax);
- svfloat64_t z2 = svmul_x (pg, x2, x2);
- svfloat64_t z4 = svmul_x (pg, z2, z2);
- svfloat64_t z8 = svmul_x (pg, z4, z4);
- svfloat64_t z16 = svmul_x (pg, z8, z8);
- svfloat64_t p
- = sv_estrin_17_f64_x (pg, x2, z2, z4, z8, z16, __asinh_data.poly);
- option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
- }
-
- /* Choose the right option for each lane. */
- svfloat64_t y = svsel (ge1, option_1, option_2);
-
- /* Apply sign of x to y. */
- y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
-
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
- return y;
-}
-
-PL_SIG (SV, D, 1, asinh, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_D1 (asinh), 2.52)
-/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
- Ensures the svsel is choosing the right option in all cases. */
-#define SV_ASINH_INTERVAL(lo, hi, n) \
- PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0.5) \
- PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 2) \
- PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0x1p600)
-SV_ASINH_INTERVAL (0, 0x1p-26, 50000)
-SV_ASINH_INTERVAL (0x1p-26, 1, 50000)
-SV_ASINH_INTERVAL (1, 0x1p511, 50000)
-SV_ASINH_INTERVAL (0x1p511, inf, 40000)
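
A scalar outline of the lane selection above, with the polynomial branch
stood in by the libm call for brevity (illustrative only). Both options are
computed from |x| and the sign is re-applied afterwards by XOR-ing the top
bit, mirroring the sveor at the end of the routine:

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    static double
    asinh_outline (double x)
    {
      double ax = fabs (x);
      double y = ax >= 1.0 ? log (ax + sqrt (ax * ax + 1.0))
                           : asinh (ax); /* stand-in for the polynomial. */
      uint64_t iy, ix;
      memcpy (&iy, &y, sizeof iy);
      memcpy (&ix, &x, sizeof ix);
      iy ^= ix & 0x8000000000000000ull; /* copy the sign of x onto y. */
      memcpy (&y, &iy, sizeof y);
      return y;
    }
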
diff --git a/pl/math/sv_coshf_2u.c b/pl/math/sv_coshf_2u.c
deleted file mode 100644
index 81680fef318e..000000000000
--- a/pl/math/sv_coshf_2u.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Single-precision SVE cosh(x) function.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#include "sv_expf_inline.h"
-
-static const struct data
-{
- struct sv_expf_data expf_consts;
- uint32_t special_bound;
-} data = {
- .expf_consts = SV_EXPF_DATA,
- /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
- .special_bound = 0x42ad496c,
-};
-
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
-{
- return sv_call_f32 (coshf, x, y, pg);
-}
-
-/* Single-precision vector cosh, using vector expf.
- Maximum error is 1.89 ULP:
- _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
- want 0x1.f00adcp+127. */
-svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
-
- svfloat32_t ax = svabs_x (pg, x);
- svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
-
- /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
- svfloat32_t half_t = svmul_x (pg, t, 0.5);
- svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
-
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
-
- return svadd_x (pg, half_t, half_over_t);
-}
-
-PL_SIG (SV, F, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_F1 (cosh), 1.39)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1p-63, 100)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
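
A scalar sketch of the evaluation above. Note that svdivr_x reverses its
operands, so svdivr_x (pg, t, 0.5) computes 0.5 / t; this gives
cosh(x) = e^|x|/2 + e^-|x|/2 from a single expf call:

    #include <math.h>

    static float
    coshf_sketch (float x)
    {
      float t = expf (fabsf (x)); /* e^|x|. */
      return 0.5f * t + 0.5f / t;
    }
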
diff --git a/pl/math/sv_erf_data.c b/pl/math/sv_erf_data.c
deleted file mode 100644
index 7244aceda5a5..000000000000
--- a/pl/math/sv_erf_data.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*
- * Data for approximation of erf.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Lookup table used in vector erf.
- For each possible rounded input r (multiples of 1/128), between
- r = 0.0 and r = 6.0 (769 values):
- - the first entry __erf_data.tab.erf contains the values of erf(r),
- - the second entry __erf_data.tab.scale contains the values of
- 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
- algorithm, since lookup is performed only for x >= 1/64-1/512. */
-const struct sv_erf_data __sv_erf_data = {
- .erf = { 0x0.0000000000000p+0,
- 0x1.20dbf3deb1340p-7,
- 0x1.20d77083f17a0p-6,
- 0x1.b137e0cf584dcp-6,
- 0x1.20c5645dd2538p-5,
- 0x1.68e5d3bbc9526p-5,
- 0x1.b0fafef135745p-5,
- 0x1.f902a77bd3821p-5,
- 0x1.207d480e90658p-4,
- 0x1.44703e87e8593p-4,
- 0x1.68591a1e83b5dp-4,
- 0x1.8c36beb8a8d23p-4,
- 0x1.b0081148a873ap-4,
- 0x1.d3cbf7e70a4b3p-4,
- 0x1.f78159ec8bb50p-4,
- 0x1.0d939005f65e5p-3,
- 0x1.1f5e1a35c3b89p-3,
- 0x1.311fc15f56d14p-3,
- 0x1.42d7fc2f64959p-3,
- 0x1.548642321d7c6p-3,
- 0x1.662a0bdf7a89fp-3,
- 0x1.77c2d2a765f9ep-3,
- 0x1.895010fdbdbfdp-3,
- 0x1.9ad142662e14dp-3,
- 0x1.ac45e37fe2526p-3,
- 0x1.bdad72110a648p-3,
- 0x1.cf076d1233237p-3,
- 0x1.e05354b96ff36p-3,
- 0x1.f190aa85540e2p-3,
- 0x1.015f78a3dcf3dp-2,
- 0x1.09eed6982b948p-2,
- 0x1.127631eb8de32p-2,
- 0x1.1af54e232d609p-2,
- 0x1.236bef825d9a2p-2,
- 0x1.2bd9db0f7827fp-2,
- 0x1.343ed6989b7d9p-2,
- 0x1.3c9aa8b84bedap-2,
- 0x1.44ed18d9f6462p-2,
- 0x1.4d35ef3e5372ep-2,
- 0x1.5574f4ffac98ep-2,
- 0x1.5da9f415ff23fp-2,
- 0x1.65d4b75b00471p-2,
- 0x1.6df50a8dff772p-2,
- 0x1.760aba57a76bfp-2,
- 0x1.7e15944d9d3e4p-2,
- 0x1.861566f5fd3c0p-2,
- 0x1.8e0a01cab516bp-2,
- 0x1.95f3353cbb146p-2,
- 0x1.9dd0d2b721f39p-2,
- 0x1.a5a2aca209394p-2,
- 0x1.ad68966569a87p-2,
- 0x1.b522646bbda68p-2,
- 0x1.bccfec24855b8p-2,
- 0x1.c4710406a65fcp-2,
- 0x1.cc058392a6d2dp-2,
- 0x1.d38d4354c3bd0p-2,
- 0x1.db081ce6e2a48p-2,
- 0x1.e275eaf25e458p-2,
- 0x1.e9d68931ae650p-2,
- 0x1.f129d471eabb1p-2,
- 0x1.f86faa9428f9dp-2,
- 0x1.ffa7ea8eb5fd0p-2,
- 0x1.03693a371519cp-1,
- 0x1.06f794ab2cae7p-1,
- 0x1.0a7ef5c18edd2p-1,
- 0x1.0dff4f247f6c6p-1,
- 0x1.1178930ada115p-1,
- 0x1.14eab43841b55p-1,
- 0x1.1855a5fd3dd50p-1,
- 0x1.1bb95c3746199p-1,
- 0x1.1f15cb50bc4dep-1,
- 0x1.226ae840d4d70p-1,
- 0x1.25b8a88b6dd7fp-1,
- 0x1.28ff0240d52cdp-1,
- 0x1.2c3debfd7d6c1p-1,
- 0x1.2f755ce9a21f4p-1,
- 0x1.32a54cb8db67bp-1,
- 0x1.35cdb3a9a144dp-1,
- 0x1.38ee8a84beb71p-1,
- 0x1.3c07ca9cb4f9ep-1,
- 0x1.3f196dcd0f135p-1,
- 0x1.42236e79a5fa6p-1,
- 0x1.4525c78dd5966p-1,
- 0x1.4820747ba2dc2p-1,
- 0x1.4b13713ad3513p-1,
- 0x1.4dfeba47f63ccp-1,
- 0x1.50e24ca35fd2cp-1,
- 0x1.53be25d016a4fp-1,
- 0x1.569243d2b3a9bp-1,
- 0x1.595ea53035283p-1,
- 0x1.5c2348ecc4dc3p-1,
- 0x1.5ee02e8a71a53p-1,
- 0x1.61955607dd15dp-1,
- 0x1.6442bfdedd397p-1,
- 0x1.66e86d0312e82p-1,
- 0x1.69865ee075011p-1,
- 0x1.6c1c9759d0e5fp-1,
- 0x1.6eab18c74091bp-1,
- 0x1.7131e5f496a5ap-1,
- 0x1.73b1021fc0cb8p-1,
- 0x1.762870f720c6fp-1,
- 0x1.78983697dc96fp-1,
- 0x1.7b00578c26037p-1,
- 0x1.7d60d8c979f7bp-1,
- 0x1.7fb9bfaed8078p-1,
- 0x1.820b1202f27fbp-1,
- 0x1.8454d5f25760dp-1,
- 0x1.8697120d92a4ap-1,
- 0x1.88d1cd474a2e0p-1,
- 0x1.8b050ef253c37p-1,
- 0x1.8d30debfc572ep-1,
- 0x1.8f5544bd00c04p-1,
- 0x1.91724951b8fc6p-1,
- 0x1.9387f53df5238p-1,
- 0x1.959651980da31p-1,
- 0x1.979d67caa6631p-1,
- 0x1.999d4192a5715p-1,
- 0x1.9b95e8fd26abap-1,
- 0x1.9d8768656cc42p-1,
- 0x1.9f71ca72cffb6p-1,
- 0x1.a1551a16aaeafp-1,
- 0x1.a331628a45b92p-1,
- 0x1.a506af4cc00f4p-1,
- 0x1.a6d50c20fa293p-1,
- 0x1.a89c850b7d54dp-1,
- 0x1.aa5d265064366p-1,
- 0x1.ac16fc7143263p-1,
- 0x1.adca142b10f98p-1,
- 0x1.af767a741088bp-1,
- 0x1.b11c3c79bb424p-1,
- 0x1.b2bb679ead19cp-1,
- 0x1.b4540978921eep-1,
- 0x1.b5e62fce16095p-1,
- 0x1.b771e894d602ep-1,
- 0x1.b8f741ef54f83p-1,
- 0x1.ba764a2af2b78p-1,
- 0x1.bbef0fbde6221p-1,
- 0x1.bd61a1453ab44p-1,
- 0x1.bece0d82d1a5cp-1,
- 0x1.c034635b66e23p-1,
- 0x1.c194b1d49a184p-1,
- 0x1.c2ef0812fc1bdp-1,
- 0x1.c443755820d64p-1,
- 0x1.c5920900b5fd1p-1,
- 0x1.c6dad2829ec62p-1,
- 0x1.c81de16b14cefp-1,
- 0x1.c95b455cce69dp-1,
- 0x1.ca930e0e2a825p-1,
- 0x1.cbc54b476248dp-1,
- 0x1.ccf20ce0c0d27p-1,
- 0x1.ce1962c0e0d8bp-1,
- 0x1.cf3b5cdaf0c39p-1,
- 0x1.d0580b2cfd249p-1,
- 0x1.d16f7dbe41ca0p-1,
- 0x1.d281c49d818d0p-1,
- 0x1.d38eefdf64fddp-1,
- 0x1.d4970f9ce00d9p-1,
- 0x1.d59a33f19ed42p-1,
- 0x1.d6986cfa798e7p-1,
- 0x1.d791cad3eff01p-1,
- 0x1.d8865d98abe01p-1,
- 0x1.d97635600bb89p-1,
- 0x1.da61623cb41e0p-1,
- 0x1.db47f43b2980dp-1,
- 0x1.dc29fb60715afp-1,
- 0x1.dd0787a8bb39dp-1,
- 0x1.dde0a90611a0dp-1,
- 0x1.deb56f5f12d28p-1,
- 0x1.df85ea8db188ep-1,
- 0x1.e0522a5dfda73p-1,
- 0x1.e11a3e8cf4eb8p-1,
- 0x1.e1de36c75ba58p-1,
- 0x1.e29e22a89d766p-1,
- 0x1.e35a11b9b61cep-1,
- 0x1.e4121370224ccp-1,
- 0x1.e4c6372cd8927p-1,
- 0x1.e5768c3b4a3fcp-1,
- 0x1.e62321d06c5e0p-1,
- 0x1.e6cc0709c8a0dp-1,
- 0x1.e7714aec96534p-1,
- 0x1.e812fc64db369p-1,
- 0x1.e8b12a44944a8p-1,
- 0x1.e94be342e6743p-1,
- 0x1.e9e335fb56f87p-1,
- 0x1.ea7730ed0bbb9p-1,
- 0x1.eb07e27a133aap-1,
- 0x1.eb9558e6b42cep-1,
- 0x1.ec1fa258c4beap-1,
- 0x1.eca6ccd709544p-1,
- 0x1.ed2ae6489ac1ep-1,
- 0x1.edabfc7453e63p-1,
- 0x1.ee2a1d004692cp-1,
- 0x1.eea5557137ae0p-1,
- 0x1.ef1db32a2277cp-1,
- 0x1.ef93436bc2daap-1,
- 0x1.f006135426b26p-1,
- 0x1.f0762fde45ee6p-1,
- 0x1.f0e3a5e1a1788p-1,
- 0x1.f14e8211e8c55p-1,
- 0x1.f1b6d0fea5f4dp-1,
- 0x1.f21c9f12f0677p-1,
- 0x1.f27ff89525acfp-1,
- 0x1.f2e0e9a6a8b09p-1,
- 0x1.f33f7e43a706bp-1,
- 0x1.f39bc242e43e6p-1,
- 0x1.f3f5c1558b19ep-1,
- 0x1.f44d870704911p-1,
- 0x1.f4a31ebcd47dfp-1,
- 0x1.f4f693b67bd77p-1,
- 0x1.f547f10d60597p-1,
- 0x1.f59741b4b97cfp-1,
- 0x1.f5e4907982a07p-1,
- 0x1.f62fe80272419p-1,
- 0x1.f67952cff6282p-1,
- 0x1.f6c0db3c34641p-1,
- 0x1.f7068b7b10fd9p-1,
- 0x1.f74a6d9a38383p-1,
- 0x1.f78c8b812d498p-1,
- 0x1.f7cceef15d631p-1,
- 0x1.f80ba18636f07p-1,
- 0x1.f848acb544e95p-1,
- 0x1.f88419ce4e184p-1,
- 0x1.f8bdf1fb78370p-1,
- 0x1.f8f63e416ebffp-1,
- 0x1.f92d077f8d56dp-1,
- 0x1.f96256700da8ep-1,
- 0x1.f99633a838a57p-1,
- 0x1.f9c8a7989af0dp-1,
- 0x1.f9f9ba8d3c733p-1,
- 0x1.fa2974addae45p-1,
- 0x1.fa57ddfe27376p-1,
- 0x1.fa84fe5e05c8dp-1,
- 0x1.fab0dd89d1309p-1,
- 0x1.fadb831a9f9c3p-1,
- 0x1.fb04f6868a944p-1,
- 0x1.fb2d3f20f9101p-1,
- 0x1.fb54641aebbc9p-1,
- 0x1.fb7a6c834b5a2p-1,
- 0x1.fb9f5f4739170p-1,
- 0x1.fbc3433260ca5p-1,
- 0x1.fbe61eef4cf6ap-1,
- 0x1.fc07f907bc794p-1,
- 0x1.fc28d7e4f9cd0p-1,
- 0x1.fc48c1d033c7ap-1,
- 0x1.fc67bcf2d7b8fp-1,
- 0x1.fc85cf56ecd38p-1,
- 0x1.fca2fee770c79p-1,
- 0x1.fcbf5170b578bp-1,
- 0x1.fcdacca0bfb73p-1,
- 0x1.fcf57607a6e7cp-1,
- 0x1.fd0f5317f582fp-1,
- 0x1.fd2869270a56fp-1,
- 0x1.fd40bd6d7a785p-1,
- 0x1.fd58550773cb5p-1,
- 0x1.fd6f34f52013ap-1,
- 0x1.fd85621b0876dp-1,
- 0x1.fd9ae142795e3p-1,
- 0x1.fdafb719e6a69p-1,
- 0x1.fdc3e835500b3p-1,
- 0x1.fdd7790ea5bc0p-1,
- 0x1.fdea6e062d0c9p-1,
- 0x1.fdfccb62e52d3p-1,
- 0x1.fe0e9552ebdd6p-1,
- 0x1.fe1fcfebe2083p-1,
- 0x1.fe307f2b503d0p-1,
- 0x1.fe40a6f70af4bp-1,
- 0x1.fe504b1d9696cp-1,
- 0x1.fe5f6f568b301p-1,
- 0x1.fe6e1742f7cf6p-1,
- 0x1.fe7c466dc57a1p-1,
- 0x1.fe8a004c19ae6p-1,
- 0x1.fe97483db8670p-1,
- 0x1.fea4218d6594ap-1,
- 0x1.feb08f7146046p-1,
- 0x1.febc950b3fa75p-1,
- 0x1.fec835695932ep-1,
- 0x1.fed37386190fbp-1,
- 0x1.fede5248e38f4p-1,
- 0x1.fee8d486585eep-1,
- 0x1.fef2fd00af31ap-1,
- 0x1.fefcce6813974p-1,
- 0x1.ff064b5afffbep-1,
- 0x1.ff0f766697c76p-1,
- 0x1.ff18520700971p-1,
- 0x1.ff20e0a7ba8c2p-1,
- 0x1.ff2924a3f7a83p-1,
- 0x1.ff312046f2339p-1,
- 0x1.ff38d5cc4227fp-1,
- 0x1.ff404760319b4p-1,
- 0x1.ff47772010262p-1,
- 0x1.ff4e671a85425p-1,
- 0x1.ff55194fe19dfp-1,
- 0x1.ff5b8fb26f5f6p-1,
- 0x1.ff61cc26c1578p-1,
- 0x1.ff67d08401202p-1,
- 0x1.ff6d9e943c231p-1,
- 0x1.ff733814af88cp-1,
- 0x1.ff789eb6130c9p-1,
- 0x1.ff7dd41ce2b4dp-1,
- 0x1.ff82d9e1a76d8p-1,
- 0x1.ff87b1913e853p-1,
- 0x1.ff8c5cad200a5p-1,
- 0x1.ff90dcaba4096p-1,
- 0x1.ff9532f846ab0p-1,
- 0x1.ff9960f3eb327p-1,
- 0x1.ff9d67f51ddbap-1,
- 0x1.ffa14948549a7p-1,
- 0x1.ffa506302ebaep-1,
- 0x1.ffa89fe5b3625p-1,
- 0x1.ffac17988ef4bp-1,
- 0x1.ffaf6e6f4f5c0p-1,
- 0x1.ffb2a5879f35ep-1,
- 0x1.ffb5bdf67fe6fp-1,
- 0x1.ffb8b8c88295fp-1,
- 0x1.ffbb970200110p-1,
- 0x1.ffbe599f4f9d9p-1,
- 0x1.ffc10194fcb64p-1,
- 0x1.ffc38fcffbb7cp-1,
- 0x1.ffc60535dd7f5p-1,
- 0x1.ffc862a501fd7p-1,
- 0x1.ffcaa8f4c9beap-1,
- 0x1.ffccd8f5c66d1p-1,
- 0x1.ffcef371ea4d7p-1,
- 0x1.ffd0f92cb6ba7p-1,
- 0x1.ffd2eae369a07p-1,
- 0x1.ffd4c94d29fdbp-1,
- 0x1.ffd6951b33686p-1,
- 0x1.ffd84ef9009eep-1,
- 0x1.ffd9f78c7524ap-1,
- 0x1.ffdb8f7605ee7p-1,
- 0x1.ffdd1750e1220p-1,
- 0x1.ffde8fb314ebfp-1,
- 0x1.ffdff92db56e5p-1,
- 0x1.ffe1544d01ccbp-1,
- 0x1.ffe2a1988857cp-1,
- 0x1.ffe3e19349dc7p-1,
- 0x1.ffe514bbdc197p-1,
- 0x1.ffe63b8c8b5f7p-1,
- 0x1.ffe7567b7b5e1p-1,
- 0x1.ffe865fac722bp-1,
- 0x1.ffe96a78a04a9p-1,
- 0x1.ffea645f6d6dap-1,
- 0x1.ffeb5415e7c44p-1,
- 0x1.ffec39ff380b9p-1,
- 0x1.ffed167b12ac2p-1,
- 0x1.ffede9e5d3262p-1,
- 0x1.ffeeb49896c6dp-1,
- 0x1.ffef76e956a9fp-1,
- 0x1.fff0312b010b5p-1,
- 0x1.fff0e3ad91ec2p-1,
- 0x1.fff18ebe2b0e1p-1,
- 0x1.fff232a72b48ep-1,
- 0x1.fff2cfb0453d9p-1,
- 0x1.fff3661e9569dp-1,
- 0x1.fff3f634b79f9p-1,
- 0x1.fff48032dbe40p-1,
- 0x1.fff50456dab8cp-1,
- 0x1.fff582dc48d30p-1,
- 0x1.fff5fbfc8a439p-1,
- 0x1.fff66feee5129p-1,
- 0x1.fff6dee89352ep-1,
- 0x1.fff7491cd4af6p-1,
- 0x1.fff7aebcff755p-1,
- 0x1.fff80ff8911fdp-1,
- 0x1.fff86cfd3e657p-1,
- 0x1.fff8c5f702ccfp-1,
- 0x1.fff91b102fca8p-1,
- 0x1.fff96c717b695p-1,
- 0x1.fff9ba420e834p-1,
- 0x1.fffa04a7928b1p-1,
- 0x1.fffa4bc63ee9ap-1,
- 0x1.fffa8fc0e5f33p-1,
- 0x1.fffad0b901755p-1,
- 0x1.fffb0ecebee1bp-1,
- 0x1.fffb4a210b172p-1,
- 0x1.fffb82cd9dcbfp-1,
- 0x1.fffbb8f1049c6p-1,
- 0x1.fffbeca6adbe9p-1,
- 0x1.fffc1e08f25f5p-1,
- 0x1.fffc4d3120aa1p-1,
- 0x1.fffc7a37857d2p-1,
- 0x1.fffca53375ce3p-1,
- 0x1.fffcce3b57bffp-1,
- 0x1.fffcf564ab6b7p-1,
- 0x1.fffd1ac4135f9p-1,
- 0x1.fffd3e6d5cd87p-1,
- 0x1.fffd607387b07p-1,
- 0x1.fffd80e8ce0dap-1,
- 0x1.fffd9fdeabccep-1,
- 0x1.fffdbd65e5ad0p-1,
- 0x1.fffdd98e903b2p-1,
- 0x1.fffdf46816833p-1,
- 0x1.fffe0e0140857p-1,
- 0x1.fffe26683972ap-1,
- 0x1.fffe3daa95b18p-1,
- 0x1.fffe53d558ae9p-1,
- 0x1.fffe68f4fa777p-1,
- 0x1.fffe7d156d244p-1,
- 0x1.fffe904222101p-1,
- 0x1.fffea2860ee1ep-1,
- 0x1.fffeb3ebb267bp-1,
- 0x1.fffec47d19457p-1,
- 0x1.fffed443e2787p-1,
- 0x1.fffee34943b15p-1,
- 0x1.fffef1960d85dp-1,
- 0x1.fffeff32af7afp-1,
- 0x1.ffff0c273bea2p-1,
- 0x1.ffff187b6bc0ep-1,
- 0x1.ffff2436a21dcp-1,
- 0x1.ffff2f5fefcaap-1,
- 0x1.ffff39fe16963p-1,
- 0x1.ffff44178c8d2p-1,
- 0x1.ffff4db27f146p-1,
- 0x1.ffff56d4d5e5ep-1,
- 0x1.ffff5f8435efcp-1,
- 0x1.ffff67c604180p-1,
- 0x1.ffff6f9f67e55p-1,
- 0x1.ffff77154e0d6p-1,
- 0x1.ffff7e2c6aea2p-1,
- 0x1.ffff84e93cd75p-1,
- 0x1.ffff8b500e77cp-1,
- 0x1.ffff9164f8e46p-1,
- 0x1.ffff972be5c59p-1,
- 0x1.ffff9ca891572p-1,
- 0x1.ffffa1de8c582p-1,
- 0x1.ffffa6d13de73p-1,
- 0x1.ffffab83e54b8p-1,
- 0x1.ffffaff99bac4p-1,
- 0x1.ffffb43555b5fp-1,
- 0x1.ffffb839e52f3p-1,
- 0x1.ffffbc09fa7cdp-1,
- 0x1.ffffbfa82616bp-1,
- 0x1.ffffc316d9ed0p-1,
- 0x1.ffffc6586abf6p-1,
- 0x1.ffffc96f1165ep-1,
- 0x1.ffffcc5cec0c1p-1,
- 0x1.ffffcf23ff5fcp-1,
- 0x1.ffffd1c637b2bp-1,
- 0x1.ffffd4456a10dp-1,
- 0x1.ffffd6a3554a1p-1,
- 0x1.ffffd8e1a2f22p-1,
- 0x1.ffffdb01e8546p-1,
- 0x1.ffffdd05a75eap-1,
- 0x1.ffffdeee4f810p-1,
- 0x1.ffffe0bd3e852p-1,
- 0x1.ffffe273c15b7p-1,
- 0x1.ffffe41314e06p-1,
- 0x1.ffffe59c6698bp-1,
- 0x1.ffffe710d565ep-1,
- 0x1.ffffe8717232dp-1,
- 0x1.ffffe9bf4098cp-1,
- 0x1.ffffeafb377d5p-1,
- 0x1.ffffec2641a9ep-1,
- 0x1.ffffed413e5b7p-1,
- 0x1.ffffee4d01cd6p-1,
- 0x1.ffffef4a55bd4p-1,
- 0x1.fffff039f9e8fp-1,
- 0x1.fffff11ca4876p-1,
- 0x1.fffff1f302bc1p-1,
- 0x1.fffff2bdb904dp-1,
- 0x1.fffff37d63a36p-1,
- 0x1.fffff43297019p-1,
- 0x1.fffff4dde0118p-1,
- 0x1.fffff57fc4a95p-1,
- 0x1.fffff618c3da6p-1,
- 0x1.fffff6a956450p-1,
- 0x1.fffff731ee681p-1,
- 0x1.fffff7b2f8ed6p-1,
- 0x1.fffff82cdcf1bp-1,
- 0x1.fffff89ffc4aap-1,
- 0x1.fffff90cb3c81p-1,
- 0x1.fffff9735b73bp-1,
- 0x1.fffff9d446cccp-1,
- 0x1.fffffa2fc5015p-1,
- 0x1.fffffa8621251p-1,
- 0x1.fffffad7a2652p-1,
- 0x1.fffffb248c39dp-1,
- 0x1.fffffb6d1e95dp-1,
- 0x1.fffffbb196132p-1,
- 0x1.fffffbf22c1e2p-1,
- 0x1.fffffc2f171e3p-1,
- 0x1.fffffc688a9cfp-1,
- 0x1.fffffc9eb76acp-1,
- 0x1.fffffcd1cbc28p-1,
- 0x1.fffffd01f36afp-1,
- 0x1.fffffd2f57d68p-1,
- 0x1.fffffd5a2041fp-1,
- 0x1.fffffd8271d12p-1,
- 0x1.fffffda86faa9p-1,
- 0x1.fffffdcc3b117p-1,
- 0x1.fffffdedf37edp-1,
- 0x1.fffffe0db6b91p-1,
- 0x1.fffffe2ba0ea5p-1,
- 0x1.fffffe47ccb60p-1,
- 0x1.fffffe62534d4p-1,
- 0x1.fffffe7b4c81ep-1,
- 0x1.fffffe92ced93p-1,
- 0x1.fffffea8ef9cfp-1,
- 0x1.fffffebdc2ec6p-1,
- 0x1.fffffed15bcbap-1,
- 0x1.fffffee3cc32cp-1,
- 0x1.fffffef5251c2p-1,
- 0x1.ffffff0576917p-1,
- 0x1.ffffff14cfb92p-1,
- 0x1.ffffff233ee1dp-1,
- 0x1.ffffff30d18e8p-1,
- 0x1.ffffff3d9480fp-1,
- 0x1.ffffff4993c46p-1,
- 0x1.ffffff54dab72p-1,
- 0x1.ffffff5f74141p-1,
- 0x1.ffffff6969fb8p-1,
- 0x1.ffffff72c5fb6p-1,
- 0x1.ffffff7b91176p-1,
- 0x1.ffffff83d3d07p-1,
- 0x1.ffffff8b962bep-1,
- 0x1.ffffff92dfba2p-1,
- 0x1.ffffff99b79d2p-1,
- 0x1.ffffffa0248e8p-1,
- 0x1.ffffffa62ce54p-1,
- 0x1.ffffffabd69b4p-1,
- 0x1.ffffffb127525p-1,
- 0x1.ffffffb624592p-1,
- 0x1.ffffffbad2affp-1,
- 0x1.ffffffbf370cdp-1,
- 0x1.ffffffc355dfdp-1,
- 0x1.ffffffc733572p-1,
- 0x1.ffffffcad3626p-1,
- 0x1.ffffffce39b67p-1,
- 0x1.ffffffd169d0cp-1,
- 0x1.ffffffd466fa5p-1,
- 0x1.ffffffd7344aap-1,
- 0x1.ffffffd9d4aabp-1,
- 0x1.ffffffdc4ad7ap-1,
- 0x1.ffffffde9964ep-1,
- 0x1.ffffffe0c2bf0p-1,
- 0x1.ffffffe2c92dbp-1,
- 0x1.ffffffe4aed5ep-1,
- 0x1.ffffffe675bbdp-1,
- 0x1.ffffffe81fc4ep-1,
- 0x1.ffffffe9aeb97p-1,
- 0x1.ffffffeb24467p-1,
- 0x1.ffffffec81ff2p-1,
- 0x1.ffffffedc95e7p-1,
- 0x1.ffffffeefbc85p-1,
- 0x1.fffffff01a8b6p-1,
- 0x1.fffffff126e1ep-1,
- 0x1.fffffff221f30p-1,
- 0x1.fffffff30cd3fp-1,
- 0x1.fffffff3e8892p-1,
- 0x1.fffffff4b606fp-1,
- 0x1.fffffff57632dp-1,
- 0x1.fffffff629e44p-1,
- 0x1.fffffff6d1e56p-1,
- 0x1.fffffff76ef3fp-1,
- 0x1.fffffff801c1fp-1,
- 0x1.fffffff88af67p-1,
- 0x1.fffffff90b2e3p-1,
- 0x1.fffffff982fc1p-1,
- 0x1.fffffff9f2e9fp-1,
- 0x1.fffffffa5b790p-1,
- 0x1.fffffffabd229p-1,
- 0x1.fffffffb18582p-1,
- 0x1.fffffffb6d844p-1,
- 0x1.fffffffbbd0aap-1,
- 0x1.fffffffc0748fp-1,
- 0x1.fffffffc4c96cp-1,
- 0x1.fffffffc8d462p-1,
- 0x1.fffffffcc9a41p-1,
- 0x1.fffffffd01f89p-1,
- 0x1.fffffffd36871p-1,
- 0x1.fffffffd678edp-1,
- 0x1.fffffffd954aep-1,
- 0x1.fffffffdbff2ap-1,
- 0x1.fffffffde7ba0p-1,
- 0x1.fffffffe0cd16p-1,
- 0x1.fffffffe2f664p-1,
- 0x1.fffffffe4fa30p-1,
- 0x1.fffffffe6daf7p-1,
- 0x1.fffffffe89b0cp-1,
- 0x1.fffffffea3c9ap-1,
- 0x1.fffffffebc1a9p-1,
- 0x1.fffffffed2c21p-1,
- 0x1.fffffffee7dc8p-1,
- 0x1.fffffffefb847p-1,
- 0x1.ffffffff0dd2bp-1,
- 0x1.ffffffff1ede9p-1,
- 0x1.ffffffff2ebdap-1,
- 0x1.ffffffff3d843p-1,
- 0x1.ffffffff4b453p-1,
- 0x1.ffffffff58126p-1,
- 0x1.ffffffff63fc3p-1,
- 0x1.ffffffff6f121p-1,
- 0x1.ffffffff79626p-1,
- 0x1.ffffffff82fabp-1,
- 0x1.ffffffff8be77p-1,
- 0x1.ffffffff94346p-1,
- 0x1.ffffffff9bec8p-1,
- 0x1.ffffffffa319fp-1,
- 0x1.ffffffffa9c63p-1,
- 0x1.ffffffffaffa4p-1,
- 0x1.ffffffffb5be5p-1,
- 0x1.ffffffffbb1a2p-1,
- 0x1.ffffffffc014ep-1,
- 0x1.ffffffffc4b56p-1,
- 0x1.ffffffffc901cp-1,
- 0x1.ffffffffccfffp-1,
- 0x1.ffffffffd0b56p-1,
- 0x1.ffffffffd4271p-1,
- 0x1.ffffffffd759dp-1,
- 0x1.ffffffffda520p-1,
- 0x1.ffffffffdd13cp-1,
- 0x1.ffffffffdfa2dp-1,
- 0x1.ffffffffe202dp-1,
- 0x1.ffffffffe4371p-1,
- 0x1.ffffffffe642ap-1,
- 0x1.ffffffffe8286p-1,
- 0x1.ffffffffe9eb0p-1,
- 0x1.ffffffffeb8d0p-1,
- 0x1.ffffffffed10ap-1,
- 0x1.ffffffffee782p-1,
- 0x1.ffffffffefc57p-1,
- 0x1.fffffffff0fa7p-1,
- 0x1.fffffffff218fp-1,
- 0x1.fffffffff3227p-1,
- 0x1.fffffffff4188p-1,
- 0x1.fffffffff4fc9p-1,
- 0x1.fffffffff5cfdp-1,
- 0x1.fffffffff6939p-1,
- 0x1.fffffffff748ep-1,
- 0x1.fffffffff7f0dp-1,
- 0x1.fffffffff88c5p-1,
- 0x1.fffffffff91c6p-1,
- 0x1.fffffffff9a1bp-1,
- 0x1.fffffffffa1d2p-1,
- 0x1.fffffffffa8f6p-1,
- 0x1.fffffffffaf92p-1,
- 0x1.fffffffffb5b0p-1,
- 0x1.fffffffffbb58p-1,
- 0x1.fffffffffc095p-1,
- 0x1.fffffffffc56dp-1,
- 0x1.fffffffffc9e8p-1,
- 0x1.fffffffffce0dp-1,
- 0x1.fffffffffd1e1p-1,
- 0x1.fffffffffd56cp-1,
- 0x1.fffffffffd8b3p-1,
- 0x1.fffffffffdbbap-1,
- 0x1.fffffffffde86p-1,
- 0x1.fffffffffe11dp-1,
- 0x1.fffffffffe380p-1,
- 0x1.fffffffffe5b6p-1,
- 0x1.fffffffffe7c0p-1,
- 0x1.fffffffffe9a2p-1,
- 0x1.fffffffffeb60p-1,
- 0x1.fffffffffecfbp-1,
- 0x1.fffffffffee77p-1,
- 0x1.fffffffffefd6p-1,
- 0x1.ffffffffff11ap-1,
- 0x1.ffffffffff245p-1,
- 0x1.ffffffffff359p-1,
- 0x1.ffffffffff457p-1,
- 0x1.ffffffffff542p-1,
- 0x1.ffffffffff61bp-1,
- 0x1.ffffffffff6e3p-1,
- 0x1.ffffffffff79bp-1,
- 0x1.ffffffffff845p-1,
- 0x1.ffffffffff8e2p-1,
- 0x1.ffffffffff973p-1,
- 0x1.ffffffffff9f8p-1,
- 0x1.ffffffffffa73p-1,
- 0x1.ffffffffffae4p-1,
- 0x1.ffffffffffb4cp-1,
- 0x1.ffffffffffbadp-1,
- 0x1.ffffffffffc05p-1,
- 0x1.ffffffffffc57p-1,
- 0x1.ffffffffffca2p-1,
- 0x1.ffffffffffce7p-1,
- 0x1.ffffffffffd27p-1,
- 0x1.ffffffffffd62p-1,
- 0x1.ffffffffffd98p-1,
- 0x1.ffffffffffdcap-1,
- 0x1.ffffffffffdf8p-1,
- 0x1.ffffffffffe22p-1,
- 0x1.ffffffffffe49p-1,
- 0x1.ffffffffffe6cp-1,
- 0x1.ffffffffffe8dp-1,
- 0x1.ffffffffffeabp-1,
- 0x1.ffffffffffec7p-1,
- 0x1.ffffffffffee1p-1,
- 0x1.ffffffffffef8p-1,
- 0x1.fffffffffff0ep-1,
- 0x1.fffffffffff22p-1,
- 0x1.fffffffffff34p-1,
- 0x1.fffffffffff45p-1,
- 0x1.fffffffffff54p-1,
- 0x1.fffffffffff62p-1,
- 0x1.fffffffffff6fp-1,
- 0x1.fffffffffff7bp-1,
- 0x1.fffffffffff86p-1,
- 0x1.fffffffffff90p-1,
- 0x1.fffffffffff9ap-1,
- 0x1.fffffffffffa2p-1,
- 0x1.fffffffffffaap-1,
- 0x1.fffffffffffb1p-1,
- 0x1.fffffffffffb8p-1,
- 0x1.fffffffffffbep-1,
- 0x1.fffffffffffc3p-1,
- 0x1.fffffffffffc8p-1,
- 0x1.fffffffffffcdp-1,
- 0x1.fffffffffffd1p-1,
- 0x1.fffffffffffd5p-1,
- 0x1.fffffffffffd9p-1,
- 0x1.fffffffffffdcp-1,
- 0x1.fffffffffffdfp-1,
- 0x1.fffffffffffe2p-1,
- 0x1.fffffffffffe4p-1,
- 0x1.fffffffffffe7p-1,
- 0x1.fffffffffffe9p-1,
- 0x1.fffffffffffebp-1,
- 0x1.fffffffffffedp-1,
- 0x1.fffffffffffeep-1,
- 0x1.ffffffffffff0p-1,
- 0x1.ffffffffffff1p-1,
- 0x1.ffffffffffff3p-1,
- 0x1.ffffffffffff4p-1,
- 0x1.ffffffffffff5p-1,
- 0x1.ffffffffffff6p-1,
- 0x1.ffffffffffff7p-1,
- 0x1.ffffffffffff7p-1,
- 0x1.ffffffffffff8p-1,
- 0x1.ffffffffffff9p-1,
- 0x1.ffffffffffff9p-1,
- 0x1.ffffffffffffap-1,
- 0x1.ffffffffffffbp-1,
- 0x1.ffffffffffffbp-1,
- 0x1.ffffffffffffbp-1,
- 0x1.ffffffffffffcp-1,
- 0x1.ffffffffffffcp-1,
- 0x1.ffffffffffffdp-1,
- 0x1.ffffffffffffdp-1,
- 0x1.ffffffffffffdp-1,
- 0x1.ffffffffffffdp-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- },
- .scale = { 0x1.20dd750429b6dp+0,
- 0x1.20d8f1975c85dp+0,
- 0x1.20cb67bd452c7p+0,
- 0x1.20b4d8bac36c1p+0,
- 0x1.209546ad13ccfp+0,
- 0x1.206cb4897b148p+0,
- 0x1.203b261cd0052p+0,
- 0x1.2000a00ae3804p+0,
- 0x1.1fbd27cdc72d3p+0,
- 0x1.1f70c3b4f2cc7p+0,
- 0x1.1f1b7ae44867fp+0,
- 0x1.1ebd5552f795bp+0,
- 0x1.1e565bca400d4p+0,
- 0x1.1de697e413d28p+0,
- 0x1.1d6e14099944ap+0,
- 0x1.1cecdb718d61cp+0,
- 0x1.1c62fa1e869b6p+0,
- 0x1.1bd07cdd189acp+0,
- 0x1.1b357141d95d5p+0,
- 0x1.1a91e5a748165p+0,
- 0x1.19e5e92b964abp+0,
- 0x1.19318bae53a04p+0,
- 0x1.1874ddcdfce24p+0,
- 0x1.17aff0e56ec10p+0,
- 0x1.16e2d7093cd8cp+0,
- 0x1.160da304ed92fp+0,
- 0x1.153068581b781p+0,
- 0x1.144b3b337c90cp+0,
- 0x1.135e3075d076bp+0,
- 0x1.12695da8b5bdep+0,
- 0x1.116cd8fd67618p+0,
- 0x1.1068b94962e5ep+0,
- 0x1.0f5d1602f7e41p+0,
- 0x1.0e4a073dc1b91p+0,
- 0x1.0d2fa5a70c168p+0,
- 0x1.0c0e0a8223359p+0,
- 0x1.0ae54fa490722p+0,
- 0x1.09b58f724416bp+0,
- 0x1.087ee4d9ad247p+0,
- 0x1.07416b4fbfe7cp+0,
- 0x1.05fd3ecbec297p+0,
- 0x1.04b27bc403d30p+0,
- 0x1.03613f2812dafp+0,
- 0x1.0209a65e29545p+0,
- 0x1.00abcf3e187a9p+0,
- 0x1.fe8fb01a47307p-1,
- 0x1.fbbbbef34b4b2p-1,
- 0x1.f8dc092d58ff8p-1,
- 0x1.f5f0cdaf15313p-1,
- 0x1.f2fa4c16c0019p-1,
- 0x1.eff8c4b1375dbp-1,
- 0x1.ecec7870ebca7p-1,
- 0x1.e9d5a8e4c934ep-1,
- 0x1.e6b4982f158b9p-1,
- 0x1.e38988fc46e72p-1,
- 0x1.e054be79d3042p-1,
- 0x1.dd167c4cf9d2ap-1,
- 0x1.d9cf06898cdafp-1,
- 0x1.d67ea1a8b5368p-1,
- 0x1.d325927fb9d89p-1,
- 0x1.cfc41e36c7df9p-1,
- 0x1.cc5a8a3fbea40p-1,
- 0x1.c8e91c4d01368p-1,
- 0x1.c5701a484ef9dp-1,
- 0x1.c1efca49a5011p-1,
- 0x1.be68728e29d5dp-1,
- 0x1.bada596f25436p-1,
- 0x1.b745c55905bf8p-1,
- 0x1.b3aafcc27502ep-1,
- 0x1.b00a46237d5bep-1,
- 0x1.ac63e7ecc1411p-1,
- 0x1.a8b8287ec6a09p-1,
- 0x1.a5074e2157620p-1,
- 0x1.a1519efaf889ep-1,
- 0x1.9d97610879642p-1,
- 0x1.99d8da149c13fp-1,
- 0x1.96164fafd8de3p-1,
- 0x1.925007283d7aap-1,
- 0x1.8e86458169af8p-1,
- 0x1.8ab94f6caa71dp-1,
- 0x1.86e9694134b9ep-1,
- 0x1.8316d6f48133dp-1,
- 0x1.7f41dc12c9e89p-1,
- 0x1.7b6abbb7aaf19p-1,
- 0x1.7791b886e7403p-1,
- 0x1.73b714a552763p-1,
- 0x1.6fdb11b1e0c34p-1,
- 0x1.6bfdf0beddaf5p-1,
- 0x1.681ff24b4ab04p-1,
- 0x1.6441563c665d4p-1,
- 0x1.60625bd75d07bp-1,
- 0x1.5c8341bb23767p-1,
- 0x1.58a445da7c74cp-1,
- 0x1.54c5a57629db0p-1,
- 0x1.50e79d1749ac9p-1,
- 0x1.4d0a6889dfd9fp-1,
- 0x1.492e42d78d2c5p-1,
- 0x1.4553664273d24p-1,
- 0x1.417a0c4049fd0p-1,
- 0x1.3da26d759aef5p-1,
- 0x1.39ccc1b136d5ap-1,
- 0x1.35f93fe7d1b3dp-1,
- 0x1.32281e2fd1a92p-1,
- 0x1.2e5991bd4cbfcp-1,
- 0x1.2a8dcede3673bp-1,
- 0x1.26c508f6bd0ffp-1,
- 0x1.22ff727dd6f7bp-1,
- 0x1.1f3d3cf9ffe5ap-1,
- 0x1.1b7e98fe26217p-1,
- 0x1.17c3b626c7a11p-1,
- 0x1.140cc3173f007p-1,
- 0x1.1059ed7740313p-1,
- 0x1.0cab61f084b93p-1,
- 0x1.09014c2ca74dap-1,
- 0x1.055bd6d32e8d7p-1,
- 0x1.01bb2b87c6968p-1,
- 0x1.fc3ee5d1524b0p-2,
- 0x1.f511a91a67d2ap-2,
- 0x1.edeeee0959518p-2,
- 0x1.e6d6ffaa65a25p-2,
- 0x1.dfca26f5bbf88p-2,
- 0x1.d8c8aace11e63p-2,
- 0x1.d1d2cfff91594p-2,
- 0x1.cae8d93f1d7b6p-2,
- 0x1.c40b0729ed547p-2,
- 0x1.bd3998457afdap-2,
- 0x1.b674c8ffc6283p-2,
- 0x1.afbcd3afe8ab6p-2,
- 0x1.a911f096fbc26p-2,
- 0x1.a27455e14c93cp-2,
- 0x1.9be437a7de946p-2,
- 0x1.9561c7f23a47bp-2,
- 0x1.8eed36b886d93p-2,
- 0x1.8886b1e5ecfd1p-2,
- 0x1.822e655b417e6p-2,
- 0x1.7be47af1f5d89p-2,
- 0x1.75a91a7f4d2edp-2,
- 0x1.6f7c69d7d3ef8p-2,
- 0x1.695e8cd31867ep-2,
- 0x1.634fa54fa285fp-2,
- 0x1.5d4fd33729015p-2,
- 0x1.575f3483021c3p-2,
- 0x1.517de540ce2a3p-2,
- 0x1.4babff975a04cp-2,
- 0x1.45e99bcbb7915p-2,
- 0x1.4036d0468a7a2p-2,
- 0x1.3a93b1998736cp-2,
- 0x1.35005285227f1p-2,
- 0x1.2f7cc3fe6f423p-2,
- 0x1.2a09153529381p-2,
- 0x1.24a55399ea239p-2,
- 0x1.1f518ae487dc8p-2,
- 0x1.1a0dc51a9934dp-2,
- 0x1.14da0a961fd14p-2,
- 0x1.0fb6620c550afp-2,
- 0x1.0aa2d09497f2bp-2,
- 0x1.059f59af7a906p-2,
- 0x1.00abff4dec7a3p-2,
- 0x1.f79183b101c5bp-3,
- 0x1.edeb406d9c824p-3,
- 0x1.e4652fadcb6b2p-3,
- 0x1.daff4969c0b04p-3,
- 0x1.d1b982c501370p-3,
- 0x1.c893ce1dcbef7p-3,
- 0x1.bf8e1b1ca2279p-3,
- 0x1.b6a856c3ed54fp-3,
- 0x1.ade26b7fbed95p-3,
- 0x1.a53c4135a6526p-3,
- 0x1.9cb5bd549b111p-3,
- 0x1.944ec2e4f5630p-3,
- 0x1.8c07329874652p-3,
- 0x1.83deeada4d25ap-3,
- 0x1.7bd5c7df3fe9cp-3,
- 0x1.73eba3b5b07b7p-3,
- 0x1.6c205655be71fp-3,
- 0x1.6473b5b15a7a1p-3,
- 0x1.5ce595c455b0ap-3,
- 0x1.5575c8a468361p-3,
- 0x1.4e241e912c305p-3,
- 0x1.46f066040a832p-3,
- 0x1.3fda6bc016994p-3,
- 0x1.38e1fae1d6a9dp-3,
- 0x1.3206dceef5f87p-3,
- 0x1.2b48d9e5dea1cp-3,
- 0x1.24a7b84d38971p-3,
- 0x1.1e233d434b813p-3,
- 0x1.17bb2c8d41535p-3,
- 0x1.116f48a6476ccp-3,
- 0x1.0b3f52ce8c383p-3,
- 0x1.052b0b1a174eap-3,
- 0x1.fe6460fef4680p-4,
- 0x1.f2a901ccafb37p-4,
- 0x1.e723726b824a9p-4,
- 0x1.dbd32ac4c99b0p-4,
- 0x1.d0b7a0f921e7cp-4,
- 0x1.c5d0497c09e74p-4,
- 0x1.bb1c972f23e50p-4,
- 0x1.b09bfb7d11a83p-4,
- 0x1.a64de673e8837p-4,
- 0x1.9c31c6df3b1b8p-4,
- 0x1.92470a61b6965p-4,
- 0x1.888d1d8e510a3p-4,
- 0x1.7f036c0107294p-4,
- 0x1.75a96077274bap-4,
- 0x1.6c7e64e7281cbp-4,
- 0x1.6381e2980956bp-4,
- 0x1.5ab342383d177p-4,
- 0x1.5211ebf41880bp-4,
- 0x1.499d478bca735p-4,
- 0x1.4154bc68d75c3p-4,
- 0x1.3937b1b319259p-4,
- 0x1.31458e6542847p-4,
- 0x1.297db960e4f63p-4,
- 0x1.21df9981f8e53p-4,
- 0x1.1a6a95b1e786fp-4,
- 0x1.131e14fa1625dp-4,
- 0x1.0bf97e95f2a64p-4,
- 0x1.04fc3a0481321p-4,
- 0x1.fc4b5e32d6259p-5,
- 0x1.eeea8c1b1db93p-5,
- 0x1.e1d4cf1e2450ap-5,
- 0x1.d508f9a1ea64ep-5,
- 0x1.c885df3451a07p-5,
- 0x1.bc4a54a84e834p-5,
- 0x1.b055303221015p-5,
- 0x1.a4a549829587ep-5,
- 0x1.993979e14fffdp-5,
- 0x1.8e109c4622913p-5,
- 0x1.83298d717210ep-5,
- 0x1.78832c03aa2b1p-5,
- 0x1.6e1c5893c380bp-5,
- 0x1.63f3f5c4de13bp-5,
- 0x1.5a08e85af27e0p-5,
- 0x1.505a174e9c929p-5,
- 0x1.46e66be002240p-5,
- 0x1.3dacd1a8d8ccdp-5,
- 0x1.34ac36ad8dafep-5,
- 0x1.2be38b6d92415p-5,
- 0x1.2351c2f2d1449p-5,
- 0x1.1af5d2e04f3f6p-5,
- 0x1.12ceb37ff9bc3p-5,
- 0x1.0adb5fcfa8c75p-5,
- 0x1.031ad58d56279p-5,
- 0x1.f7182a851bca2p-6,
- 0x1.e85c449e377f2p-6,
- 0x1.da0005e5f28dfp-6,
- 0x1.cc0180af00a8bp-6,
- 0x1.be5ecd2fcb5f9p-6,
- 0x1.b1160991ff737p-6,
- 0x1.a4255a00b9f03p-6,
- 0x1.978ae8b55ce1bp-6,
- 0x1.8b44e6031383ep-6,
- 0x1.7f5188610ddc8p-6,
- 0x1.73af0c737bb45p-6,
- 0x1.685bb5134ef13p-6,
- 0x1.5d55cb54cd53ap-6,
- 0x1.529b9e8cf9a1ep-6,
- 0x1.482b8455dc491p-6,
- 0x1.3e03d891b37dep-6,
- 0x1.3422fd6d12e2bp-6,
- 0x1.2a875b5ffab56p-6,
- 0x1.212f612dee7fbp-6,
- 0x1.181983e5133ddp-6,
- 0x1.0f443edc5ce49p-6,
- 0x1.06ae13b0d3255p-6,
- 0x1.fcab1483ea7fcp-7,
- 0x1.ec72615a894c4p-7,
- 0x1.dcaf3691fc448p-7,
- 0x1.cd5ec93c12431p-7,
- 0x1.be7e5ac24963bp-7,
- 0x1.b00b38d6b3575p-7,
- 0x1.a202bd6372dcep-7,
- 0x1.94624e78e0fafp-7,
- 0x1.87275e3a6869dp-7,
- 0x1.7a4f6aca256cbp-7,
- 0x1.6dd7fe3358230p-7,
- 0x1.61beae53b72b7p-7,
- 0x1.56011cc3b036dp-7,
- 0x1.4a9cf6bda3f4cp-7,
- 0x1.3f8ff5042a88ep-7,
- 0x1.34d7dbc76d7e5p-7,
- 0x1.2a727a89a3f14p-7,
- 0x1.205dac02bd6b9p-7,
- 0x1.1697560347b25p-7,
- 0x1.0d1d69569b82dp-7,
- 0x1.03ede1a45bfeep-7,
- 0x1.f60d8aa2a88f2p-8,
- 0x1.e4cc4abf7d065p-8,
- 0x1.d4143a9dfe965p-8,
- 0x1.c3e1a5f5c077cp-8,
- 0x1.b430ecf4a83a8p-8,
- 0x1.a4fe83fb9db25p-8,
- 0x1.9646f35a76623p-8,
- 0x1.8806d70b2fc36p-8,
- 0x1.7a3ade6c8b3e4p-8,
- 0x1.6cdfcbfc1e263p-8,
- 0x1.5ff2750fe7820p-8,
- 0x1.536fc18f7ce5cp-8,
- 0x1.4754abacdf1dcp-8,
- 0x1.3b9e3f9d06e3fp-8,
- 0x1.30499b503957fp-8,
- 0x1.2553ee2a336bfp-8,
- 0x1.1aba78ba3af89p-8,
- 0x1.107a8c7323a6ep-8,
- 0x1.06918b6355624p-8,
- 0x1.f9f9cfd9c3035p-9,
- 0x1.e77448fb66bb9p-9,
- 0x1.d58da68fd1170p-9,
- 0x1.c4412bf4b8f0bp-9,
- 0x1.b38a3af2e55b4p-9,
- 0x1.a3645330550ffp-9,
- 0x1.93cb11a30d765p-9,
- 0x1.84ba3004a50d0p-9,
- 0x1.762d84469c18fp-9,
- 0x1.6821000795a03p-9,
- 0x1.5a90b00981d93p-9,
- 0x1.4d78bba8ca5fdp-9,
- 0x1.40d564548fad7p-9,
- 0x1.34a305080681fp-9,
- 0x1.28de11c5031ebp-9,
- 0x1.1d83170fbf6fbp-9,
- 0x1.128eb96be8798p-9,
- 0x1.07fdb4dafea5fp-9,
- 0x1.fb99b8b8279e1p-10,
- 0x1.e7f232d9e2630p-10,
- 0x1.d4fed7195d7e8p-10,
- 0x1.c2b9cf7f893bfp-10,
- 0x1.b11d702b3deb1p-10,
- 0x1.a024365f771bdp-10,
- 0x1.8fc8c794b03b5p-10,
- 0x1.8005f08d6f1efp-10,
- 0x1.70d6a46e07ddap-10,
- 0x1.6235fbd7a4345p-10,
- 0x1.541f340697987p-10,
- 0x1.468dadf4080abp-10,
- 0x1.397ced7af2b15p-10,
- 0x1.2ce898809244ep-10,
- 0x1.20cc76202c5fap-10,
- 0x1.15246dda49d47p-10,
- 0x1.09ec86c75d497p-10,
- 0x1.fe41cd9bb4eeep-11,
- 0x1.e97ba3b77f306p-11,
- 0x1.d57f524723822p-11,
- 0x1.c245d4b998479p-11,
- 0x1.afc85e0f82e12p-11,
- 0x1.9e005769dbc1dp-11,
- 0x1.8ce75e9f6f8a0p-11,
- 0x1.7c7744d9378f7p-11,
- 0x1.6caa0d3582fe9p-11,
- 0x1.5d79eb71e893bp-11,
- 0x1.4ee1429bf7cc0p-11,
- 0x1.40daa3c89f5b6p-11,
- 0x1.3360ccd23db3ap-11,
- 0x1.266ea71d4f71ap-11,
- 0x1.19ff4663ae9dfp-11,
- 0x1.0e0de78654d1ep-11,
- 0x1.0295ef6591848p-11,
- 0x1.ef25d37f49fe1p-12,
- 0x1.da01102b5f851p-12,
- 0x1.c5b5412dcafadp-12,
- 0x1.b23a5a23e4210p-12,
- 0x1.9f8893d8fd1c1p-12,
- 0x1.8d986a4187285p-12,
- 0x1.7c629a822bc9ep-12,
- 0x1.6be02102b3520p-12,
- 0x1.5c0a378c90bcap-12,
- 0x1.4cda5374ea275p-12,
- 0x1.3e4a23d1f4702p-12,
- 0x1.30538fbb77ecdp-12,
- 0x1.22f0b496539bdp-12,
- 0x1.161be46ad3b50p-12,
- 0x1.09cfa445b00ffp-12,
- 0x1.fc0d55470cf51p-13,
- 0x1.e577bbcd49935p-13,
- 0x1.cfd4a5adec5bfp-13,
- 0x1.bb1a9657ce465p-13,
- 0x1.a740684026555p-13,
- 0x1.943d4a1d1ed39p-13,
- 0x1.8208bc334a6a5p-13,
- 0x1.709a8db59f25cp-13,
- 0x1.5feada379d8b7p-13,
- 0x1.4ff207314a102p-13,
- 0x1.40a8c1949f75ep-13,
- 0x1.3207fb7420eb9p-13,
- 0x1.2408e9ba3327fp-13,
- 0x1.16a501f0e42cap-13,
- 0x1.09d5f819c9e29p-13,
- 0x1.fb2b792b40a22p-14,
- 0x1.e3bcf436a1a95p-14,
- 0x1.cd55277c18d05p-14,
- 0x1.b7e94604479dcp-14,
- 0x1.a36eec00926ddp-14,
- 0x1.8fdc1b2dcf7b9p-14,
- 0x1.7d2737527c3f9p-14,
- 0x1.6b4702d7d5849p-14,
- 0x1.5a329b7d30748p-14,
- 0x1.49e17724f4d41p-14,
- 0x1.3a4b60ba9aa4dp-14,
- 0x1.2b6875310f785p-14,
- 0x1.1d312098e9dbap-14,
- 0x1.0f9e1b4dd36dfp-14,
- 0x1.02a8673a94691p-14,
- 0x1.ec929a665b449p-15,
- 0x1.d4f4b4c8e09edp-15,
- 0x1.be6abbb10a5aap-15,
- 0x1.a8e8cc1fadef6p-15,
- 0x1.94637d5bacfdbp-15,
- 0x1.80cfdc72220cfp-15,
- 0x1.6e2367dc27f95p-15,
- 0x1.5c540b4936fd2p-15,
- 0x1.4b581b8d170fcp-15,
- 0x1.3b2652b06c2b2p-15,
- 0x1.2bb5cc22e5db6p-15,
- 0x1.1cfe010e2052dp-15,
- 0x1.0ef6c4c84a0fep-15,
- 0x1.01984165a5f36p-15,
- 0x1.e9b5e8d00ce76p-16,
- 0x1.d16f5716c6c1ap-16,
- 0x1.ba4f035d60e02p-16,
- 0x1.a447b7b03f045p-16,
- 0x1.8f4ccca7fc90dp-16,
- 0x1.7b5223dac7336p-16,
- 0x1.684c227fcacefp-16,
- 0x1.562fac4329b48p-16,
- 0x1.44f21e49054f2p-16,
- 0x1.34894a5e24657p-16,
- 0x1.24eb7254ccf83p-16,
- 0x1.160f438c70913p-16,
- 0x1.07ebd2a2d2844p-16,
- 0x1.f4f12e9ab070ap-17,
- 0x1.db5ad0b27805cp-17,
- 0x1.c304efa2c6f4ep-17,
- 0x1.abe09e9144b5ep-17,
- 0x1.95df988e76644p-17,
- 0x1.80f439b4ee04bp-17,
- 0x1.6d11788a69c64p-17,
- 0x1.5a2adfa0b4bc4p-17,
- 0x1.4834877429b8fp-17,
- 0x1.37231085c7d9ap-17,
- 0x1.26eb9daed6f7ep-17,
- 0x1.1783ceac28910p-17,
- 0x1.08e1badf0fcedp-17,
- 0x1.f5f7d88472604p-18,
- 0x1.db92b5212fb8dp-18,
- 0x1.c282cd3957edap-18,
- 0x1.aab7abace48dcp-18,
- 0x1.94219bfcb4928p-18,
- 0x1.7eb1a2075864dp-18,
- 0x1.6a597219a93d9p-18,
- 0x1.570b69502f313p-18,
- 0x1.44ba864670882p-18,
- 0x1.335a62115bce2p-18,
- 0x1.22df298214423p-18,
- 0x1.133d96ae7e0ddp-18,
- 0x1.046aeabcfcdecp-18,
- 0x1.ecb9cfe1d8642p-19,
- 0x1.d21397ead99cbp-19,
- 0x1.b8d094c86d374p-19,
- 0x1.a0df0f0c626dcp-19,
- 0x1.8a2e269750a39p-19,
- 0x1.74adc8f4064d3p-19,
- 0x1.604ea819f007cp-19,
- 0x1.4d0231928c6f9p-19,
- 0x1.3aba85fe22e1fp-19,
- 0x1.296a70f414053p-19,
- 0x1.1905613b3abf2p-19,
- 0x1.097f6156f32c5p-19,
- 0x1.f59a20caf6695p-20,
- 0x1.d9c73698fb1dcp-20,
- 0x1.bf716c6168baep-20,
- 0x1.a6852c6b58392p-20,
- 0x1.8eefd70594a88p-20,
- 0x1.789fb715aae95p-20,
- 0x1.6383f726a8e04p-20,
- 0x1.4f8c96f26a26ap-20,
- 0x1.3caa61607f920p-20,
- 0x1.2acee2f5ecdb8p-20,
- 0x1.19ec60b1242edp-20,
- 0x1.09f5cf4dd2877p-20,
- 0x1.f5bd95d8730d8p-21,
- 0x1.d9371e2ff7c35p-21,
- 0x1.be41de54d155ap-21,
- 0x1.a4c89e08ef4f3p-21,
- 0x1.8cb738399b12cp-21,
- 0x1.75fa8dbc84becp-21,
- 0x1.608078a70dcbcp-21,
- 0x1.4c37c0394d094p-21,
- 0x1.39100d5687bfep-21,
- 0x1.26f9df8519bd6p-21,
- 0x1.15e6827001f18p-21,
- 0x1.05c803e4831c1p-21,
- 0x1.ed22548cffd35p-22,
- 0x1.d06ad6ecdf971p-22,
- 0x1.b551c847fbc96p-22,
- 0x1.9bc09f112b494p-22,
- 0x1.83a1ff0aa239dp-22,
- 0x1.6ce1aa3fd7bddp-22,
- 0x1.576c72b514859p-22,
- 0x1.43302cc4a0da8p-22,
- 0x1.301ba221dc9bbp-22,
- 0x1.1e1e857adc568p-22,
- 0x1.0d2966b1746f7p-22,
- 0x1.fa5b4f49cc6b2p-23,
- 0x1.dc3ae30b55c16p-23,
- 0x1.bfd7555a3bd68p-23,
- 0x1.a517d9e61628ap-23,
- 0x1.8be4f8f6c951fp-23,
- 0x1.74287ded49339p-23,
- 0x1.5dcd669f2cd34p-23,
- 0x1.48bfd38302870p-23,
- 0x1.34ecf8a3c124ap-23,
- 0x1.22430f521cbcfp-23,
- 0x1.10b1488aeb235p-23,
- 0x1.0027c00a263a6p-23,
- 0x1.e12ee004efc37p-24,
- 0x1.c3e44ae32b16bp-24,
- 0x1.a854ea14102a8p-24,
- 0x1.8e6761569f45dp-24,
- 0x1.7603bac345f65p-24,
- 0x1.5f1353cdad001p-24,
- 0x1.4980cb3c80949p-24,
- 0x1.3537f00b6ad4dp-24,
- 0x1.2225b12bffc68p-24,
- 0x1.10380e1adb7e9p-24,
- 0x1.febc107d5efaap-25,
- 0x1.df0f2a0ee6946p-25,
- 0x1.c14b2188bcee4p-25,
- 0x1.a553644f7f07dp-25,
- 0x1.8b0cfce0579dfp-25,
- 0x1.725e7c5dd20f7p-25,
- 0x1.5b2fe547a1340p-25,
- 0x1.456a974e92e93p-25,
- 0x1.30f93c3699078p-25,
- 0x1.1dc7b5b978cf8p-25,
- 0x1.0bc30c5d52f15p-25,
- 0x1.f5b2be65a0c7fp-26,
- 0x1.d5f3a8dea7357p-26,
- 0x1.b82915b03515bp-26,
- 0x1.9c3517e789488p-26,
- 0x1.81fb7df06136ep-26,
- 0x1.6961b8d641d06p-26,
- 0x1.524ec4d916caep-26,
- 0x1.3cab1343d18d1p-26,
- 0x1.2860757487a01p-26,
- 0x1.155a09065d4f7p-26,
- 0x1.0384250e4c9fcp-26,
- 0x1.e59890b926c78p-27,
- 0x1.c642116a8a9e3p-27,
- 0x1.a8e405e651ab6p-27,
- 0x1.8d5f98114f872p-27,
- 0x1.7397c5a66e307p-27,
- 0x1.5b71456c5a4c4p-27,
- 0x1.44d26de513197p-27,
- 0x1.2fa31d6371537p-27,
- 0x1.1bcca373b7b43p-27,
- 0x1.0939ab853339fp-27,
- 0x1.efac5187b2863p-28,
- 0x1.cf1e86235d0e6p-28,
- 0x1.b0a68a2128babp-28,
- 0x1.9423165bc4444p-28,
- 0x1.7974e743dea3cp-28,
- 0x1.607e9eacd1050p-28,
- 0x1.4924a74dec728p-28,
- 0x1.334d19e0c2160p-28,
- 0x1.1edfa3c5f5ccap-28,
- 0x1.0bc56f1b54701p-28,
- 0x1.f3d2185e047d9p-29,
- 0x1.d26cb87945e87p-29,
- 0x1.b334fac4b9f99p-29,
- 0x1.96076f7918d1cp-29,
- 0x1.7ac2d72fc2c63p-29,
- 0x1.614801550319ep-29,
- 0x1.4979ac8b28926p-29,
- 0x1.333c68e2d0548p-29,
- 0x1.1e767bce37dd7p-29,
- 0x1.0b0fc5b6d05a0p-29,
- 0x1.f1e3523b41d7dp-30,
- 0x1.d00de6608effep-30,
- 0x1.b0778b7b3301ap-30,
- 0x1.92fb04ec0f6cfp-30,
- 0x1.77756ec9f78fap-30,
- 0x1.5dc61922d5a06p-30,
- 0x1.45ce65699ff6dp-30,
- 0x1.2f71a5f159970p-30,
- 0x1.1a94ff571654fp-30,
- 0x1.071f4bbea09ecp-30,
- 0x1.e9f1ff8ddd774p-31,
- 0x1.c818223a202c7p-31,
- 0x1.a887bd2b4404dp-31,
- 0x1.8b1a336c5eb6bp-31,
- 0x1.6fab63324088ap-31,
- 0x1.56197e30205bap-31,
- 0x1.3e44e45301b92p-31,
- 0x1.281000bfe4c3fp-31,
- 0x1.135f28f2d50b4p-31,
- 0x1.00187dded5975p-31,
- 0x1.dc479de0ef001p-32,
- 0x1.bad4fdad3caa1p-32,
- 0x1.9baed3ed27ab8p-32,
- 0x1.7ead9ce4285bbp-32,
- 0x1.63ac6b4edc88ep-32,
- 0x1.4a88be2a6390cp-32,
- 0x1.332259185f1a0p-32,
- 0x1.1d5b1f3793044p-32,
- 0x1.0916f04b6e18bp-32,
- 0x1.ec77101de6926p-33,
- 0x1.c960bf23153e0p-33,
- 0x1.a8bd20fc65ef7p-33,
- 0x1.8a61745ec7d1dp-33,
- 0x1.6e25d0e756261p-33,
- 0x1.53e4f7d1666cbp-33,
- 0x1.3b7c27a7ddb0ep-33,
- 0x1.24caf2c32af14p-33,
- 0x1.0fb3186804d0fp-33,
- 0x1.f830c0bb41fd7p-34,
- 0x1.d3c0f1a91c846p-34,
- 0x1.b1e5acf351d87p-34,
- 0x1.92712d259ce66p-34,
- 0x1.7538c60a04476p-34,
- 0x1.5a14b04b47879p-34,
- 0x1.40dfd87456f4cp-34,
- 0x1.2977b1172b9d5p-34,
- 0x1.13bc07e891491p-34,
- 0x1.ff1dbb4300811p-35,
- 0x1.d9a880f306bd8p-35,
- 0x1.b6e45220b55e0p-35,
- 0x1.96a0b33f2c4dap-35,
- 0x1.78b07e9e924acp-35,
- 0x1.5ce9ab1670dd2p-35,
- 0x1.4325167006bb0p-35,
- 0x1.2b3e53538ff3fp-35,
- 0x1.15137a7f44864p-35,
- 0x1.0084ff125639dp-35,
- 0x1.daeb0b7311ec7p-36,
- 0x1.b7937d1c40c52p-36,
- 0x1.96d082f59ab06p-36,
- 0x1.7872d9fa10aadp-36,
- 0x1.5c4e8e37bc7d0p-36,
- 0x1.423ac0df49a40p-36,
- 0x1.2a117230ad284p-36,
- 0x1.13af4f04f9998p-36,
- 0x1.fde703724e560p-37,
- 0x1.d77f0c82e7641p-37,
- 0x1.b3ee02611d7ddp-37,
- 0x1.92ff33023d5bdp-37,
- 0x1.7481a9e69f53fp-37,
- 0x1.5847eda620959p-37,
- 0x1.3e27c1fcc74bdp-37,
- 0x1.25f9ee0b923dcp-37,
- 0x1.0f9a0686531ffp-37,
- 0x1.f5cc7718082afp-38,
- 0x1.cf7e53d6a2ca5p-38,
- 0x1.ac0f5f3229372p-38,
- 0x1.8b498644847eap-38,
- 0x1.6cfa9bcca59dcp-38,
- 0x1.50f411d4fd2cdp-38,
- 0x1.370ab8327af5ep-38,
- 0x1.1f167f88c6b6ep-38,
- 0x1.08f24085d4597p-38,
- 0x1.e8f70e181d619p-39,
- 0x1.c324c20e337dcp-39,
- 0x1.a03261574b54ep-39,
- 0x1.7fe903cdf5855p-39,
- 0x1.6215c58da3450p-39,
- 0x1.46897d4b69fc6p-39,
- 0x1.2d1877d731b7bp-39,
- 0x1.159a386b11517p-39,
- 0x1.ffd27ae9393cep-40,
- 0x1.d7c593130dd0bp-40,
- 0x1.b2cd607c79bcfp-40,
- 0x1.90ae4d3405651p-40,
- 0x1.71312dd1759e2p-40,
- 0x1.5422ef5d8949dp-40,
- 0x1.39544b0ecc957p-40,
- 0x1.20997f73e73ddp-40,
- 0x1.09ca0eaacd277p-40,
- 0x1.e9810295890ecp-41,
- 0x1.c2b45b5aa4a1dp-41,
- 0x1.9eee068fa7596p-41,
- 0x1.7df2b399c10a8p-41,
- 0x1.5f8b87a31bd85p-41,
- 0x1.4385c96e9a2d9p-41,
- 0x1.29b2933ef4cbcp-41,
- 0x1.11e68a6378f8ap-41,
- 0x1.f7f338086a86bp-42,
- 0x1.cf8d7d9ce040ap-42,
- 0x1.aa577251ae484p-42,
- 0x1.8811d739efb5ep-42,
- 0x1.68823e52970bep-42,
- 0x1.4b72ae68e8b4cp-42,
- 0x1.30b14dbe876bcp-42,
- 0x1.181012ef86610p-42,
- 0x1.01647ba798744p-42,
- 0x1.d90e917701675p-43,
- 0x1.b2a87e86d0c8ap-43,
- 0x1.8f53dcb377293p-43,
- 0x1.6ed2f2515e933p-43,
- 0x1.50ecc9ed47f19p-43,
- 0x1.356cd5ce7799ep-43,
- 0x1.1c229a587ab78p-43,
- 0x1.04e15ecc7f3f6p-43,
- 0x1.deffc7e6a6017p-44,
- 0x1.b7b040832f310p-44,
- 0x1.938e021f36d76p-44,
- 0x1.7258610b3b233p-44,
- 0x1.53d3bfc82a909p-44,
- 0x1.37c92babdc2fdp-44,
- 0x1.1e06010120f6ap-44,
- 0x1.065b9616170d4p-44,
- 0x1.e13dd96b3753ap-45,
- 0x1.b950d32467392p-45,
- 0x1.94a72263259a5p-45,
- 0x1.72fd93e036cdcp-45,
- 0x1.54164576929abp-45,
- 0x1.37b83c521fe96p-45,
- 0x1.1daf033182e96p-45,
- 0x1.05ca50205d26ap-45,
- 0x1.dfbb6235639fap-46,
- 0x1.b7807e294781fp-46,
- 0x1.9298add70a734p-46,
- 0x1.70beaf9c7ffb6p-46,
- 0x1.51b2cd6709222p-46,
- 0x1.353a6cf7f7fffp-46,
- 0x1.1b1fa8cbe84a7p-46,
- 0x1.0330f0fd69921p-46,
- 0x1.da81670f96f9bp-47,
- 0x1.b24a16b4d09aap-47,
- 0x1.8d6eeb6efdbd6p-47,
- 0x1.6ba91ac734785p-47,
- 0x1.4cb7966770ab5p-47,
- 0x1.305e9721d0981p-47,
- 0x1.1667311fff70ap-47,
- 0x1.fd3de10d62855p-48,
- 0x1.d1aefbcd48d0cp-48,
- 0x1.a9cc93c25aca9p-48,
- 0x1.85487ee3ea735p-48,
- 0x1.63daf8b4b1e0cp-48,
- 0x1.45421e69a6ca1p-48,
- 0x1.294175802d99ap-48,
- 0x1.0fa17bf41068fp-48,
- 0x1.f05e82aae2bb9p-49,
- 0x1.c578101b29058p-49,
- 0x1.9e39dc5dd2f7cp-49,
- 0x1.7a553a728bbf2p-49,
- 0x1.5982008db1304p-49,
- 0x1.3b7e00422e51bp-49,
- 0x1.200c898d9ee3ep-49,
- 0x1.06f5f7eb65a56p-49,
- 0x1.e00e9148a1d25p-50,
- 0x1.b623734024e92p-50,
- 0x1.8fd4e01891bf8p-50,
- 0x1.6cd44c7470d89p-50,
- 0x1.4cd9c04158cd7p-50,
- 0x1.2fa34bf5c8344p-50,
- 0x1.14f4890ff2461p-50,
- 0x1.f92c49dfa4df5p-51,
- 0x1.ccaaea71ab0dfp-51,
- 0x1.a40829f001197p-51,
- 0x1.7eef13b59e96cp-51,
- 0x1.5d11e1a252bf5p-51,
- 0x1.3e296303b2297p-51,
- 0x1.21f47009f43cep-51,
- 0x1.083768c5e4541p-51,
- 0x1.e1777d831265ep-52,
- 0x1.b69f10b0191b5p-52,
- 0x1.8f8a3a05b5b52p-52,
- 0x1.6be573c40c8e7p-52,
- 0x1.4b645ba991fdbp-52,
- 0x1.2dc119095729fp-52,
- },
-};
diff --git a/pl/math/sv_erff_data.c b/pl/math/sv_erff_data.c
deleted file mode 100644
index 154d3c188874..000000000000
--- a/pl/math/sv_erff_data.c
+++ /dev/null
@@ -1,1046 +0,0 @@
-/*
- * Data for approximation of vector erff.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Lookup table used in SVE erff.
- For each possible rounded input r (multiples of 1/128), between
- r = 0.0 and r = 4.0 (513 values):
- - __erff_data.erf contains the values of erf(r),
- - __erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2).
- Note that indices 0 and 1 are never hit by the algorithm, since lookup is
- performed only for x >= 1/64-1/512. */
-const struct sv_erff_data __sv_erff_data = {
- .erf = { 0x0.000000p+0,
- 0x1.20dbf4p-7,
- 0x1.20d770p-6,
- 0x1.b137e0p-6,
- 0x1.20c564p-5,
- 0x1.68e5d4p-5,
- 0x1.b0fafep-5,
- 0x1.f902a8p-5,
- 0x1.207d48p-4,
- 0x1.44703ep-4,
- 0x1.68591ap-4,
- 0x1.8c36bep-4,
- 0x1.b00812p-4,
- 0x1.d3cbf8p-4,
- 0x1.f7815ap-4,
- 0x1.0d9390p-3,
- 0x1.1f5e1ap-3,
- 0x1.311fc2p-3,
- 0x1.42d7fcp-3,
- 0x1.548642p-3,
- 0x1.662a0cp-3,
- 0x1.77c2d2p-3,
- 0x1.895010p-3,
- 0x1.9ad142p-3,
- 0x1.ac45e4p-3,
- 0x1.bdad72p-3,
- 0x1.cf076ep-3,
- 0x1.e05354p-3,
- 0x1.f190aap-3,
- 0x1.015f78p-2,
- 0x1.09eed6p-2,
- 0x1.127632p-2,
- 0x1.1af54ep-2,
- 0x1.236bf0p-2,
- 0x1.2bd9dcp-2,
- 0x1.343ed6p-2,
- 0x1.3c9aa8p-2,
- 0x1.44ed18p-2,
- 0x1.4d35f0p-2,
- 0x1.5574f4p-2,
- 0x1.5da9f4p-2,
- 0x1.65d4b8p-2,
- 0x1.6df50ap-2,
- 0x1.760abap-2,
- 0x1.7e1594p-2,
- 0x1.861566p-2,
- 0x1.8e0a02p-2,
- 0x1.95f336p-2,
- 0x1.9dd0d2p-2,
- 0x1.a5a2acp-2,
- 0x1.ad6896p-2,
- 0x1.b52264p-2,
- 0x1.bccfecp-2,
- 0x1.c47104p-2,
- 0x1.cc0584p-2,
- 0x1.d38d44p-2,
- 0x1.db081cp-2,
- 0x1.e275eap-2,
- 0x1.e9d68ap-2,
- 0x1.f129d4p-2,
- 0x1.f86faap-2,
- 0x1.ffa7eap-2,
- 0x1.03693ap-1,
- 0x1.06f794p-1,
- 0x1.0a7ef6p-1,
- 0x1.0dff50p-1,
- 0x1.117894p-1,
- 0x1.14eab4p-1,
- 0x1.1855a6p-1,
- 0x1.1bb95cp-1,
- 0x1.1f15ccp-1,
- 0x1.226ae8p-1,
- 0x1.25b8a8p-1,
- 0x1.28ff02p-1,
- 0x1.2c3decp-1,
- 0x1.2f755cp-1,
- 0x1.32a54cp-1,
- 0x1.35cdb4p-1,
- 0x1.38ee8ap-1,
- 0x1.3c07cap-1,
- 0x1.3f196ep-1,
- 0x1.42236ep-1,
- 0x1.4525c8p-1,
- 0x1.482074p-1,
- 0x1.4b1372p-1,
- 0x1.4dfebap-1,
- 0x1.50e24cp-1,
- 0x1.53be26p-1,
- 0x1.569244p-1,
- 0x1.595ea6p-1,
- 0x1.5c2348p-1,
- 0x1.5ee02ep-1,
- 0x1.619556p-1,
- 0x1.6442c0p-1,
- 0x1.66e86ep-1,
- 0x1.69865ep-1,
- 0x1.6c1c98p-1,
- 0x1.6eab18p-1,
- 0x1.7131e6p-1,
- 0x1.73b102p-1,
- 0x1.762870p-1,
- 0x1.789836p-1,
- 0x1.7b0058p-1,
- 0x1.7d60d8p-1,
- 0x1.7fb9c0p-1,
- 0x1.820b12p-1,
- 0x1.8454d6p-1,
- 0x1.869712p-1,
- 0x1.88d1cep-1,
- 0x1.8b050ep-1,
- 0x1.8d30dep-1,
- 0x1.8f5544p-1,
- 0x1.91724ap-1,
- 0x1.9387f6p-1,
- 0x1.959652p-1,
- 0x1.979d68p-1,
- 0x1.999d42p-1,
- 0x1.9b95e8p-1,
- 0x1.9d8768p-1,
- 0x1.9f71cap-1,
- 0x1.a1551ap-1,
- 0x1.a33162p-1,
- 0x1.a506b0p-1,
- 0x1.a6d50cp-1,
- 0x1.a89c86p-1,
- 0x1.aa5d26p-1,
- 0x1.ac16fcp-1,
- 0x1.adca14p-1,
- 0x1.af767ap-1,
- 0x1.b11c3cp-1,
- 0x1.b2bb68p-1,
- 0x1.b4540ap-1,
- 0x1.b5e630p-1,
- 0x1.b771e8p-1,
- 0x1.b8f742p-1,
- 0x1.ba764ap-1,
- 0x1.bbef10p-1,
- 0x1.bd61a2p-1,
- 0x1.bece0ep-1,
- 0x1.c03464p-1,
- 0x1.c194b2p-1,
- 0x1.c2ef08p-1,
- 0x1.c44376p-1,
- 0x1.c5920ap-1,
- 0x1.c6dad2p-1,
- 0x1.c81de2p-1,
- 0x1.c95b46p-1,
- 0x1.ca930ep-1,
- 0x1.cbc54cp-1,
- 0x1.ccf20cp-1,
- 0x1.ce1962p-1,
- 0x1.cf3b5cp-1,
- 0x1.d0580cp-1,
- 0x1.d16f7ep-1,
- 0x1.d281c4p-1,
- 0x1.d38ef0p-1,
- 0x1.d49710p-1,
- 0x1.d59a34p-1,
- 0x1.d6986cp-1,
- 0x1.d791cap-1,
- 0x1.d8865ep-1,
- 0x1.d97636p-1,
- 0x1.da6162p-1,
- 0x1.db47f4p-1,
- 0x1.dc29fcp-1,
- 0x1.dd0788p-1,
- 0x1.dde0aap-1,
- 0x1.deb570p-1,
- 0x1.df85eap-1,
- 0x1.e0522ap-1,
- 0x1.e11a3ep-1,
- 0x1.e1de36p-1,
- 0x1.e29e22p-1,
- 0x1.e35a12p-1,
- 0x1.e41214p-1,
- 0x1.e4c638p-1,
- 0x1.e5768cp-1,
- 0x1.e62322p-1,
- 0x1.e6cc08p-1,
- 0x1.e7714ap-1,
- 0x1.e812fcp-1,
- 0x1.e8b12ap-1,
- 0x1.e94be4p-1,
- 0x1.e9e336p-1,
- 0x1.ea7730p-1,
- 0x1.eb07e2p-1,
- 0x1.eb9558p-1,
- 0x1.ec1fa2p-1,
- 0x1.eca6ccp-1,
- 0x1.ed2ae6p-1,
- 0x1.edabfcp-1,
- 0x1.ee2a1ep-1,
- 0x1.eea556p-1,
- 0x1.ef1db4p-1,
- 0x1.ef9344p-1,
- 0x1.f00614p-1,
- 0x1.f07630p-1,
- 0x1.f0e3a6p-1,
- 0x1.f14e82p-1,
- 0x1.f1b6d0p-1,
- 0x1.f21ca0p-1,
- 0x1.f27ff8p-1,
- 0x1.f2e0eap-1,
- 0x1.f33f7ep-1,
- 0x1.f39bc2p-1,
- 0x1.f3f5c2p-1,
- 0x1.f44d88p-1,
- 0x1.f4a31ep-1,
- 0x1.f4f694p-1,
- 0x1.f547f2p-1,
- 0x1.f59742p-1,
- 0x1.f5e490p-1,
- 0x1.f62fe8p-1,
- 0x1.f67952p-1,
- 0x1.f6c0dcp-1,
- 0x1.f7068cp-1,
- 0x1.f74a6ep-1,
- 0x1.f78c8cp-1,
- 0x1.f7cceep-1,
- 0x1.f80ba2p-1,
- 0x1.f848acp-1,
- 0x1.f8841ap-1,
- 0x1.f8bdf2p-1,
- 0x1.f8f63ep-1,
- 0x1.f92d08p-1,
- 0x1.f96256p-1,
- 0x1.f99634p-1,
- 0x1.f9c8a8p-1,
- 0x1.f9f9bap-1,
- 0x1.fa2974p-1,
- 0x1.fa57dep-1,
- 0x1.fa84fep-1,
- 0x1.fab0dep-1,
- 0x1.fadb84p-1,
- 0x1.fb04f6p-1,
- 0x1.fb2d40p-1,
- 0x1.fb5464p-1,
- 0x1.fb7a6cp-1,
- 0x1.fb9f60p-1,
- 0x1.fbc344p-1,
- 0x1.fbe61ep-1,
- 0x1.fc07fap-1,
- 0x1.fc28d8p-1,
- 0x1.fc48c2p-1,
- 0x1.fc67bcp-1,
- 0x1.fc85d0p-1,
- 0x1.fca2fep-1,
- 0x1.fcbf52p-1,
- 0x1.fcdaccp-1,
- 0x1.fcf576p-1,
- 0x1.fd0f54p-1,
- 0x1.fd286ap-1,
- 0x1.fd40bep-1,
- 0x1.fd5856p-1,
- 0x1.fd6f34p-1,
- 0x1.fd8562p-1,
- 0x1.fd9ae2p-1,
- 0x1.fdafb8p-1,
- 0x1.fdc3e8p-1,
- 0x1.fdd77ap-1,
- 0x1.fdea6ep-1,
- 0x1.fdfcccp-1,
- 0x1.fe0e96p-1,
- 0x1.fe1fd0p-1,
- 0x1.fe3080p-1,
- 0x1.fe40a6p-1,
- 0x1.fe504cp-1,
- 0x1.fe5f70p-1,
- 0x1.fe6e18p-1,
- 0x1.fe7c46p-1,
- 0x1.fe8a00p-1,
- 0x1.fe9748p-1,
- 0x1.fea422p-1,
- 0x1.feb090p-1,
- 0x1.febc96p-1,
- 0x1.fec836p-1,
- 0x1.fed374p-1,
- 0x1.fede52p-1,
- 0x1.fee8d4p-1,
- 0x1.fef2fep-1,
- 0x1.fefccep-1,
- 0x1.ff064cp-1,
- 0x1.ff0f76p-1,
- 0x1.ff1852p-1,
- 0x1.ff20e0p-1,
- 0x1.ff2924p-1,
- 0x1.ff3120p-1,
- 0x1.ff38d6p-1,
- 0x1.ff4048p-1,
- 0x1.ff4778p-1,
- 0x1.ff4e68p-1,
- 0x1.ff551ap-1,
- 0x1.ff5b90p-1,
- 0x1.ff61ccp-1,
- 0x1.ff67d0p-1,
- 0x1.ff6d9ep-1,
- 0x1.ff7338p-1,
- 0x1.ff789ep-1,
- 0x1.ff7dd4p-1,
- 0x1.ff82dap-1,
- 0x1.ff87b2p-1,
- 0x1.ff8c5cp-1,
- 0x1.ff90dcp-1,
- 0x1.ff9532p-1,
- 0x1.ff9960p-1,
- 0x1.ff9d68p-1,
- 0x1.ffa14ap-1,
- 0x1.ffa506p-1,
- 0x1.ffa8a0p-1,
- 0x1.ffac18p-1,
- 0x1.ffaf6ep-1,
- 0x1.ffb2a6p-1,
- 0x1.ffb5bep-1,
- 0x1.ffb8b8p-1,
- 0x1.ffbb98p-1,
- 0x1.ffbe5ap-1,
- 0x1.ffc102p-1,
- 0x1.ffc390p-1,
- 0x1.ffc606p-1,
- 0x1.ffc862p-1,
- 0x1.ffcaa8p-1,
- 0x1.ffccd8p-1,
- 0x1.ffcef4p-1,
- 0x1.ffd0fap-1,
- 0x1.ffd2eap-1,
- 0x1.ffd4cap-1,
- 0x1.ffd696p-1,
- 0x1.ffd84ep-1,
- 0x1.ffd9f8p-1,
- 0x1.ffdb90p-1,
- 0x1.ffdd18p-1,
- 0x1.ffde90p-1,
- 0x1.ffdffap-1,
- 0x1.ffe154p-1,
- 0x1.ffe2a2p-1,
- 0x1.ffe3e2p-1,
- 0x1.ffe514p-1,
- 0x1.ffe63cp-1,
- 0x1.ffe756p-1,
- 0x1.ffe866p-1,
- 0x1.ffe96ap-1,
- 0x1.ffea64p-1,
- 0x1.ffeb54p-1,
- 0x1.ffec3ap-1,
- 0x1.ffed16p-1,
- 0x1.ffedeap-1,
- 0x1.ffeeb4p-1,
- 0x1.ffef76p-1,
- 0x1.fff032p-1,
- 0x1.fff0e4p-1,
- 0x1.fff18ep-1,
- 0x1.fff232p-1,
- 0x1.fff2d0p-1,
- 0x1.fff366p-1,
- 0x1.fff3f6p-1,
- 0x1.fff480p-1,
- 0x1.fff504p-1,
- 0x1.fff582p-1,
- 0x1.fff5fcp-1,
- 0x1.fff670p-1,
- 0x1.fff6dep-1,
- 0x1.fff74ap-1,
- 0x1.fff7aep-1,
- 0x1.fff810p-1,
- 0x1.fff86cp-1,
- 0x1.fff8c6p-1,
- 0x1.fff91cp-1,
- 0x1.fff96cp-1,
- 0x1.fff9bap-1,
- 0x1.fffa04p-1,
- 0x1.fffa4cp-1,
- 0x1.fffa90p-1,
- 0x1.fffad0p-1,
- 0x1.fffb0ep-1,
- 0x1.fffb4ap-1,
- 0x1.fffb82p-1,
- 0x1.fffbb8p-1,
- 0x1.fffbecp-1,
- 0x1.fffc1ep-1,
- 0x1.fffc4ep-1,
- 0x1.fffc7ap-1,
- 0x1.fffca6p-1,
- 0x1.fffccep-1,
- 0x1.fffcf6p-1,
- 0x1.fffd1ap-1,
- 0x1.fffd3ep-1,
- 0x1.fffd60p-1,
- 0x1.fffd80p-1,
- 0x1.fffda0p-1,
- 0x1.fffdbep-1,
- 0x1.fffddap-1,
- 0x1.fffdf4p-1,
- 0x1.fffe0ep-1,
- 0x1.fffe26p-1,
- 0x1.fffe3ep-1,
- 0x1.fffe54p-1,
- 0x1.fffe68p-1,
- 0x1.fffe7ep-1,
- 0x1.fffe90p-1,
- 0x1.fffea2p-1,
- 0x1.fffeb4p-1,
- 0x1.fffec4p-1,
- 0x1.fffed4p-1,
- 0x1.fffee4p-1,
- 0x1.fffef2p-1,
- 0x1.ffff00p-1,
- 0x1.ffff0cp-1,
- 0x1.ffff18p-1,
- 0x1.ffff24p-1,
- 0x1.ffff30p-1,
- 0x1.ffff3ap-1,
- 0x1.ffff44p-1,
- 0x1.ffff4ep-1,
- 0x1.ffff56p-1,
- 0x1.ffff60p-1,
- 0x1.ffff68p-1,
- 0x1.ffff70p-1,
- 0x1.ffff78p-1,
- 0x1.ffff7ep-1,
- 0x1.ffff84p-1,
- 0x1.ffff8cp-1,
- 0x1.ffff92p-1,
- 0x1.ffff98p-1,
- 0x1.ffff9cp-1,
- 0x1.ffffa2p-1,
- 0x1.ffffa6p-1,
- 0x1.ffffacp-1,
- 0x1.ffffb0p-1,
- 0x1.ffffb4p-1,
- 0x1.ffffb8p-1,
- 0x1.ffffbcp-1,
- 0x1.ffffc0p-1,
- 0x1.ffffc4p-1,
- 0x1.ffffc6p-1,
- 0x1.ffffcap-1,
- 0x1.ffffccp-1,
- 0x1.ffffd0p-1,
- 0x1.ffffd2p-1,
- 0x1.ffffd4p-1,
- 0x1.ffffd6p-1,
- 0x1.ffffd8p-1,
- 0x1.ffffdcp-1,
- 0x1.ffffdep-1,
- 0x1.ffffdep-1,
- 0x1.ffffe0p-1,
- 0x1.ffffe2p-1,
- 0x1.ffffe4p-1,
- 0x1.ffffe6p-1,
- 0x1.ffffe8p-1,
- 0x1.ffffe8p-1,
- 0x1.ffffeap-1,
- 0x1.ffffeap-1,
- 0x1.ffffecp-1,
- 0x1.ffffeep-1,
- 0x1.ffffeep-1,
- 0x1.fffff0p-1,
- 0x1.fffff0p-1,
- 0x1.fffff2p-1,
- 0x1.fffff2p-1,
- 0x1.fffff2p-1,
- 0x1.fffff4p-1,
- 0x1.fffff4p-1,
- 0x1.fffff4p-1,
- 0x1.fffff6p-1,
- 0x1.fffff6p-1,
- 0x1.fffff6p-1,
- 0x1.fffff8p-1,
- 0x1.fffff8p-1,
- 0x1.fffff8p-1,
- 0x1.fffff8p-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- },
- .scale = { 0x1.20dd76p+0,
- 0x1.20d8f2p+0,
- 0x1.20cb68p+0,
- 0x1.20b4d8p+0,
- 0x1.209546p+0,
- 0x1.206cb4p+0,
- 0x1.203b26p+0,
- 0x1.2000a0p+0,
- 0x1.1fbd28p+0,
- 0x1.1f70c4p+0,
- 0x1.1f1b7ap+0,
- 0x1.1ebd56p+0,
- 0x1.1e565cp+0,
- 0x1.1de698p+0,
- 0x1.1d6e14p+0,
- 0x1.1cecdcp+0,
- 0x1.1c62fap+0,
- 0x1.1bd07cp+0,
- 0x1.1b3572p+0,
- 0x1.1a91e6p+0,
- 0x1.19e5eap+0,
- 0x1.19318cp+0,
- 0x1.1874dep+0,
- 0x1.17aff0p+0,
- 0x1.16e2d8p+0,
- 0x1.160da4p+0,
- 0x1.153068p+0,
- 0x1.144b3cp+0,
- 0x1.135e30p+0,
- 0x1.12695ep+0,
- 0x1.116cd8p+0,
- 0x1.1068bap+0,
- 0x1.0f5d16p+0,
- 0x1.0e4a08p+0,
- 0x1.0d2fa6p+0,
- 0x1.0c0e0ap+0,
- 0x1.0ae550p+0,
- 0x1.09b590p+0,
- 0x1.087ee4p+0,
- 0x1.07416cp+0,
- 0x1.05fd3ep+0,
- 0x1.04b27cp+0,
- 0x1.036140p+0,
- 0x1.0209a6p+0,
- 0x1.00abd0p+0,
- 0x1.fe8fb0p-1,
- 0x1.fbbbbep-1,
- 0x1.f8dc0ap-1,
- 0x1.f5f0cep-1,
- 0x1.f2fa4cp-1,
- 0x1.eff8c4p-1,
- 0x1.ecec78p-1,
- 0x1.e9d5a8p-1,
- 0x1.e6b498p-1,
- 0x1.e38988p-1,
- 0x1.e054bep-1,
- 0x1.dd167cp-1,
- 0x1.d9cf06p-1,
- 0x1.d67ea2p-1,
- 0x1.d32592p-1,
- 0x1.cfc41ep-1,
- 0x1.cc5a8ap-1,
- 0x1.c8e91cp-1,
- 0x1.c5701ap-1,
- 0x1.c1efcap-1,
- 0x1.be6872p-1,
- 0x1.bada5ap-1,
- 0x1.b745c6p-1,
- 0x1.b3aafcp-1,
- 0x1.b00a46p-1,
- 0x1.ac63e8p-1,
- 0x1.a8b828p-1,
- 0x1.a5074ep-1,
- 0x1.a1519ep-1,
- 0x1.9d9762p-1,
- 0x1.99d8dap-1,
- 0x1.961650p-1,
- 0x1.925008p-1,
- 0x1.8e8646p-1,
- 0x1.8ab950p-1,
- 0x1.86e96ap-1,
- 0x1.8316d6p-1,
- 0x1.7f41dcp-1,
- 0x1.7b6abcp-1,
- 0x1.7791b8p-1,
- 0x1.73b714p-1,
- 0x1.6fdb12p-1,
- 0x1.6bfdf0p-1,
- 0x1.681ff2p-1,
- 0x1.644156p-1,
- 0x1.60625cp-1,
- 0x1.5c8342p-1,
- 0x1.58a446p-1,
- 0x1.54c5a6p-1,
- 0x1.50e79ep-1,
- 0x1.4d0a68p-1,
- 0x1.492e42p-1,
- 0x1.455366p-1,
- 0x1.417a0cp-1,
- 0x1.3da26ep-1,
- 0x1.39ccc2p-1,
- 0x1.35f940p-1,
- 0x1.32281ep-1,
- 0x1.2e5992p-1,
- 0x1.2a8dcep-1,
- 0x1.26c508p-1,
- 0x1.22ff72p-1,
- 0x1.1f3d3cp-1,
- 0x1.1b7e98p-1,
- 0x1.17c3b6p-1,
- 0x1.140cc4p-1,
- 0x1.1059eep-1,
- 0x1.0cab62p-1,
- 0x1.09014cp-1,
- 0x1.055bd6p-1,
- 0x1.01bb2cp-1,
- 0x1.fc3ee6p-2,
- 0x1.f511aap-2,
- 0x1.edeeeep-2,
- 0x1.e6d700p-2,
- 0x1.dfca26p-2,
- 0x1.d8c8aap-2,
- 0x1.d1d2d0p-2,
- 0x1.cae8dap-2,
- 0x1.c40b08p-2,
- 0x1.bd3998p-2,
- 0x1.b674c8p-2,
- 0x1.afbcd4p-2,
- 0x1.a911f0p-2,
- 0x1.a27456p-2,
- 0x1.9be438p-2,
- 0x1.9561c8p-2,
- 0x1.8eed36p-2,
- 0x1.8886b2p-2,
- 0x1.822e66p-2,
- 0x1.7be47ap-2,
- 0x1.75a91ap-2,
- 0x1.6f7c6ap-2,
- 0x1.695e8cp-2,
- 0x1.634fa6p-2,
- 0x1.5d4fd4p-2,
- 0x1.575f34p-2,
- 0x1.517de6p-2,
- 0x1.4bac00p-2,
- 0x1.45e99cp-2,
- 0x1.4036d0p-2,
- 0x1.3a93b2p-2,
- 0x1.350052p-2,
- 0x1.2f7cc4p-2,
- 0x1.2a0916p-2,
- 0x1.24a554p-2,
- 0x1.1f518ap-2,
- 0x1.1a0dc6p-2,
- 0x1.14da0ap-2,
- 0x1.0fb662p-2,
- 0x1.0aa2d0p-2,
- 0x1.059f5ap-2,
- 0x1.00ac00p-2,
- 0x1.f79184p-3,
- 0x1.edeb40p-3,
- 0x1.e46530p-3,
- 0x1.daff4ap-3,
- 0x1.d1b982p-3,
- 0x1.c893cep-3,
- 0x1.bf8e1cp-3,
- 0x1.b6a856p-3,
- 0x1.ade26cp-3,
- 0x1.a53c42p-3,
- 0x1.9cb5bep-3,
- 0x1.944ec2p-3,
- 0x1.8c0732p-3,
- 0x1.83deeap-3,
- 0x1.7bd5c8p-3,
- 0x1.73eba4p-3,
- 0x1.6c2056p-3,
- 0x1.6473b6p-3,
- 0x1.5ce596p-3,
- 0x1.5575c8p-3,
- 0x1.4e241ep-3,
- 0x1.46f066p-3,
- 0x1.3fda6cp-3,
- 0x1.38e1fap-3,
- 0x1.3206dcp-3,
- 0x1.2b48dap-3,
- 0x1.24a7b8p-3,
- 0x1.1e233ep-3,
- 0x1.17bb2cp-3,
- 0x1.116f48p-3,
- 0x1.0b3f52p-3,
- 0x1.052b0cp-3,
- 0x1.fe6460p-4,
- 0x1.f2a902p-4,
- 0x1.e72372p-4,
- 0x1.dbd32ap-4,
- 0x1.d0b7a0p-4,
- 0x1.c5d04ap-4,
- 0x1.bb1c98p-4,
- 0x1.b09bfcp-4,
- 0x1.a64de6p-4,
- 0x1.9c31c6p-4,
- 0x1.92470ap-4,
- 0x1.888d1ep-4,
- 0x1.7f036cp-4,
- 0x1.75a960p-4,
- 0x1.6c7e64p-4,
- 0x1.6381e2p-4,
- 0x1.5ab342p-4,
- 0x1.5211ecp-4,
- 0x1.499d48p-4,
- 0x1.4154bcp-4,
- 0x1.3937b2p-4,
- 0x1.31458ep-4,
- 0x1.297dbap-4,
- 0x1.21df9ap-4,
- 0x1.1a6a96p-4,
- 0x1.131e14p-4,
- 0x1.0bf97ep-4,
- 0x1.04fc3ap-4,
- 0x1.fc4b5ep-5,
- 0x1.eeea8cp-5,
- 0x1.e1d4d0p-5,
- 0x1.d508fap-5,
- 0x1.c885e0p-5,
- 0x1.bc4a54p-5,
- 0x1.b05530p-5,
- 0x1.a4a54ap-5,
- 0x1.99397ap-5,
- 0x1.8e109cp-5,
- 0x1.83298ep-5,
- 0x1.78832cp-5,
- 0x1.6e1c58p-5,
- 0x1.63f3f6p-5,
- 0x1.5a08e8p-5,
- 0x1.505a18p-5,
- 0x1.46e66cp-5,
- 0x1.3dacd2p-5,
- 0x1.34ac36p-5,
- 0x1.2be38cp-5,
- 0x1.2351c2p-5,
- 0x1.1af5d2p-5,
- 0x1.12ceb4p-5,
- 0x1.0adb60p-5,
- 0x1.031ad6p-5,
- 0x1.f7182ap-6,
- 0x1.e85c44p-6,
- 0x1.da0006p-6,
- 0x1.cc0180p-6,
- 0x1.be5ecep-6,
- 0x1.b1160ap-6,
- 0x1.a4255ap-6,
- 0x1.978ae8p-6,
- 0x1.8b44e6p-6,
- 0x1.7f5188p-6,
- 0x1.73af0cp-6,
- 0x1.685bb6p-6,
- 0x1.5d55ccp-6,
- 0x1.529b9ep-6,
- 0x1.482b84p-6,
- 0x1.3e03d8p-6,
- 0x1.3422fep-6,
- 0x1.2a875cp-6,
- 0x1.212f62p-6,
- 0x1.181984p-6,
- 0x1.0f443ep-6,
- 0x1.06ae14p-6,
- 0x1.fcab14p-7,
- 0x1.ec7262p-7,
- 0x1.dcaf36p-7,
- 0x1.cd5ecap-7,
- 0x1.be7e5ap-7,
- 0x1.b00b38p-7,
- 0x1.a202bep-7,
- 0x1.94624ep-7,
- 0x1.87275ep-7,
- 0x1.7a4f6ap-7,
- 0x1.6dd7fep-7,
- 0x1.61beaep-7,
- 0x1.56011cp-7,
- 0x1.4a9cf6p-7,
- 0x1.3f8ff6p-7,
- 0x1.34d7dcp-7,
- 0x1.2a727ap-7,
- 0x1.205dacp-7,
- 0x1.169756p-7,
- 0x1.0d1d6ap-7,
- 0x1.03ede2p-7,
- 0x1.f60d8ap-8,
- 0x1.e4cc4ap-8,
- 0x1.d4143ap-8,
- 0x1.c3e1a6p-8,
- 0x1.b430ecp-8,
- 0x1.a4fe84p-8,
- 0x1.9646f4p-8,
- 0x1.8806d8p-8,
- 0x1.7a3adep-8,
- 0x1.6cdfccp-8,
- 0x1.5ff276p-8,
- 0x1.536fc2p-8,
- 0x1.4754acp-8,
- 0x1.3b9e40p-8,
- 0x1.30499cp-8,
- 0x1.2553eep-8,
- 0x1.1aba78p-8,
- 0x1.107a8cp-8,
- 0x1.06918cp-8,
- 0x1.f9f9d0p-9,
- 0x1.e77448p-9,
- 0x1.d58da6p-9,
- 0x1.c4412cp-9,
- 0x1.b38a3ap-9,
- 0x1.a36454p-9,
- 0x1.93cb12p-9,
- 0x1.84ba30p-9,
- 0x1.762d84p-9,
- 0x1.682100p-9,
- 0x1.5a90b0p-9,
- 0x1.4d78bcp-9,
- 0x1.40d564p-9,
- 0x1.34a306p-9,
- 0x1.28de12p-9,
- 0x1.1d8318p-9,
- 0x1.128ebap-9,
- 0x1.07fdb4p-9,
- 0x1.fb99b8p-10,
- 0x1.e7f232p-10,
- 0x1.d4fed8p-10,
- 0x1.c2b9d0p-10,
- 0x1.b11d70p-10,
- 0x1.a02436p-10,
- 0x1.8fc8c8p-10,
- 0x1.8005f0p-10,
- 0x1.70d6a4p-10,
- 0x1.6235fcp-10,
- 0x1.541f34p-10,
- 0x1.468daep-10,
- 0x1.397ceep-10,
- 0x1.2ce898p-10,
- 0x1.20cc76p-10,
- 0x1.15246ep-10,
- 0x1.09ec86p-10,
- 0x1.fe41cep-11,
- 0x1.e97ba4p-11,
- 0x1.d57f52p-11,
- 0x1.c245d4p-11,
- 0x1.afc85ep-11,
- 0x1.9e0058p-11,
- 0x1.8ce75ep-11,
- 0x1.7c7744p-11,
- 0x1.6caa0ep-11,
- 0x1.5d79ecp-11,
- 0x1.4ee142p-11,
- 0x1.40daa4p-11,
- 0x1.3360ccp-11,
- 0x1.266ea8p-11,
- 0x1.19ff46p-11,
- 0x1.0e0de8p-11,
- 0x1.0295f0p-11,
- 0x1.ef25d4p-12,
- 0x1.da0110p-12,
- 0x1.c5b542p-12,
- 0x1.b23a5ap-12,
- 0x1.9f8894p-12,
- 0x1.8d986ap-12,
- 0x1.7c629ap-12,
- 0x1.6be022p-12,
- 0x1.5c0a38p-12,
- 0x1.4cda54p-12,
- 0x1.3e4a24p-12,
- 0x1.305390p-12,
- 0x1.22f0b4p-12,
- 0x1.161be4p-12,
- 0x1.09cfa4p-12,
- 0x1.fc0d56p-13,
- 0x1.e577bcp-13,
- 0x1.cfd4a6p-13,
- 0x1.bb1a96p-13,
- 0x1.a74068p-13,
- 0x1.943d4ap-13,
- 0x1.8208bcp-13,
- 0x1.709a8ep-13,
- 0x1.5feadap-13,
- 0x1.4ff208p-13,
- 0x1.40a8c2p-13,
- 0x1.3207fcp-13,
- 0x1.2408eap-13,
- 0x1.16a502p-13,
- 0x1.09d5f8p-13,
- 0x1.fb2b7ap-14,
- 0x1.e3bcf4p-14,
- 0x1.cd5528p-14,
- 0x1.b7e946p-14,
- 0x1.a36eecp-14,
- 0x1.8fdc1cp-14,
- 0x1.7d2738p-14,
- 0x1.6b4702p-14,
- 0x1.5a329cp-14,
- 0x1.49e178p-14,
- 0x1.3a4b60p-14,
- 0x1.2b6876p-14,
- 0x1.1d3120p-14,
- 0x1.0f9e1cp-14,
- 0x1.02a868p-14,
- 0x1.ec929ap-15,
- 0x1.d4f4b4p-15,
- 0x1.be6abcp-15,
- 0x1.a8e8ccp-15,
- 0x1.94637ep-15,
- 0x1.80cfdcp-15,
- 0x1.6e2368p-15,
- 0x1.5c540cp-15,
- 0x1.4b581cp-15,
- 0x1.3b2652p-15,
- 0x1.2bb5ccp-15,
- 0x1.1cfe02p-15,
- 0x1.0ef6c4p-15,
- 0x1.019842p-15,
- 0x1.e9b5e8p-16,
- 0x1.d16f58p-16,
- 0x1.ba4f04p-16,
- 0x1.a447b8p-16,
- 0x1.8f4cccp-16,
- 0x1.7b5224p-16,
- 0x1.684c22p-16,
- 0x1.562facp-16,
- 0x1.44f21ep-16,
- 0x1.34894ap-16,
- 0x1.24eb72p-16,
- 0x1.160f44p-16,
- 0x1.07ebd2p-16,
- 0x1.f4f12ep-17,
- 0x1.db5ad0p-17,
- 0x1.c304f0p-17,
- 0x1.abe09ep-17,
- 0x1.95df98p-17,
- 0x1.80f43ap-17,
- 0x1.6d1178p-17,
- 0x1.5a2ae0p-17,
- 0x1.483488p-17,
- 0x1.372310p-17,
- 0x1.26eb9ep-17,
- 0x1.1783cep-17,
- 0x1.08e1bap-17,
- 0x1.f5f7d8p-18,
- 0x1.db92b6p-18,
- 0x1.c282cep-18,
- 0x1.aab7acp-18,
- 0x1.94219cp-18,
- 0x1.7eb1a2p-18,
- 0x1.6a5972p-18,
- 0x1.570b6ap-18,
- 0x1.44ba86p-18,
- 0x1.335a62p-18,
- 0x1.22df2ap-18,
- 0x1.133d96p-18,
- 0x1.046aeap-18,
- 0x1.ecb9d0p-19,
- 0x1.d21398p-19,
- 0x1.b8d094p-19,
- 0x1.a0df10p-19,
- 0x1.8a2e26p-19,
- 0x1.74adc8p-19,
- 0x1.604ea8p-19,
- 0x1.4d0232p-19,
- 0x1.3aba86p-19,
- 0x1.296a70p-19,
- 0x1.190562p-19,
- 0x1.097f62p-19,
- 0x1.f59a20p-20,
- 0x1.d9c736p-20,
- 0x1.bf716cp-20,
- 0x1.a6852cp-20,
- 0x1.8eefd8p-20,
- 0x1.789fb8p-20,
- 0x1.6383f8p-20,
- 0x1.4f8c96p-20,
- 0x1.3caa62p-20,
- 0x1.2acee2p-20,
- 0x1.19ec60p-20,
- 0x1.09f5d0p-20,
- 0x1.f5bd96p-21,
- 0x1.d9371ep-21,
- 0x1.be41dep-21,
- 0x1.a4c89ep-21,
- 0x1.8cb738p-21,
- 0x1.75fa8ep-21,
- 0x1.608078p-21,
- 0x1.4c37c0p-21,
- 0x1.39100ep-21,
- 0x1.26f9e0p-21,
- 0x1.15e682p-21,
- 0x1.05c804p-21,
- 0x1.ed2254p-22,
- 0x1.d06ad6p-22,
- 0x1.b551c8p-22,
- 0x1.9bc0a0p-22,
- 0x1.83a200p-22,
- 0x1.6ce1aap-22,
- 0x1.576c72p-22,
- 0x1.43302cp-22,
- 0x1.301ba2p-22,
- 0x1.1e1e86p-22,
- 0x1.0d2966p-22,
- 0x1.fa5b50p-23,
- 0x1.dc3ae4p-23,
- 0x1.bfd756p-23,
- 0x1.a517dap-23,
- 0x1.8be4f8p-23,
- 0x1.74287ep-23,
- 0x1.5dcd66p-23,
- 0x1.48bfd4p-23,
- 0x1.34ecf8p-23,
- 0x1.224310p-23,
- 0x1.10b148p-23,
- },
-};
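
The header comment of the deleted sv_erff_data.c above spells out what the two
tables hold. As a rough scalar illustration of how such an erf/scale table
pair is consumed (a first-order sketch only: erf_tab/scale_tab are
hypothetical stand-ins for __sv_erff_data.erf and .scale, and the real SVE
routine adds a polynomial correction and special-case handling):

#include <math.h>

/* Entry i is assumed to hold erf(i/128) and (2/sqrt(pi))*exp(-(i/128)^2)
   respectively, for i = 0..512, mirroring the deleted tables.  */
static double erf_tab[513];
static double scale_tab[513];

static double
erf_lookup (double x)
{
  double ax = fabs (x);
  if (ax >= 4.0)
    return copysign (1.0, x);          /* the tables only cover [0, 4] */
  int i = (int) (ax * 128.0 + 0.5);    /* nearest multiple r = i/128 */
  double d = ax - i / 128.0;           /* |d| <= 1/256 */
  /* First-order Taylor step: erf(ax) ~ erf(r) + erf'(r)*d, where
     erf'(r) = (2/sqrt(pi))*exp(-r*r) is exactly the scale entry.  */
  return copysign (erf_tab[i] + scale_tab[i] * d, x);
}
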
diff --git a/pl/math/sv_exp10f_1u5.c b/pl/math/sv_exp10f_1u5.c
deleted file mode 100644
index 9ecde8f1aa52..000000000000
--- a/pl/math/sv_exp10f_1u5.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Single-precision SVE 10^x function.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "include/mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f32.h"
-
-/* For x < -SpecialBound, the result is subnormal and not handled correctly by
- FEXPA. */
-#define SpecialBound 37.9
-
-static const struct data
-{
- float poly[5];
- float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
-} data = {
- /* Coefficients generated using Remez algorithm with minimisation of relative
- error.
- rel error: 0x1.89dafa3p-24
- abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
- maxerr: 0.52 +0.5 ulp. */
- .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
- 0x1.12b41ap-1f },
- /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
- .shift = 0x1.903f8p17f,
- .log10_2 = 0x1.a934fp+1,
- .log2_10_hi = 0x1.344136p-2,
- .log2_10_lo = -0x1.ec10cp-27,
- .special_bound = SpecialBound,
-};
-
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (exp10f, x, y, special);
-}
-
-/* Single-precision SVE exp10f routine. Implements the same algorithm
- as AdvSIMD exp10f.
- Worst case error is 1.02 ULPs.
- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
- want 0x1.ba5f9cp-1. */
-svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
- /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
- with poly(r) in [1/sqrt(2), sqrt(2)] and
- x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
-
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
-
- /* n = round(x/(log10(2)/N)). */
- svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
- svfloat32_t n = svsub_x (pg, z, shift);
-
- /* r = x - n*log10(2)/N. */
- svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
- r = svmls_lane (r, n, log10_2_and_inv, 2);
-
- svbool_t special = svacgt (pg, x, d->special_bound);
- svfloat32_t scale = svexpa (svreinterpret_u32 (z));
-
- /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t poly
- = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
- sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
-
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
-
- return svmla_x (pg, scale, scale, poly);
-}
-
-PL_SIG (SV, F, 1, exp10, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_F1 (exp10), 0.52)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), 0, SpecialBound, 50000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), SpecialBound, inf, 50000)
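
The .shift constant above drives a rounding trick shared by all of the
FEXPA-based routines removed in this commit: adding a large constant makes
the floating-point addition itself do the rounding, leaving the rounded
quotient in the low mantissa bits for FEXPA to consume. A standalone sketch
of just the quantisation step (the +127 exponent bias and the FEXPA lookup
are omitted; assumes the default round-to-nearest mode):

#include <stdio.h>

int
main (void)
{
  /* At magnitude 1.5*2^17 a float has an ulp of 2^-6, so the addition
     rounds its other operand to the nearest 1/64 -- the granularity of
     FEXPA's fraction table.  */
  float shift = 0x1.8p17f;          /* 1.5*2^17 */
  volatile float z = 2.7f + shift;  /* volatile: keep the float rounding */
  float n = z - shift;              /* 2.703125 = 173/64 */
  printf ("2.7 quantised to 1/64: %f\n", (double) n);
  return 0;
}
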
diff --git a/pl/math/sv_exp2f_1u6.c b/pl/math/sv_exp2f_1u6.c
deleted file mode 100644
index 9698ff6f0682..000000000000
--- a/pl/math/sv_exp2f_1u6.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Single-precision SVE 2^x function.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "poly_sve_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- float poly[5];
- float shift, thres;
-} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
- 0x1.59977ap-10f },
- /* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
- /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
- correctly by FEXPA. */
- .thres = 0x1.5d5e2ap+6f,
-};
-
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (exp2f, x, y, special);
-}
-
-/* Single-precision SVE exp2f routine. Implements the same algorithm
- as AdvSIMD exp2f.
- Worst case error is 1.04 ULPs.
- SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
- want 0x1.ba7ebp+0. */
-svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
- svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svadd_x (pg, x, shift);
- svfloat32_t n = svsub_x (pg, z, shift);
- svfloat32_t r = svsub_x (pg, x, n);
-
- svbool_t special = svacgt (pg, x, d->thres);
- svfloat32_t scale = svexpa (svreinterpret_u32 (z));
-
- /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
-     Evaluate the polynomial using a hybrid scheme - offset ESTRIN by 1 for
- coefficients 1 to 4, and apply most significant coefficient directly. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
- svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
-
- return svmla_x (pg, scale, scale, poly);
-}
-
-PL_SIG (SV, F, 1, exp2, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_F1 (exp2), 0.55)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, 1, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 1, Thres, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, inf, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, -0x1p-23, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p-23, -1, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, -0x1p23, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p23, -inf, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, ScaleThres, 40000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -1, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, ScaleThres, 50000)
-PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -inf, 50000)
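
The "hybrid scheme" the comment above refers to splits the polynomial so
that independent products can issue in parallel. A scalar sketch of the same
dataflow, with the coefficients from the deleted file (the p14 line plays
the role of sv_pairwise_poly_3_f32_x; the SVE intrinsics are not reproduced):

static float
exp2m1_poly (float r)
{
  const float c[5] = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f,
                       0x1.3ce9e4p-7f, 0x1.59977ap-10f };
  float r2 = r * r;
  /* Pairwise evaluation of c1 + c2*r + c3*r^2 + c4*r^3.  */
  float p12 = c[1] + c[2] * r;
  float p34 = c[3] + c[4] * r;
  float p14 = p12 + p34 * r2;
  /* Apply the leading coefficient last: poly = c0*r + r^2*p14.  */
  return c[0] * r + r2 * p14;
}
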
diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c
deleted file mode 100644
index 93d705ce420a..000000000000
--- a/pl/math/sv_expf_2u.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Single-precision vector e^x function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift, thres;
-} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
- 0x1.0e4020p-7f },
- .inv_ln2 = 0x1.715476p+0f,
- .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
- /* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
- /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
- correctly by FEXPA. */
- .thres = 0x1.5d5e2ap+6f,
-};
-
-#define C(i) sv_f32 (d->poly[i])
-#define ExponentBias 0x3f800000
-
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (expf, x, y, special);
-}
-
-/* Optimised single-precision SVE exp function.
- Worst-case error is 1.04 ulp:
- SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
- want 0x1.ba74bap+4. */
-svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
-
- /* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
- svfloat32_t n = svsub_x (pg, z, d->shift);
-
- /* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
- r = svmls_lane (r, n, invln2_and_ln2, 2);
-
- /* scale = 2^(n/N). */
- svbool_t is_special_case = svacgt (pg, x, d->thres);
- svfloat32_t scale = svexpa (svreinterpret_u32 (z));
-
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_x (pg, r, C (0));
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
- if (unlikely (svptest_any (pg, is_special_case)))
- return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);
-
- return svmla_x (pg, scale, scale, poly);
-}
-
-PL_SIG (SV, F, 1, exp, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_F1 (exp), 0.55)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0, 0x1p-23, 40000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p-23, 1, 50000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 1, 0x1p23, 50000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p23, inf, 50000)
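
The ln2_hi/ln2_lo pair above is a classic split-constant (Cody-Waite)
reduction: ln2_hi is short enough that n*ln2_hi and the following
subtraction incur no rounding while n stays small, and ln2_lo carries the
remaining bits of ln(2). A scalar sketch with the same constants (the SVE
code fuses these steps into svmls_lane; reduce_ln2 is just an illustration):

#include <math.h>

static float
reduce_ln2 (float x, float *n_out)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f;    /* few significant bits: hi part */
  const float ln2_lo = 0x1.7f7d1cp-20f; /* remainder of ln(2) */
  float n = nearbyintf (x * inv_ln2);   /* nearest multiple of ln(2) */
  float r = x - n * ln2_hi;             /* exact for moderate n */
  r = r - n * ln2_lo;                   /* fold in the low part */
  *n_out = n;
  return r;                             /* |r| <= ln(2)/2 up to rounding */
}
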
diff --git a/pl/math/sv_expf_inline.h b/pl/math/sv_expf_inline.h
deleted file mode 100644
index 0ef4e0fda946..000000000000
--- a/pl/math/sv_expf_inline.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * SVE helper for single-precision routines which calculate exp(x) and do
- * not need special-case handling
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_SV_EXPF_INLINE_H
-#define PL_MATH_SV_EXPF_INLINE_H
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-struct sv_expf_data
-{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift;
-};
-
-/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
-#define SV_EXPF_DATA \
- { \
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
- 0x1.0e4020p-7f }, \
- \
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
- }
-
-#define C(i) sv_f32 (d->poly[i])
-
-static inline svfloat32_t
-expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
-{
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-
- /* Load some constants in quad-word chunks to minimise memory access. */
- svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
-
- /* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
- svfloat32_t n = svsub_x (pg, z, d->shift);
-
- /* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
- r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
-
- /* scale = 2^(n/N). */
- svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
-
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
- svfloat32_t r2 = svmul_f32_x (pg, r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
- return svmla_x (pg, scale, scale, poly);
-}
-
-#endif // PL_MATH_SV_EXPF_INLINE_H
\ No newline at end of file
diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c
deleted file mode 100644
index f55e068fd442..000000000000
--- a/pl/math/sv_log10_2u5.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Double-precision SVE log10(x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f64.h"
-
-#define Min 0x0010000000000000
-#define Max 0x7ff0000000000000
-#define Thres 0x7fe0000000000000 /* Max - Min. */
-#define Off 0x3fe6900900000000
-#define N (1 << V_LOG10_TABLE_BITS)
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
-{
- return sv_call_f64 (log10, x, y, special);
-}
-
-/* SVE log10 algorithm.
- Maximum measured error is 2.46 ulps.
- SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
- want 0x1.fffbdf6eaa667p-6. */
-svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
-{
- svuint64_t ix = svreinterpret_u64 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
-
- /* x = 2^k z; where z is in range [Off,2*Off) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
- svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS);
- i = svand_x (pg, i, (N - 1) << 1);
- svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
- svfloat64_t z = svreinterpret_f64 (
- svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)));
-
- /* log(x) = k*log(2) + log(c) + log(z/c). */
- svfloat64_t invc = svld1_gather_index (pg, &__v_log10_data.table[0].invc, i);
- svfloat64_t logc
- = svld1_gather_index (pg, &__v_log10_data.table[0].log10c, i);
-
- /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1):
- r = z/c - 1 (we look up precomputed 1/c)
- log(z/c) ~= P(r). */
- svfloat64_t r = svmad_x (pg, invc, z, -1.0);
-
- /* hi = log(c) + k*log(2). */
- svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10);
- svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2);
-
- /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly);
-
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
- special);
- return svmla_x (pg, hi, r2, y);
-}
-
-PL_SIG (SV, D, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (SV_NAME_D1 (log10), 1.97)
-PL_TEST_INTERVAL (SV_NAME_D1 (log10), -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log10), 1.0, 100, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log10), 100, inf, 50000)
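
The bit manipulation at the top of the deleted routine is easier to follow
in scalar form: subtracting Off re-centres the exponent so that z = x*2^-k
lands in [Off, 2*Off) as a double, and the top mantissa bits of the
difference select the subinterval. A scalar sketch (TABLE_BITS = 7 is an
assumed value of V_LOG10_TABLE_BITS; the SVE version shifts by 51 and masks
with (N - 1) << 1 only because its gather index addresses interleaved
{invc, log10c} pairs):

#include <stdint.h>
#include <string.h>

#define TABLE_BITS 7
#define OFF 0x3fe6900900000000ULL

static void
log_reduce (double x, double *z, int64_t *k, uint64_t *i)
{
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);                 /* bits of x */
  uint64_t tmp = ix - OFF;
  *k = (int64_t) tmp >> 52;                    /* exponent relative to Off */
  *i = (tmp >> (52 - TABLE_BITS)) & ((1u << TABLE_BITS) - 1);
  uint64_t iz = ix - (tmp & (0xfffULL << 52)); /* clear k from the exponent */
  memcpy (z, &iz, sizeof iz);                  /* z = x * 2^-k, exact */
}
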
diff --git a/pl/math/sv_log1pf_1u3.c b/pl/math/sv_log1pf_1u3.c
deleted file mode 100644
index ea1a3dbf723a..000000000000
--- a/pl/math/sv_log1pf_1u3.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Single-precision vector log(x + 1) function.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f32.h"
-
-static const struct data
-{
- float poly[8];
- float ln2, exp_bias;
- uint32_t four, three_quarters;
-} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
- this can be fmov-ed directly instead of including it in
- the main load-and-mla polynomial schedule. */
- 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
- 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
- .ln2 = 0x1.62e43p-1f,
- .exp_bias = 0x1p-23f,
- .four = 0x40800000,
- .three_quarters = 0x3f400000};
-
-#define SignExponentMask 0xff800000
-
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (log1pf, x, y, special);
-}
-
-/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
-   error is 1.27 ULP, for inputs very close to 0.5.
- _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2
- want 0x1.9f323ep-2. */
-svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
- /* x < -1, Inf/Nan. */
- svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
- special = svorn_z (pg, special, svcmpge (pg, x, -1));
-
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
- svfloat32_t m = svadd_x (pg, x, 1);
-
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- svint32_t k
- = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
- sv_s32 (SignExponentMask));
-
- /* Scale x by exponent manipulation. */
- svfloat32_t m_scale = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
-
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
- m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25));
-
- /* Evaluate polynomial on reduced interval. */
- svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
- ms4 = svmul_x (pg, ms2, ms2);
- svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
- p = svmad_x (pg, m_scale, p, -0.5);
- p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
-
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
-
- /* Apply the scaling back. */
- svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2);
-
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
-
- return y;
-}
-
-PL_SIG (SV, F, 1, log1p, -0.9, 10.0)
-PL_TEST_ULP (SV_NAME_F1 (log1p), 0.77)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0, 0x1p-23, 5000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0x1p-23, 1, 5000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log1p), 1, inf, 10000)
-PL_TEST_INTERVAL (SV_NAME_F1 (log1p), -1, -inf, 10)
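
The exponent manipulation in the deleted routine computes, entirely in
vector registers, the same decomposition that frexpf expresses directly:
x + 1 = t * 2^k with t - 1 in [-0.25, 0.5]. A scalar sketch of that
decomposition (logf stands in for the removed polynomial, and special cases
such as x <= -1 are ignored):

#include <math.h>

static float
log1p_decompose (float x)
{
  int k;
  float t = frexpf (x + 1.0f, &k); /* x + 1 = t * 2^k, t in [0.5, 1) */
  if (t < 0.75f)                   /* renormalise so t is in [0.75, 1.5) */
    {
      t *= 2.0f;
      k--;
    }
  /* t - 1 now lies in [-0.25, 0.5], the interval the deleted polynomial
     was fitted on; log1pf(x) = logf(t) + k*ln2.  */
  return logf (t) + (float) k * 0x1.62e43p-1f;
}
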
diff --git a/pl/math/sv_log1pf_inline.h b/pl/math/sv_log1pf_inline.h
deleted file mode 100644
index d13b094f6b5d..000000000000
--- a/pl/math/sv_log1pf_inline.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Helper for SVE routines which calculate log(1 + x) and do not
- * need special-case handling
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_SV_LOG1PF_INLINE_H
-#define PL_MATH_SV_LOG1PF_INLINE_H
-
-#include "v_math.h"
-#include "math_config.h"
-#include "poly_sve_f32.h"
-
-static const struct sv_log1pf_data
-{
- float32_t poly[9];
- float32_t ln2;
- float32_t scale_back;
-} sv_log1pf_data = {
- /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
- .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
- -0x1.6f0d5ep-5f },
- .scale_back = 0x1.0p-23f,
- .ln2 = 0x1.62e43p-1f,
-};
-
-static inline svfloat32_t
-eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
-{
- svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
- svfloat32_t m2 = svmul_x (pg, m, m);
- svfloat32_t q = svmla_x (pg, m, m2, p_12);
- svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
- p = svmul_x (pg, m2, p);
-
- return svmla_x (pg, q, m2, p);
-}
-
-static inline svfloat32_t
-sv_log1pf_inline (svfloat32_t x, svbool_t pg)
-{
- const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
-
- svfloat32_t m = svadd_x (pg, x, 1.0f);
-
- svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
- svreinterpret_s32 (svdup_f32 (0.75f)));
- ks = svand_x (pg, ks, 0xff800000);
- svuint32_t k = svreinterpret_u32 (ks);
- svfloat32_t s = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
-
- svfloat32_t m_scale
- = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
- m_scale
- = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
- svfloat32_t p = eval_poly (m_scale, d->poly, pg);
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
- return svmla_x (pg, p, scale_back, d->ln2);
-}
-
-#endif // PL_MATH_SV_LOG1PF_INLINE_H
\ No newline at end of file
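
The reduction in sv_log1pf_inline is compact enough to restate in scalar form. A minimal sketch, assuming round-to-nearest and ignoring special cases (x <= -1, NaN, very large x); the helper names are illustrative and libm's log1pf stands in for the polynomial:

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    static uint32_t asu32 (float f) { uint32_t u; memcpy (&u, &f, 4); return u; }
    static float asf32 (uint32_t u) { float f; memcpy (&f, &u, 4); return f; }

    /* log1p(x) = kf*ln2 + log1p(m_scale), with m_scale = (1 + x)*2^-kf - 1. */
    static float
    sv_log1pf_sketch (float x)
    {
      float m = 1.0f + x;
      /* Exponent chunk of m, biased so m_scale lands near [-0.25, 0.5]. */
      uint32_t k = (asu32 (m) - asu32 (0.75f)) & 0xff800000;
      /* Subtracting k from a bit pattern scales the value by 2^-kf. */
      float m_scale = asf32 (asu32 (x) - k);
      float s = asf32 (asu32 (4.0f) - k);        /* s = 4 * 2^-kf */
      m_scale += 0.25f * s - 1.0f;               /* (1 + x)*2^-kf - 1 */
      float kf = (float) (int32_t) k * 0x1p-23f; /* unbiased exponent */
      return kf * 0x1.62e43p-1f + log1pf (m_scale);
    }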
diff --git a/pl/math/sv_log2_3u.c b/pl/math/sv_log2_3u.c
deleted file mode 100644
index 0775a39cc85d..000000000000
--- a/pl/math/sv_log2_3u.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Double-precision SVE log2 function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_sve_f64.h"
-
-#define N (1 << V_LOG2_TABLE_BITS)
-#define Off 0x3fe6900900000000
-#define Max (0x7ff0000000000000)
-#define Min (0x0010000000000000)
-#define Thresh (0x7fe0000000000000) /* Max - Min. */
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
-{
- return sv_call_f64 (log2, x, y, cmp);
-}
-
-/* Double-precision SVE log2 routine.
- Implements the same algorithm as AdvSIMD log10, with coefficients and table
- entries scaled in extended precision.
- The maximum observed error is 2.58 ULP:
- SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
- want 0x1.fffb34198d9ddp-5. */
-svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
-{
- svuint64_t ix = svreinterpret_u64 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
-
- /* x = 2^k z; where z is in range [Off,2*Off) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
- svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
- i = svand_x (pg, i, (N - 1) << 1);
- svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
- svfloat64_t z = svreinterpret_f64 (
- svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)));
-
- svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i);
- svfloat64_t log2c
- = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i);
-
- /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
-
- svfloat64_t r = svmad_x (pg, invc, z, -1.0);
- svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2);
-
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly);
- w = svadd_x (pg, k, w);
-
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y),
- special);
- return svmla_x (pg, w, r2, y);
-}
-
-PL_SIG (SV, D, 1, log2, 0.01, 11.1)
-PL_TEST_ULP (SV_NAME_D1 (log2), 2.09)
-PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_D1 (log2))
-PL_TEST_INTERVAL (SV_NAME_D1 (log2), -0.0, -0x1p126, 1000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0.0, 0x1p-126, 4000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log2), 1.0, 100, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log2), 100, inf, 50000)
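
The table-driven reduction above reads naturally in scalar form. A hedged sketch, using separate (non-interleaved) invc/log2c arrays for clarity, libm's log2 standing in for the polynomial, and the special-case path omitted:

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    #define TABLE_BITS 7                 /* stands in for V_LOG2_TABLE_BITS */
    #define OFF 0x3fe6900900000000ULL

    /* Split x = 2^k * z, z in [Off, 2*Off); entry i holds 1/c and log2(c)
       for a c near the centre of z's subinterval, so
       log2(x) = log2(z/c) + log2(c) + k. */
    static double
    log2_sketch (double x, const double *invc, const double *log2c)
    {
      uint64_t ix;
      memcpy (&ix, &x, 8);
      uint64_t tmp = ix - OFF;
      uint64_t i = (tmp >> (52 - TABLE_BITS)) & ((1 << TABLE_BITS) - 1);
      int64_t k = (int64_t) tmp >> 52;             /* arithmetic shift */
      uint64_t iz = ix - (tmp & (0xfffULL << 52)); /* z = x * 2^-k */
      double z;
      memcpy (&z, &iz, 8);
      double r = z * invc[i] - 1.0;                /* small residual */
      return log2 (1.0 + r) + log2c[i] + (double) k;
    }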
diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c
deleted file mode 100644
index 2530c9e3f62c..000000000000
--- a/pl/math/sv_log_2u5.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Double-precision SVE log(x) function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define P(i) sv_f64 (__v_log_data.poly[i])
-#define N (1 << V_LOG_TABLE_BITS)
-#define Off (0x3fe6900900000000)
-#define MaxTop (0x7ff)
-#define MinTop (0x001)
-#define ThreshTop (0x7fe) /* MaxTop - MinTop. */
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
-{
- return sv_call_f64 (log, x, y, cmp);
-}
-
-/* SVE port of AdvSIMD log algorithm.
- Maximum measured error is 2.17 ulp:
- SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
- want 0x1.ffffff1cca045p-2. */
-svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
-{
- svuint64_t ix = svreinterpret_u64 (x);
- svuint64_t top = svlsr_x (pg, ix, 52);
- svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop));
-
- /* x = 2^k z; where z is in range [Off,2*Off) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
- /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
- The actual value of i is double this due to table layout. */
- svuint64_t i
- = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
- svint64_t k
- = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
- svfloat64_t z = svreinterpret_f64 (iz);
- /* Lookup in 2 global lists (length N). */
- svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
- svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
-
- /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- svfloat64_t r = svmad_x (pg, invc, z, -1);
- svfloat64_t kd = svcvt_f64_x (pg, k);
- /* hi = r + log(c) + k*Ln2. */
- svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2);
- /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = svmla_x (pg, P (2), r, P (3));
- svfloat64_t p = svmla_x (pg, P (0), r, P (1));
- y = svmla_x (pg, y, r2, P (4));
- y = svmla_x (pg, p, r2, y);
-
- if (unlikely (svptest_any (pg, cmp)))
- return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp);
- return svmla_x (pg, hi, r2, y);
-}
-
-PL_SIG (SV, D, 1, log, 0.01, 11.1)
-PL_TEST_ULP (SV_NAME_D1 (log), 1.68)
-PL_TEST_INTERVAL (SV_NAME_D1 (log), -0.0, -inf, 1000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log), 0, 0x1p-149, 1000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log), 1.0, 100, 50000)
-PL_TEST_INTERVAL (SV_NAME_D1 (log), 100, inf, 50000)
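
The comment about i being "double this due to table layout" deserves spelling out: invc and logc are interleaved, so a gather over the flat table must step two doubles per entry. A small sketch of the equivalence, with an illustrative TABLE_BITS:

    #include <assert.h>
    #include <stdint.h>

    #define TABLE_BITS 7     /* stands in for V_LOG_TABLE_BITS */
    #define N (1 << TABLE_BITS)

    static uint64_t
    log_table_index_sketch (uint64_t tmp)
    {
      /* Shifting one bit less and masking with (N - 1) << 1 yields the
         doubled index in a single shift/and pair. */
      uint64_t i = (tmp >> (51 - TABLE_BITS)) & ((N - 1) << 1);
      assert (i == 2 * ((tmp >> (52 - TABLE_BITS)) % N));
      return i;
    }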
diff --git a/pl/math/sv_tan_3u5.c b/pl/math/sv_tan_3u5.c
deleted file mode 100644
index 746396e98a10..000000000000
--- a/pl/math/sv_tan_3u5.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Double-precision SVE tan(x) function.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "poly_sve_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- double poly[9];
- double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift;
-} data = {
- /* Polynomial generated with FPMinimax. */
- .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
- 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
- 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11,
- 0x1.4e4fd14147622p-12, },
- .half_pi_hi = 0x1.921fb54442d18p0,
- .half_pi_lo = 0x1.1a62633145c07p-54,
- .inv_half_pi = 0x1.45f306dc9c883p-1,
- .range_val = 0x1p23,
- .shift = 0x1.8p52,
-};
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
-{
- return sv_call_f64 (tan, x, y, special);
-}
-
-/* Vector approximation for double-precision tan.
- Maximum measured error is 3.48 ULP:
- _ZGVsMxv_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
- want -0x1.f6ccd8ecf7deap+37. */
-svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
-{
- const struct data *dat = ptr_barrier (&data);
-
- /* Invert condition to catch NaNs and Infs as well as large values. */
- svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
-
- /* q = nearest integer to 2 * x / pi. */
- svfloat64_t shift = sv_f64 (dat->shift);
- svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi);
- q = svsub_x (pg, q, shift);
- svint64_t qi = svcvt_s64_x (pg, q);
-
- /* Use q to reduce x to r in [-pi/4, pi/4], by:
- r = x - q * pi/2, in extended precision. */
- svfloat64_t r = x;
- svfloat64_t half_pi = svld1rq (svptrue_b64 (), &dat->half_pi_hi);
- r = svmls_lane (r, q, half_pi, 0);
- r = svmls_lane (r, q, half_pi, 1);
- /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
- formula. */
- r = svmul_x (pg, r, 0.5);
-
- /* Approximate tan(r) using order 8 polynomial.
- tan(x) is odd, so polynomial has the form:
-   tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
- Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
- Then compute the approximation by:
- tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t r4 = svmul_x (pg, r2, r2);
- svfloat64_t r8 = svmul_x (pg, r4, r4);
- /* Offset the coefficient array by 1 to evaluate from C1 onwards. */
- svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1);
- p = svmad_x (pg, p, r2, dat->poly[0]);
- p = svmla_x (pg, r, r2, svmul_x (pg, p, r));
-
- /* Recombination uses double-angle formula:
- tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
- and reciprocity around pi/2:
- tan(x) = 1 / (tan(pi/2 - x))
- to assemble result using change-of-sign and conditional selection of
- numerator/denominator dependent on odd/even-ness of q (hence quadrant). */
- svbool_t use_recip
- = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0);
-
- svfloat64_t n = svmad_x (pg, p, p, -1);
- svfloat64_t d = svmul_x (pg, p, 2);
- svfloat64_t swap = n;
- n = svneg_m (n, use_recip, d);
- d = svsel (use_recip, swap, d);
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special);
- return svdiv_x (pg, n, d);
-}
-
-PL_SIG (SV, D, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (SV_NAME_D1 (tan), 2.99)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0, 0x1p23, 500000)
-PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0x1p23, inf, 5000)
diff --git a/pl/math/sv_tanhf_2u6.c b/pl/math/sv_tanhf_2u6.c
deleted file mode 100644
index 988a56de0b2e..000000000000
--- a/pl/math/sv_tanhf_2u6.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Single-precision SVE tanh(x) function.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#include "sv_expm1f_inline.h"
-
-static const struct data
-{
- struct sv_expm1f_data expm1f_consts;
- uint32_t boring_bound, onef;
-} data = {
- .expm1f_consts = SV_EXPM1F_DATA,
- /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
- .boring_bound = 0x41102cb3,
- .onef = 0x3f800000,
-};
-
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (tanhf, x, y, special);
-}
-
-/* Approximation for single-precision SVE tanh(x), using a simplified
- version of expm1f. The maximum error is 2.57 ULP:
- _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5
- want 0x1.fb71aap-5. */
-svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
-
- svfloat32_t ax = svabs_x (pg, x);
- svuint32_t iax = svreinterpret_u32 (ax);
- svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
- svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
- svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
-
- svbool_t special = svcmpgt (pg, iax, 0x7f800000);
-
- /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
- svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
- if (unlikely (svptest_any (pg, special)))
- return special_case (x, svsel_f32 (is_boring, boring, y), special);
- return svsel_f32 (is_boring, boring, y);
-}
-
-PL_SIG (SV, F, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (SV_NAME_F1 (tanh), 2.07)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0, 0x1p-23, 1000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
-PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
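
The core identity is short enough to state directly. A scalar sketch with libm's expm1f standing in for the inlined helper and the boring/special lanes ignored:

    #include <math.h>

    /* tanh(x) = (e^2x - 1) / (e^2x + 1) = q / (q + 2), with q = expm1(2x). */
    static float
    tanhf_sketch (float x)
    {
      float q = expm1f (2.0f * x);
      return q / (q + 2.0f);
    }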
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
deleted file mode 100644
index f2710a979d40..000000000000
--- a/pl/math/test/mathbench_funcs.h
+++ /dev/null
@@ -1,87 +0,0 @@
-// clang-format off
-/*
- * Function entries for mathbench.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#define _ZSF1(fun, a, b) F(fun##f, a, b)
-#define _ZSD1(f, a, b) D(f, a, b)
-
-#if defined(__vpcs) && __aarch64__
-
-#define _ZVF1(fun, a, b) VNF(_ZGVnN4v_##fun##f, a, b)
-#define _ZVD1(f, a, b) VND(_ZGVnN2v_##f, a, b)
-
-#else
-
-#define _ZVF1(f, a, b)
-#define _ZVD1(f, a, b)
-
-#endif
-
-#if WANT_SVE_MATH
-
-#define _ZSVF1(fun, a, b) SVF(_ZGVsMxv_##fun##f, a, b)
-#define _ZSVD1(f, a, b) SVD(_ZGVsMxv_##f, a, b)
-
-#else
-
-#define _ZSVF1(f, a, b)
-#define _ZSVD1(f, a, b)
-
-#endif
-
-/* No auto-generated wrappers for binary functions - they have to be
- manually defined in mathbench_wrappers.h. We have to define silent
- macros for them anyway as they will be emitted by PL_SIG. */
-#define _ZSF2(...)
-#define _ZSD2(...)
-#define _ZVF2(...)
-#define _ZVD2(...)
-#define _ZSVF2(...)
-#define _ZSVD2(...)
-
-#include "mathbench_funcs_gen.h"
-
-/* PL_SIG only emits entries for unary functions, since if a function
- needs to be wrapped in mathbench there is no way for it to know the
-   name of the wrapper. Add entries for binary functions, or any other
- exotic signatures that need wrapping, below. */
-
-{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
-{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}},
-{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}},
-
-{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}},
-{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}},
-{"_ZGVnN4vv_hypotf", 'f', 'n', -10.0, 10.0, {.vnf = _Z_hypotf_wrap}},
-{"_ZGVnN2vv_hypot", 'd', 'n', -10.0, 10.0, {.vnd = _Z_hypot_wrap}},
-{"_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = xy_Z_pow}},
-{"x_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = x_Z_pow}},
-{"y_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = y_Z_pow}},
-{"_ZGVnN4vl4l4_sincosf", 'f', 'n', -3.1, 3.1, {.vnf = _Z_sincosf_wrap}},
-{"_ZGVnN2vl8l8_sincos", 'd', 'n', -3.1, 3.1, {.vnd = _Z_sincos_wrap}},
-{"_ZGVnN4v_cexpif", 'f', 'n', -3.1, 3.1, {.vnf = _Z_cexpif_wrap}},
-{"_ZGVnN2v_cexpi", 'd', 'n', -3.1, 3.1, {.vnd = _Z_cexpi_wrap}},
-
-#if WANT_SVE_MATH
-{"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}},
-{"_ZGVsMxvv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}},
-{"_ZGVsMxvv_hypotf", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_hypotf_wrap}},
-{"_ZGVsMxvv_hypot", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_hypot_wrap}},
-{"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}},
-{"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}},
-{"_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = xy_Z_sv_powf}},
-{"x_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = x_Z_sv_powf}},
-{"y_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = y_Z_sv_powf}},
-{"_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = xy_Z_sv_pow}},
-{"x_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = x_Z_sv_pow}},
-{"y_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = y_Z_sv_pow}},
-{"_ZGVsMxvl4l4_sincosf", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_sincosf_wrap}},
-{"_ZGVsMxvl8l8_sincos", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_sincos_wrap}},
-{"_ZGVsMxv_cexpif", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_cexpif_wrap}},
-{"_ZGVsMxv_cexpi", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_cexpi_wrap}},
-#endif
- // clang-format on
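
These entries rely on the X-macro pattern: mathbench defines the emitter macros (F, D, VNF, ...) before including this header, so each line expands into a table row. A hypothetical miniature of the idea; the struct and macro here are illustrative, not mathbench's actual definitions:

    #include <math.h>

    struct fun { const char *name; double lo, hi; double (*fun) (double); };

    #define D(func, a, b) { #func, a, b, func },
    static const struct fun funcs[] = {
      D (exp, -9.9, 9.9)
      D (log, 0.01, 11.1)
    };
    #undef D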
diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h
deleted file mode 100644
index fe7f8963cdee..000000000000
--- a/pl/math/test/mathbench_wrappers.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Function wrappers for mathbench.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-static double
-atan2_wrap (double x)
-{
- return atan2 (5.0, x);
-}
-
-static float
-atan2f_wrap (float x)
-{
- return atan2f (5.0f, x);
-}
-
-static double
-powi_wrap (double x)
-{
- return __builtin_powi (x, (int) round (x));
-}
-
-#if __aarch64__ && defined(__vpcs)
-
-__vpcs static v_double
-_Z_atan2_wrap (v_double x)
-{
- return _ZGVnN2vv_atan2 (v_double_dup (5.0), x);
-}
-
-__vpcs static v_float
-_Z_atan2f_wrap (v_float x)
-{
- return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x);
-}
-
-__vpcs static v_float
-_Z_hypotf_wrap (v_float x)
-{
- return _ZGVnN4vv_hypotf (v_float_dup (5.0f), x);
-}
-
-__vpcs static v_double
-_Z_hypot_wrap (v_double x)
-{
- return _ZGVnN2vv_hypot (v_double_dup (5.0), x);
-}
-
-__vpcs static v_double
-xy_Z_pow (v_double x)
-{
- return _ZGVnN2vv_pow (x, x);
-}
-
-__vpcs static v_double
-x_Z_pow (v_double x)
-{
- return _ZGVnN2vv_pow (x, v_double_dup (23.4));
-}
-
-__vpcs static v_double
-y_Z_pow (v_double x)
-{
- return _ZGVnN2vv_pow (v_double_dup (2.34), x);
-}
-
-__vpcs static v_float
-_Z_sincosf_wrap (v_float x)
-{
- v_float s, c;
- _ZGVnN4vl4l4_sincosf (x, &s, &c);
- return s + c;
-}
-
-__vpcs static v_float
-_Z_cexpif_wrap (v_float x)
-{
- __f32x4x2_t sc = _ZGVnN4v_cexpif (x);
- return sc.val[0] + sc.val[1];
-}
-
-__vpcs static v_double
-_Z_sincos_wrap (v_double x)
-{
- v_double s, c;
- _ZGVnN2vl8l8_sincos (x, &s, &c);
- return s + c;
-}
-
-__vpcs static v_double
-_Z_cexpi_wrap (v_double x)
-{
- __f64x2x2_t sc = _ZGVnN2v_cexpi (x);
- return sc.val[0] + sc.val[1];
-}
-
-#endif // __aarch64__ && __vpcs
-
-#if WANT_SVE_MATH
-
-static sv_float
-_Z_sv_atan2f_wrap (sv_float x, sv_bool pg)
-{
- return _ZGVsMxvv_atan2f (x, svdup_f32 (5.0f), pg);
-}
-
-static sv_double
-_Z_sv_atan2_wrap (sv_double x, sv_bool pg)
-{
- return _ZGVsMxvv_atan2 (x, svdup_f64 (5.0), pg);
-}
-
-static sv_float
-_Z_sv_hypotf_wrap (sv_float x, sv_bool pg)
-{
- return _ZGVsMxvv_hypotf (x, svdup_f32 (5.0), pg);
-}
-
-static sv_double
-_Z_sv_hypot_wrap (sv_double x, sv_bool pg)
-{
- return _ZGVsMxvv_hypot (x, svdup_f64 (5.0), pg);
-}
-
-static sv_float
-_Z_sv_powi_wrap (sv_float x, sv_bool pg)
-{
- return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg);
-}
-
-static sv_double
-_Z_sv_powk_wrap (sv_double x, sv_bool pg)
-{
- return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg);
-}
-
-static sv_float
-xy_Z_sv_powf (sv_float x, sv_bool pg)
-{
- return _ZGVsMxvv_powf (x, x, pg);
-}
-
-static sv_float
-x_Z_sv_powf (sv_float x, sv_bool pg)
-{
- return _ZGVsMxvv_powf (x, svdup_f32 (23.4f), pg);
-}
-
-static sv_float
-y_Z_sv_powf (sv_float x, sv_bool pg)
-{
- return _ZGVsMxvv_powf (svdup_f32 (2.34f), x, pg);
-}
-
-static sv_double
-xy_Z_sv_pow (sv_double x, sv_bool pg)
-{
- return _ZGVsMxvv_pow (x, x, pg);
-}
-
-static sv_double
-x_Z_sv_pow (sv_double x, sv_bool pg)
-{
- return _ZGVsMxvv_pow (x, svdup_f64 (23.4), pg);
-}
-
-static sv_double
-y_Z_sv_pow (sv_double x, sv_bool pg)
-{
- return _ZGVsMxvv_pow (svdup_f64 (2.34), x, pg);
-}
-
-static sv_float
-_Z_sv_sincosf_wrap (sv_float x, sv_bool pg)
-{
- float s[svcntw ()], c[svcntw ()];
- _ZGVsMxvl4l4_sincosf (x, s, c, pg);
-  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
-}
-
-static sv_float
-_Z_sv_cexpif_wrap (sv_float x, sv_bool pg)
-{
- svfloat32x2_t sc = _ZGVsMxv_cexpif (x, pg);
- return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
-}
-
-static sv_double
-_Z_sv_sincos_wrap (sv_double x, sv_bool pg)
-{
- double s[svcntd ()], c[svcntd ()];
- _ZGVsMxvl8l8_sincos (x, s, c, pg);
-  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c));
-}
-
-static sv_double
-_Z_sv_cexpi_wrap (sv_double x, sv_bool pg)
-{
- svfloat64x2_t sc = _ZGVsMxv_cexpi (x, pg);
- return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
-}
-
-#endif // WANT_SVE_MATH
diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h
deleted file mode 100644
index e7ed4eed634e..000000000000
--- a/pl/math/test/pl_test.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * PL macros for emitting various details about routines for consumption by
- * runulp.sh.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-/* Emit the max ULP threshold, l, for routine f. Piggy-back PL_TEST_EXPECT_FENV
- on PL_TEST_ULP to add EXPECT_FENV to all scalar routines. */
-#if WANT_VMATH || defined(IGNORE_SCALAR_FENV)
-# define PL_TEST_ULP(f, l) PL_TEST_ULP f l
-#else
-# define PL_TEST_ULP(f, l) \
- PL_TEST_EXPECT_FENV_ALWAYS (f) \
- PL_TEST_ULP f l
-#endif
-
-/* Emit routine name if e == 1 and f is expected to correctly trigger fenv
- exceptions. e allows declaration to be emitted conditionally upon certain
- build flags - defer expansion by one pass to allow those flags to be expanded
- properly. */
-#define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e)
-#define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f)
-#define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f
-#define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1)
-
-#define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n
-#define PL_TEST_SYM_INTERVAL(f, lo, hi, n) \
- PL_TEST_INTERVAL (f, lo, hi, n) \
- PL_TEST_INTERVAL (f, -lo, -hi, n)
-#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c
-#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c) \
- PL_TEST_INTERVAL_C (f, lo, hi, n, c) \
- PL_TEST_INTERVAL_C (f, -lo, -hi, n, c)
-// clang-format off
-#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) \
- PL_TEST_INTERVAL f xlo,ylo xhi,yhi n
-// clang-format on
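
Note these macros expand to bare token rows rather than C: test sources are run through the preprocessor and the emitted lines are harvested with grep by runulp.sh (below). Assuming the usual SV_NAME_D1 name expansion, an annotation such as

    PL_TEST_ULP (SV_NAME_D1 (log2), 2.09)

survives preprocessing as

    PL_TEST_ULP _ZGVsMxv_log2 2.09

which the script picks out of $LIMITS to learn the ULP threshold for that routine.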
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
deleted file mode 100755
index 0f5a41f76b25..000000000000
--- a/pl/math/test/runulp.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-
-# ULP error check script.
-#
-# Copyright (c) 2019-2023, Arm Limited.
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-#set -x
-set -eu
-
-# cd to bin directory.
-cd "${0%/*}"
-
-flags="${ULPFLAGS:--q}"
-emu="$@"
-
-# Enable SVE testing
-WANT_SVE_MATH=${WANT_SVE_MATH:-0}
-
-FAIL=0
-PASS=0
-
-t() {
- routine=$1
- L=$(cat $LIMITS | grep "^$routine " | awk '{print $2}')
- [[ $L =~ ^[0-9]+\.[0-9]+$ ]]
- extra_flags=
- [[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5"
- grep -q "^$routine$" $FENV || extra_flags="$extra_flags -f"
- IFS=',' read -ra LO <<< "$2"
- IFS=',' read -ra HI <<< "$3"
- ITV="${LO[0]} ${HI[0]}"
- for i in "${!LO[@]}"; do
- [[ "$i" -eq "0" ]] || ITV="$ITV x ${LO[$i]} ${HI[$i]}"
- done
- # Add -z flag to ignore zero sign for vector routines
- { echo $routine | grep -q "ZGV"; } && extra_flags="$extra_flags -z"
- $emu ./ulp -e $L $flags ${extra_flags} $routine $ITV $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
-}
-
-check() {
- $emu ./ulp -f -q "$@" #>/dev/null
-}
-
-if [ "$FUNC" == "atan2" ] || [ -z "$FUNC" ]; then
- # Regression-test for correct NaN handling in atan2
- check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000
- check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan
- check atan2 nan nan x -nan -nan
-fi
-
-# vector functions
-flags="${ULPFLAGS:--q}"
-runsv=
-if [ $WANT_SVE_MATH -eq 1 ]; then
-# No guarantees about powi accuracy, so regression-test for exactness
-# w.r.t. the custom reference impl in ulp_wrappers.h
-check -q -f -e 0 _ZGVsMxvv_powi 0 inf x 0 1000 100000 && runsv=1
-check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x 0 1000 100000 && runsv=1
-check -q -f -e 0 _ZGVsMxvv_powi 0 inf x -0 -1000 100000 && runsv=1
-check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x -0 -1000 100000 && runsv=1
-check -q -f -e 0 _ZGVsMxvv_powk 0 inf x 0 1000 100000 && runsv=1
-check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x 0 1000 100000 && runsv=1
-check -q -f -e 0 _ZGVsMxvv_powk 0 inf x -0 -1000 100000 && runsv=1
-check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x -0 -1000 100000 && runsv=1
-fi
-
-while read F LO HI N C
-do
- t $F $LO $HI $N $C
-done << EOF
-$(cat $INTERVALS | grep "\b$FUNC\b")
-EOF
-
-[ 0 -eq $FAIL ] || {
- echo "FAILED $FAIL PASSED $PASS"
- exit 1
-}
diff --git a/pl/math/test/testcases/directed/erff.tst b/pl/math/test/testcases/directed/erff.tst
deleted file mode 100644
index 9b1d3d5114ae..000000000000
--- a/pl/math/test/testcases/directed/erff.tst
+++ /dev/null
@@ -1,17 +0,0 @@
-; erff.tst
-;
-; Copyright (c) 2007-2023, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-func=erff op1=7fc00001 result=7fc00001 errno=0
-func=erff op1=ffc00001 result=7fc00001 errno=0
-func=erff op1=7f800001 result=7fc00001 errno=0 status=i
-func=erff op1=ff800001 result=7fc00001 errno=0 status=i
-func=erff op1=7f800000 result=3f800000 errno=0
-func=erff op1=ff800000 result=bf800000 errno=0
-func=erff op1=00000000 result=00000000 errno=ERANGE
-func=erff op1=80000000 result=80000000 errno=ERANGE
-func=erff op1=00000001 result=00000001 errno=0 status=ux
-func=erff op1=80000001 result=80000001 errno=0 status=ux
-func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0
-func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0
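
Each directed case gives the operand and expected result as raw IEEE-754 bit patterns; digits after a dot in a result (3f57bb3d.3a0) are extra fraction bits used to judge rounding, and status names the fenv flags expected here (i invalid, ux underflow+inexact). A minimal decode of one operand, just to make the encoding concrete:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      uint32_t op1 = 0x3f800000; /* op1 of the erf(1) case above: 1.0f */
      float x;
      memcpy (&x, &op1, sizeof x);
      printf ("%a\n", (double) x); /* prints 0x1p+0 */
      return 0;
    }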
diff --git a/pl/math/test/testcases/directed/log2.tst b/pl/math/test/testcases/directed/log2.tst
deleted file mode 100644
index 5d1eb9b877e8..000000000000
--- a/pl/math/test/testcases/directed/log2.tst
+++ /dev/null
@@ -1,21 +0,0 @@
-; Directed test cases for log2
-;
-; Copyright (c) 2018-2023, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
-func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
-func=log2 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
-func=log2 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
-func=log2 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
-func=log2 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
-func=log2 op1=7fefffff.ffffffff result=408fffff.ffffffff.ffa errno=0
-func=log2 op1=ffefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i
-func=log2 op1=3ff00000.00000000 result=00000000.00000000 errno=0
-func=log2 op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
-func=log2 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z
-func=log2 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z
-func=log2 op1=00000000.00000001 result=c090c800.00000000 errno=0
-func=log2 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i
-func=log2 op1=40000000.00000000 result=3ff00000.00000000 errno=0
-func=log2 op1=3fe00000.00000000 result=bff00000.00000000 errno=0
diff --git a/pl/math/test/testcases/directed/log2f.tst b/pl/math/test/testcases/directed/log2f.tst
deleted file mode 100644
index 4e08110878d6..000000000000
--- a/pl/math/test/testcases/directed/log2f.tst
+++ /dev/null
@@ -1,27 +0,0 @@
-; log2f.tst - Directed test cases for log2f
-;
-; Copyright (c) 2017-2023, Arm Limited.
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-func=log2f op1=7fc00001 result=7fc00001 errno=0
-func=log2f op1=ffc00001 result=7fc00001 errno=0
-func=log2f op1=7f800001 result=7fc00001 errno=0 status=i
-func=log2f op1=ff800001 result=7fc00001 errno=0 status=i
-func=log2f op1=ff810000 result=7fc00001 errno=0 status=i
-func=log2f op1=7f800000 result=7f800000 errno=0
-func=log2f op1=ff800000 result=7fc00001 errno=EDOM status=i
-func=log2f op1=3f800000 result=00000000 errno=0
-func=log2f op1=00000000 result=ff800000 errno=ERANGE status=z
-func=log2f op1=80000000 result=ff800000 errno=ERANGE status=z
-func=log2f op1=80000001 result=7fc00001 errno=EDOM status=i
-
-func=log2f op1=3f7d70a4 result=bc6d8f8b.7d4 error=0
-func=log2f op1=3f604189 result=be4394c8.395 error=0
-func=log2f op1=3f278034 result=bf1caa73.88e error=0
-func=log2f op1=3edd3c36 result=bf9af3b9.619 error=0
-func=log2f op1=3e61259a result=c00bdb95.650 error=0
-func=log2f op1=3f8147ae result=3c6b3267.d6a error=0
-func=log2f op1=3f8fbe77 result=3e2b5fe2.a1c error=0
-func=log2f op1=3fac3eea result=3edb4d5e.1fc error=0
-func=log2f op1=3fd6e632 result=3f3f5d3a.827 error=0
-func=log2f op1=40070838 result=3f89e055.a0a error=0
diff --git a/pl/math/test/testcases/random/double.tst b/pl/math/test/testcases/random/double.tst
deleted file mode 100644
index d83283ef7864..000000000000
--- a/pl/math/test/testcases/random/double.tst
+++ /dev/null
@@ -1,6 +0,0 @@
-!! double.tst - Random test case specification for DP functions
-!!
-!! Copyright (c) 1999-2023, Arm Limited.
-!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-test log10 10000
diff --git a/pl/math/test/testcases/random/float.tst b/pl/math/test/testcases/random/float.tst
deleted file mode 100644
index fa77efecfabb..000000000000
--- a/pl/math/test/testcases/random/float.tst
+++ /dev/null
@@ -1,8 +0,0 @@
-!! float.tst - Random test case specification for SP functions
-!!
-!! Copyright (c) 2022-2023, Arm Limited.
-!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-
-test erff 10000
-test log10f 10000
-test tanf 10000
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
deleted file mode 100644
index 4929b481ffe1..000000000000
--- a/pl/math/test/ulp_funcs.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Function entries for ulp.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#if defined(__vpcs) && __aarch64__
-
-#define _ZVF1(f) ZVF1 (f)
-#define _ZVD1(f) ZVD1 (f)
-#define _ZVF2(f) ZVF2 (f)
-#define _ZVD2(f) ZVD2 (f)
-
-#else
-
-#define _ZVF1(f)
-#define _ZVD1(f)
-#define _ZVF2(f)
-#define _ZVD2(f)
-
-#endif
-
-#if WANT_SVE_MATH
-
-#define _ZSVF1(f) ZSVF1 (f)
-#define _ZSVF2(f) ZSVF2 (f)
-#define _ZSVD1(f) ZSVD1 (f)
-#define _ZSVD2(f) ZSVD2 (f)
-
-#else
-
-#define _ZSVF1(f)
-#define _ZSVF2(f)
-#define _ZSVD1(f)
-#define _ZSVD2(f)
-
-#endif
-
-#define _ZSF1(f) F1 (f)
-#define _ZSF2(f) F2 (f)
-#define _ZSD1(f) D1 (f)
-#define _ZSD2(f) D2 (f)
-
-#include "ulp_funcs_gen.h"
-
-F (_ZGVnN4v_sincosf_sin, v_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0)
-F (_ZGVnN4v_sincosf_cos, v_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0)
-F (_ZGVnN4v_cexpif_sin, v_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0)
-F (_ZGVnN4v_cexpif_cos, v_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0)
-
-F (_ZGVnN2v_sincos_sin, v_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0)
-F (_ZGVnN2v_sincos_cos, v_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0)
-F (_ZGVnN2v_cexpi_sin, v_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0)
-F (_ZGVnN2v_cexpi_cos, v_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0)
-
-#if WANT_SVE_MATH
-F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0)
-F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0)
-
-F (_ZGVsMxv_sincosf_sin, sv_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0)
-F (_ZGVsMxv_sincosf_cos, sv_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0)
-F (_ZGVsMxv_cexpif_sin, sv_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0)
-F (_ZGVsMxv_cexpif_cos, sv_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0)
-
-F (_ZGVsMxv_sincos_sin, sv_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0)
-F (_ZGVsMxv_sincos_cos, sv_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0)
-F (_ZGVsMxv_cexpi_sin, sv_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0)
-F (_ZGVsMxv_cexpi_cos, sv_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0)
-#endif
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
deleted file mode 100644
index 0f7b68949c7b..000000000000
--- a/pl/math/test/ulp_wrappers.h
+++ /dev/null
@@ -1,140 +0,0 @@
-// clang-format off
-/*
- * Function wrappers for ulp.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#define _GNU_SOURCE
-#include <stdbool.h>
-#include <arm_neon.h>
-
-#if USE_MPFR
-static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
- mpfr_cos(y, x, r);
- return mpfr_sin(y, x, r);
-}
-static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
- mpfr_sin(y, x, r);
- return mpfr_cos(y, x, r);
-}
-static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) {
- mpfr_t y2;
- mpfr_init(y2);
- mpfr_trunc(y2, y);
- return mpfr_pow(ret, x, y2, rnd);
-}
-#endif
-
-/* Our implementations of powi/powk are too imprecise to verify
- against any established pow implementation. Instead we have the
- following simple implementation, against which it is enough to
- maintain bitwise reproducibility. Note the test framework expects
- the reference impl to be of higher precision than the function
- under test. For instance this means that the reference for
- double-precision powi will be passed a long double, so to check
- bitwise reproducibility we have to cast it back down to
- double. This is fine since a round-trip to higher precision and
- back down is correctly rounded. */
-#define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \
- static DBL_T __attribute__((unused)) NAME (DBL_T in_val, DBL_T y) \
- { \
- INT_T n = (INT_T) round (y); \
- FLT_T acc = 1.0; \
- bool want_recip = n < 0; \
- n = n < 0 ? -n : n; \
- \
- for (FLT_T c = in_val; n; c *= c, n >>= 1) \
- { \
- if (n & 0x1) \
- { \
- acc *= c; \
- } \
- } \
- if (want_recip) \
- { \
- acc = 1.0 / acc; \
- } \
- return acc; \
- }
-
-DECL_POW_INT_REF(ref_powif, double, float, int)
-DECL_POW_INT_REF(ref_powi, long double, double, int)
-
-#define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; }
-#define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; }
-#define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; }
-#define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; }
-
-#if defined(__vpcs) && __aarch64__
-
-#define ZVNF1_WRAP(func) ZVF1_WRAP(func)
-#define ZVNF2_WRAP(func) ZVF2_WRAP(func)
-#define ZVND1_WRAP(func) ZVD1_WRAP(func)
-#define ZVND2_WRAP(func) ZVD2_WRAP(func)
-
-#else
-
-#define ZVNF1_WRAP(func)
-#define ZVNF2_WRAP(func)
-#define ZVND1_WRAP(func)
-#define ZVND2_WRAP(func)
-
-#endif
-
-#define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); }
-#define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); }
-#define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); }
-#define ZSVD2_WRAP(func) static double Z_sv_##func(double x, double y) { return svretd(_ZGVsMxvv_##func(svargd(x), svargd(y), svptrue_b64())); }
-
-#if WANT_SVE_MATH
-
-#define ZSVNF1_WRAP(func) ZSVF1_WRAP(func)
-#define ZSVNF2_WRAP(func) ZSVF2_WRAP(func)
-#define ZSVND1_WRAP(func) ZSVD1_WRAP(func)
-#define ZSVND2_WRAP(func) ZSVD2_WRAP(func)
-
-#else
-
-#define ZSVNF1_WRAP(func)
-#define ZSVNF2_WRAP(func)
-#define ZSVND1_WRAP(func)
-#define ZSVND2_WRAP(func)
-
-#endif
-
-/* No wrappers for scalar routines, but PL_SIG will emit them. */
-#define ZSNF1_WRAP(func)
-#define ZSNF2_WRAP(func)
-#define ZSND1_WRAP(func)
-#define ZSND2_WRAP(func)
-
-#include "ulp_wrappers_gen.h"
-
-float v_sincosf_sin(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return s[0]; }
-float v_sincosf_cos(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return c[0]; }
-float v_cexpif_sin(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[0][0]; }
-float v_cexpif_cos(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[1][0]; }
-
-double v_sincos_sin(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return s[0]; }
-double v_sincos_cos(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return c[0]; }
-double v_cexpi_sin(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[0][0]; }
-double v_cexpi_cos(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[1][0]; }
-
-#if WANT_SVE_MATH
-static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_s32((int)round(y)), svptrue_b32())); }
-static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_s64((long)round(y)), svptrue_b64())); }
-
-float sv_sincosf_sin(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return s[0]; }
-float sv_sincosf_cos(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return c[0]; }
-float sv_cexpif_sin(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 0)); }
-float sv_cexpif_cos(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 1)); }
-
-double sv_sincos_sin(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return s[0]; }
-double sv_sincos_cos(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return c[0]; }
-double sv_cexpi_sin(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 0)); }
-double sv_cexpi_cos(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 1)); }
-
-#endif
-// clang-format on
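
The DECL_POW_INT_REF macro above is square-and-multiply in disguise. The same reference written out as plain, non-macro C for one type:

    #include <stdbool.h>

    static double
    ref_powi_plain (double x, int n)
    {
      bool want_recip = n < 0;
      unsigned m = want_recip ? -(unsigned) n : (unsigned) n;
      double acc = 1.0;
      /* Scan the exponent bits from the bottom, squaring the base each step. */
      for (double c = x; m; c *= c, m >>= 1)
        if (m & 1)
          acc *= c;
      return want_recip ? 1.0 / acc : acc;
    }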
diff --git a/pl/math/trigpi_references.c b/pl/math/trigpi_references.c
deleted file mode 100644
index 4b0514b6766a..000000000000
--- a/pl/math/trigpi_references.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Extended precision scalar reference functions for trigpi.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#define _GNU_SOURCE
-#include "math_config.h"
-#include "mathlib.h"
-
-long double
-sinpil (long double x)
-{
- /* sin(inf) should return nan, as defined by C23. */
- if (isinf (x))
- return __math_invalid (x);
-
- long double ax = fabsl (x);
-
- /* Return 0 for all values above 2^64 to prevent
- overflow when casting to uint64_t. */
- if (ax >= 0x1p64)
- return 0;
-
- /* All integer cases should return 0. */
- if (ax == (uint64_t) ax)
- return 0;
-
- return sinl (x * M_PIl);
-}
-
-long double
-cospil (long double x)
-{
- /* cos(inf) should return nan, as defined by C23. */
- if (isinf (x))
- return __math_invalid (x);
-
- long double ax = fabsl (x);
-
- if (ax >= 0x1p64)
- return 1;
-
- uint64_t m = (uint64_t) ax;
-
-  /* Integer values of cospi(x) should return +/-1.
-     The sign depends on whether x is odd or even. */
- if (m == ax)
- return (m & 1) ? -1 : 1;
-
-  /* Half-integer values (n + 0.5) should always return 0. */
- if (ax - 0.5 == m || ax + 0.5 == m)
- return 0;
-
- return cosl (ax * M_PIl);
-}
\ No newline at end of file
diff --git a/pl/math/v_asinh_3u5.c b/pl/math/v_asinh_3u5.c
deleted file mode 100644
index 4862bef94861..000000000000
--- a/pl/math/v_asinh_3u5.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Double-precision vector asinh(x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define A(i) v_f64 (__v_log_data.poly[i])
-#define N (1 << V_LOG_TABLE_BITS)
-
-const static struct data
-{
- float64x2_t poly[18];
- uint64x2_t off, huge_bound, abs_mask;
- float64x2_t ln2, tiny_bound;
-} data = {
- .off = V2 (0x3fe6900900000000),
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .huge_bound = V2 (0x5fe0000000000000),
- .tiny_bound = V2 (0x1p-26),
- .abs_mask = V2 (0x7fffffffffffffff),
- /* Even terms of polynomial s.t. asinh(x) is approximated by
- asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
- Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
- .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
- V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
- V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
- V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
- V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
- V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
- V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
- V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
- V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
-};
-
-static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (asinh, x, y, special);
-}
-
-struct entry
-{
- float64x2_t invc;
- float64x2_t logc;
-};
-
-static inline struct entry
-lookup (uint64x2_t i)
-{
- float64x2_t e0 = vld1q_f64 (
- &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
- float64x2_t e1 = vld1q_f64 (
- &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
- return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
-}
-
-static inline float64x2_t
-log_inline (float64x2_t x, const struct data *d)
-{
- /* Double-precision vector log, copied from ordinary vector log with some
- cosmetic modification and special-cases removed. */
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint64x2_t tmp = vsubq_u64 (ix, d->off);
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
- uint64x2_t iz
- = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
- float64x2_t z = vreinterpretq_f64_u64 (iz);
- struct entry e = lookup (tmp);
- float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
- float64x2_t kd = vcvtq_f64_s64 (k);
- float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
- float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t y = vfmaq_f64 (A (2), A (3), r);
- float64x2_t p = vfmaq_f64 (A (0), A (1), r);
- y = vfmaq_f64 (y, A (4), r2);
- y = vfmaq_f64 (p, y, r2);
- y = vfmaq_f64 (hi, y, r2);
- return y;
-}
-
-/* Double-precision implementation of vector asinh(x).
- asinh is very sensitive around 1, so it is impractical to devise a single
- low-cost algorithm which is sufficiently accurate on a wide range of input.
- Instead we use two different algorithms:
-   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
- = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
- where log(x) is an optimized log approximation, and P(x) is a polynomial
- shared with the scalar routine. The greatest observed error 3.29 ULP, in
- |x| >= 1:
- __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
- want 0x1.ffffcfd0e2352p-1. */
-VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- float64x2_t ax = vabsq_f64 (x);
- uint64x2_t iax = vreinterpretq_u64_f64 (ax);
-
- uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
- uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);
-
-#if WANT_SIMD_EXCEPT
- uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
- special = vorrq_u64 (special, tiny);
-#endif
-
- /* Option 1: |x| >= 1.
-     Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)).
- If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
- overflow, by setting special lanes to 1. These will be fixed later. */
- float64x2_t option_1 = v_f64 (0);
- if (likely (v_any_u64 (gt1)))
- {
-#if WANT_SIMD_EXCEPT
- float64x2_t xm = v_zerofy_f64 (ax, special);
-#else
- float64x2_t xm = ax;
-#endif
- option_1 = log_inline (
- vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
- }
-
- /* Option 2: |x| < 1.
- Compute asinh(x) using a polynomial.
- If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
- overflow, and tiny lanes, which will underflow, by setting them to 0. They
- will be fixed later, either by selecting x or falling back to the scalar
- special-case. The largest observed error in this region is 1.47 ULPs:
- __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
- want 0x1.c1d6bf874019cp-1. */
- float64x2_t option_2 = v_f64 (0);
- if (likely (v_any_u64 (vceqzq_u64 (gt1))))
- {
-#if WANT_SIMD_EXCEPT
- ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
-#endif
- float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
- z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
- z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
- float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
- option_2 = vfmaq_f64 (ax, p, x3);
-#if WANT_SIMD_EXCEPT
- option_2 = vbslq_f64 (tiny, x, option_2);
-#endif
- }
-
- /* Choose the right option for each lane. */
- float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
- /* Copy sign. */
- y = vbslq_f64 (d->abs_mask, y, x);
-
- if (unlikely (v_any_u64 (special)))
- return special_case (x, y, special);
- return y;
-}
-
-PL_SIG (V, D, 1, asinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (asinh), 2.80)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT)
-/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
- Ensures the v_sel is choosing the right option in all cases. */
-#define V_ASINH_INTERVAL(lo, hi, n) \
- PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0.5) \
- PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 2) \
- PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0x1p600)
-V_ASINH_INTERVAL (0, 0x1p-26, 50000)
-V_ASINH_INTERVAL (0x1p-26, 1, 50000)
-V_ASINH_INTERVAL (1, 0x1p511, 50000)
-V_ASINH_INTERVAL (0x1p511, inf, 40000)
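
The two-option split reads clearly in scalar form. A sketch using libm calls as stand-ins for the inlined log and the Remez polynomial; the log1p form of option 2 shows why |x| < 1 needs its own path, since the plain identity cancels badly as x -> 0:

    #include <math.h>

    static double
    asinh_sketch (double x)
    {
      double ax = fabs (x);
      /* Option 1: plain log identity, fine once |x| >= 1. */
      double opt1 = log (ax + sqrt (ax * ax + 1.0));
      /* Option 2: the routine evaluates ax + ax^3 * P(ax^2); the
         equivalent log1p form below stays well conditioned as ax -> 0. */
      double opt2 = log1p (ax + ax * ax / (1.0 + sqrt (ax * ax + 1.0)));
      return copysign (ax >= 1.0 ? opt1 : opt2, x);
    }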
diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c
deleted file mode 100644
index 1723ba90d2f3..000000000000
--- a/pl/math/v_asinhf_2u7.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Single-precision vector asinh(x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "v_log1pf_inline.h"
-
-#define SignMask v_u32 (0x80000000)
-
-const static struct data
-{
- struct v_log1pf_data log1pf_consts;
- uint32x4_t big_bound;
-#if WANT_SIMD_EXCEPT
- uint32x4_t tiny_bound;
-#endif
-} data = {
- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
- .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
-#if WANT_SIMD_EXCEPT
- .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
-#endif
-};
-
-static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (asinhf, x, y, special);
-}
-
-/* Single-precision implementation of vector asinh(x), using vector log1p.
- Worst-case error is 2.66 ULP, at roughly +/-0.25:
- __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
-VPCS_ATTR float32x4_t V_NAME_F1 (asinh) (float32x4_t x)
-{
- const struct data *dat = ptr_barrier (&data);
- uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
- float32x4_t ax = vreinterpretq_f32_u32 (iax);
- uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
- float32x4_t special_arg = x;
-
-#if WANT_SIMD_EXCEPT
- /* Sidestep tiny and large values to avoid inadvertently triggering
- under/overflow. */
- special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
- if (unlikely (v_any_u32 (special)))
- {
- ax = v_zerofy_f32 (ax, special);
- x = v_zerofy_f32 (x, special);
- }
-#endif
-
- /* asinh(x) = log(x + sqrt(x * x + 1)).
- For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
- float32x4_t d
- = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
- float32x4_t y = log1pf_inline (
- vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
-
- if (unlikely (v_any_u32 (special)))
- return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
- return vbslq_f32 (SignMask, x, y);
-}
-
-PL_SIG (V, F, 1, asinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_F1 (asinh), 2.17)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000)
-PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000)
-PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000)
-PL_TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000)
-PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000)
diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c
deleted file mode 100644
index f24667682dec..000000000000
--- a/pl/math/v_atan2_3u.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Double-precision vector atan2(y, x) function.
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f64.h"
-
-static const struct data
-{
- float64x2_t pi_over_2;
- float64x2_t poly[20];
-} data = {
- /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
- the interval [2**-1022, 1.0]. */
- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
- .pi_over_2 = V2 (0x1.921fb54442d18p+0),
-};
-
-#define SignMask v_u64 (0x8000000000000000)
-
-/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
-{
- return v_call2_f64 (atan2, y, x, ret, cmp);
-}
-
-/* Returns 1 if input is the bit representation of 0, infinity or nan. */
-static inline uint64x2_t
-zeroinfnan (uint64x2_t i)
-{
- /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
- return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
- v_u64 (2 * asuint64 (INFINITY) - 1));
-}
-
-/* Fast implementation of vector atan2.
- Maximum observed error is 2.8 ulps:
- _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
- got 0x1.92d628ab678ccp-1
- want 0x1.92d628ab678cfp-1. */
-float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
-{
- const struct data *data_ptr = ptr_barrier (&data);
-
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint64x2_t iy = vreinterpretq_u64_f64 (y);
-
- uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
-
- uint64x2_t sign_x = vandq_u64 (ix, SignMask);
- uint64x2_t sign_y = vandq_u64 (iy, SignMask);
- uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
-
- float64x2_t ax = vabsq_f64 (x);
- float64x2_t ay = vabsq_f64 (y);
-
- uint64x2_t pred_xlt0 = vcltzq_f64 (x);
- uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
-
- /* Set up z for call to atan. */
- float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
- float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
- float64x2_t z = vdivq_f64 (n, d);
-
- /* Work out the correct shift. */
- float64x2_t shift = vreinterpretq_f64_u64 (
- vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
- shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
- shift = vmulq_f64 (shift, data_ptr->pi_over_2);
-
- /* Calculate the polynomial approximation.
- Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
- full scheme to avoid underflow in x^16.
- The order 19 polynomial P approximates
- (atan(sqrt(x))-sqrt(x))/x^(3/2). */
- float64x2_t z2 = vmulq_f64 (z, z);
- float64x2_t x2 = vmulq_f64 (z2, z2);
- float64x2_t x4 = vmulq_f64 (x2, x2);
- float64x2_t x8 = vmulq_f64 (x4, x4);
- float64x2_t ret
- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
- v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
-
- /* Finalize. y = shift + z + z^3 * P(z^2). */
- ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
- ret = vaddq_f64 (ret, shift);
-
- /* Account for the sign of x and y. */
- ret = vreinterpretq_f64_u64 (
- veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
-
- if (unlikely (v_any_u64 (special_cases)))
- return special_case (y, x, ret, special_cases);
-
- return ret;
-}
-
-/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
-PL_SIG (V, D, 2, atan2)
-// TODO tighten this once __v_atan2 is fixed
-PL_TEST_ULP (V_NAME_D2 (atan2), 2.9)
-PL_TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000)
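
The shift and sign bookkeeping is the subtle part, so here it is scalar. A sketch ignoring the zero/inf/nan special cases, with libm's atan standing in for the polynomial:

    #include <math.h>

    static const double pi_over_2 = 0x1.921fb54442d18p+0;

    static double
    atan2_sketch (double y, double x)
    {
      double ay = fabs (y), ax = fabs (x);
      int swap = ay > ax;            /* atan(-ax/ay) = atan(ay/ax) - pi/2 */
      double z = swap ? -ax / ay : ay / ax;
      double shift = (x < 0.0 ? -2.0 : 0.0) + (swap ? 1.0 : 0.0);
      double ret = shift * pi_over_2 + atan (z);
      /* XOR of the argument signs flips the result for the lower quadrants. */
      return signbit (x) != signbit (y) ? -ret : ret;
    }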
diff --git a/pl/math/v_exp_data.c b/pl/math/v_exp_data.c
deleted file mode 100644
index fd01cf27606f..000000000000
--- a/pl/math/v_exp_data.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Scale values for vector exp and exp2
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* 2^(j/N), j=0..N-1, N=2^7=128. Copied from math/v_exp_data.c. */
-const uint64_t __v_exp_data[] = {
- 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
- 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
- 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
- 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b,
- 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0,
- 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea,
- 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa,
- 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96,
- 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd,
- 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990,
- 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715,
- 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1,
- 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7,
- 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c,
- 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d,
- 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de,
- 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7,
- 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f,
- 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429,
- 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09,
- 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225,
- 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf,
- 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74,
- 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f,
- 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62,
- 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad,
- 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db,
- 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6,
- 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50,
- 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323,
- 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d,
- 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a,
- 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb,
- 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a,
- 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c,
- 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5,
- 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c,
- 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398,
- 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f,
- 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83,
- 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
- 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
- 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
-};
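
The entries are not the raw doubles nearest 2^(j/N): each appears to store asuint64(2^(j/N)) with j's contribution to the exponent field pre-subtracted, so the exp routines can add the top bits of the rounded n back with one integer add. A hedged generator sketch; the real table was presumably produced with higher-precision arithmetic, so exp2 here may differ in the last bit:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      enum { TABLE_BITS = 7, N = 1 << TABLE_BITS };
      for (int j = 0; j < N; j++)
        {
          double s = exp2 ((double) j / N);
          uint64_t u;
          memcpy (&u, &s, 8);
          u -= (uint64_t) j << (52 - TABLE_BITS); /* fold out j's exponent */
          printf ("0x%016llx,\n", (unsigned long long) u);
        }
      return 0;
    }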
diff --git a/pl/math/v_exp_tail.h b/pl/math/v_exp_tail.h
deleted file mode 100644
index 903f1fd95717..000000000000
--- a/pl/math/v_exp_tail.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Constants for double-precision e^(x+tail) vector function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-#define C1_scal 0x1.fffffffffffd4p-2
-#define C2_scal 0x1.5555571d6b68cp-3
-#define C3_scal 0x1.5555576a59599p-5
-#define InvLn2_scal 0x1.71547652b82fep8 /* N/ln2. */
-#define Ln2hi_scal 0x1.62e42fefa39efp-9 /* ln2/N. */
-#define Ln2lo_scal 0x1.abc9e3b39803f3p-64
-
-#define N (1 << V_EXP_TAIL_TABLE_BITS)
-#define Tab __v_exp_tail_data
-#define IndexMask_scal (N - 1)
-#define Shift_scal 0x1.8p+52
-#define Thres_scal 704.0
diff --git a/pl/math/v_exp_tail_inline.h b/pl/math/v_exp_tail_inline.h
deleted file mode 100644
index 76ecc6b0a33a..000000000000
--- a/pl/math/v_exp_tail_inline.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Double-precision vector e^(x+tail) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#ifndef PL_MATH_V_EXP_TAIL_INLINE_H
-#define PL_MATH_V_EXP_TAIL_INLINE_H
-
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
-
-#ifndef WANT_V_EXP_TAIL_SPECIALCASE
-#error \
- "Cannot use v_exp_tail_inline.h without specifying whether you need the special case computation."
-#endif
-
-#define N (1 << V_EXP_TAIL_TABLE_BITS)
-
-static const struct data
-{
- float64x2_t poly[4];
-#if WANT_V_EXP_TAIL_SPECIALCASE
- float64x2_t big_bound, huge_bound;
-#endif
- float64x2_t shift, invln2, ln2_hi, ln2_lo;
-} data = {
-#if WANT_V_EXP_TAIL_SPECIALCASE
- .big_bound = V2 (704.0),
- .huge_bound = V2 (1280.0 * N),
-#endif
- .shift = V2 (0x1.8p52),
- .invln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
- .ln2_hi = V2 (0x1.62e42fefa39efp-9), /* ln2/N. */
- .ln2_lo = V2 (0x1.abc9e3b39803f3p-64),
- .poly = { V2 (1.0), V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
- V2 (0x1.5555576a59599p-5) },
-};
-
-static inline uint64x2_t
-lookup_sbits (uint64x2_t i)
-{
- return (uint64x2_t){__v_exp_tail_data[i[0]], __v_exp_tail_data[i[1]]};
-}
-
-#if WANT_V_EXP_TAIL_SPECIALCASE
-#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
-/* The following 2 biases, when combined, form the exponent bias:
- SpecialBias1 - SpecialBias2 = asuint64(1.0). */
-#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
-#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
-static float64x2_t VPCS_ATTR
-v_exp_tail_special_case (float64x2_t s, float64x2_t y, float64x2_t n,
- const struct data *d)
-{
- /* 2^(n/N) may overflow, so break it up into s1*s2. */
- uint64x2_t b = vandq_u64 (vclezq_f64 (n), SpecialOffset);
- float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
- float64x2_t s2 = vreinterpretq_f64_u64 (
- vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
- uint64x2_t oflow = vcagtq_f64 (n, d->huge_bound);
- float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
- float64x2_t r1 = vmulq_f64 (s1, s1);
- return vbslq_f64 (oflow, r1, r0);
-}
-#endif
-
-static inline float64x2_t VPCS_ATTR
-v_exp_tail_inline (float64x2_t x, float64x2_t xtail)
-{
- const struct data *d = ptr_barrier (&data);
-#if WANT_V_EXP_TAIL_SPECIALCASE
- uint64x2_t special = vcgtq_f64 (vabsq_f64 (x), d->big_bound);
-#endif
- /* n = round(x/(ln2/N)). */
- float64x2_t z = vfmaq_f64 (d->shift, x, d->invln2);
- uint64x2_t u = vreinterpretq_u64_f64 (z);
- float64x2_t n = vsubq_f64 (z, d->shift);
-
- /* r = x - n*ln2/N. */
- float64x2_t r = x;
- r = vfmsq_f64 (r, d->ln2_hi, n);
- r = vfmsq_f64 (r, d->ln2_lo, n);
-
- uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
- uint64x2_t i = vandq_u64 (u, v_u64 (N - 1));
-
- /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4, using Horner. */
- float64x2_t y = v_horner_3_f64 (r, d->poly);
- y = vfmaq_f64 (xtail, y, r);
-
- /* s = 2^(n/N). */
- u = lookup_sbits (i);
- float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
-
-#if WANT_V_EXP_TAIL_SPECIALCASE
- if (unlikely (v_any_u64 (special)))
- return v_exp_tail_special_case (s, y, n, d);
-#endif
- return vfmaq_f64 (s, y, s);
-}
-#endif // PL_MATH_V_EXP_TAIL_INLINE_H
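
The #error guard above documents the contract: a translation unit had to
define WANT_V_EXP_TAIL_SPECIALCASE before including this header. A
hypothetical includer, shown only to illustrate the pattern:

/* Caller guarantees |x| < 704 for all lanes, so the special-case code can
   be compiled out. */
#define WANT_V_EXP_TAIL_SPECIALCASE 0
#include "v_exp_tail_inline.h"

static float64x2_t
exp_with_tail_sketch (float64x2_t x, float64x2_t tail)
{
  return v_exp_tail_inline (x, tail);
}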
diff --git a/pl/math/v_expf_inline.h b/pl/math/v_expf_inline.h
deleted file mode 100644
index 166683726b4d..000000000000
--- a/pl/math/v_expf_inline.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Helper for single-precision routines which calculate exp(x) and do not
- * need special-case handling
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_V_EXPF_INLINE_H
-#define PL_MATH_V_EXPF_INLINE_H
-
-#include "v_math.h"
-
-struct v_expf_data
-{
- float32x4_t poly[5];
- float32x4_t shift, invln2_and_ln2;
-};
-
-/* maxerr: 1.45358 +0.5 ulp. */
-#define V_EXPF_DATA \
- { \
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
- .shift = V4 (0x1.8p23f), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
- }
-
-#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
-#define C(i) d->poly[i]
-
-static inline float32x4_t
-v_expf_inline (float32x4_t x, const struct v_expf_data *d)
-{
- /* Helper routine for calculating exp(x).
- Copied from v_expf.c, with all special-case handling removed - the
- calling routine should handle special values if required. */
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- float32x4_t n, r, z;
- z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
- n = vsubq_f32 (z, d->shift);
- r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
- r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
-
- /* Custom order-4 Estrin avoids building high order monomial. */
- float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t p, q, poly;
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
- q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
- return vfmaq_f32 (scale, poly, scale);
-}
-
-#endif // PL_MATH_V_EXPF_INLINE_H
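
The "custom order-4 Estrin" comment is easiest to see in scalar form. A
sketch of the same evaluation order (not part of the patch; C0..C4 are the
poly[0..4] values above):

/* poly(r) = C4 r + C3 r^2 + C2 r^3 + C1 r^4 + C0 r^5, evaluated with two
   FMA chains on r2 = r*r so that r^4 is never formed explicitly. */
static float
expf_poly_sketch (float r)
{
  const float C0 = 0x1.0e4020p-7f, C1 = 0x1.573e2ep-5f, C2 = 0x1.555e66p-3f,
	      C3 = 0x1.fffdb6p-2f, C4 = 0x1.ffffecp-1f;
  float r2 = r * r;
  float p = C1 + C0 * r;
  float q = C3 + C2 * r;
  q = q + p * r2;	/* C3 + C2 r + C1 r^2 + C0 r^3. */
  p = C4 * r;
  return p + q * r2;	/* poly(r); caller forms scale * (1 + poly). */
}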
diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c
deleted file mode 100644
index dd255472cec0..000000000000
--- a/pl/math/v_expm1_2u5.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Double-precision vector exp(x) - 1 function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- float64x2_t poly[11];
- float64x2_t invln2, ln2, shift;
- int64x2_t exponent_bias;
-#if WANT_SIMD_EXCEPT
- uint64x2_t thresh, tiny_bound;
-#else
- float64x2_t oflow_bound;
-#endif
-} data = {
- /* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
- .invln2 = V2 (0x1.71547652b82fep0),
- .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
- .shift = V2 (0x1.8p52),
- .exponent_bias = V2 (0x3ff0000000000000),
-#if WANT_SIMD_EXCEPT
- /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
- compare. */
- .thresh = V2 (0x78c56fa6d34b552),
- /* asuint64(0x1p-51) << 1. */
- .tiny_bound = V2 (0x3cc0000000000000 << 1),
-#else
- /* Value above which expm1(x) should overflow. Absolute value of the
- underflow bound is greater than this, so it catches both cases - there is
- a small window where fallbacks are triggered unnecessarily. */
- .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
-#endif
-};
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (expm1, x, y, special);
-}
-
-/* Double-precision vector exp(x) - 1 function.
- The maximum observed error is 2.18 ULP:
- _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
- want 0x1.a8b9ea8d66e2p-2. */
-float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
-
-#if WANT_SIMD_EXCEPT
- /* If fp exceptions are to be triggered correctly, fall back to scalar for
- |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
- shift-left by 1, and compare with thresh which was left-shifted offline -
- this is effectively an absolute compare. */
- uint64x2_t special
- = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
- if (unlikely (v_any_u64 (special)))
- x = v_zerofy_f64 (x, special);
-#else
- /* Large input, NaNs and Infs. */
- uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
-#endif
-
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (n);
- float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
- f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t f8 = vmulq_f64 (f4, f4);
- float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
- float64x2_t t = vreinterpretq_f64_s64 (u);
-
- if (unlikely (v_any_u64 (special)))
- return special_case (vreinterpretq_f64_u64 (ix),
- vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
- special);
-
- /* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
-}
-
-PL_SIG (V, D, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (V_NAME_D1 (expm1), 1.68)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100)
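
The WANT_SIMD_EXCEPT test above folds four range checks into a single
unsigned compare. The same trick in scalar form, as a sketch:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* True iff |x| < 0x1p-51, |x| > oflow_bound, or x is Inf/NaN. Doubling
   asuint64(x) shifts out the sign bit; inputs below TinyBound wrap around
   in the subtraction and also compare >= Thresh. */
static bool
expm1_is_special_sketch (double x)
{
  const uint64_t TinyBound = 0x3cc0000000000000ull << 1;
  const uint64_t Thresh = 0x78c56fa6d34b552ull;
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  return (ix + ix) - TinyBound >= Thresh;
}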
diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c
deleted file mode 100644
index 6b282d0cc00f..000000000000
--- a/pl/math/v_expm1f_1u6.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Single-precision vector exp(x) - 1 function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- float32x4_t poly[5];
- float32x4_t invln2_and_ln2;
- float32x4_t shift;
- int32x4_t exponent_bias;
-#if WANT_SIMD_EXCEPT
- uint32x4_t thresh;
-#else
- float32x4_t oflow_bound;
-#endif
-} data = {
- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
- .shift = V4 (0x1.8p23f),
- .exponent_bias = V4 (0x3f800000),
-#if !WANT_SIMD_EXCEPT
- /* Value above which expm1f(x) should overflow. Absolute value of the
- underflow bound is greater than this, so it catches both cases - there is
- a small window where fallbacks are triggered unnecessarily. */
- .oflow_bound = V4 (0x1.5ebc4p+6),
-#else
- /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
- compare. */
- .thresh = V4 (0x1d5ebc40),
-#endif
-};
-
-/* asuint(0x1p-23), shifted by 1 for abs compare. */
-#define TinyBound v_u32 (0x34000000 << 1)
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (expm1f, x, y, special);
-}
-
-/* Single-precision vector exp(x) - 1 function.
- The maximum error is 1.51 ULP:
- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
- want 0x1.e2fb94p-2. */
-float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
-
-#if WANT_SIMD_EXCEPT
- /* If fp exceptions are to be triggered correctly, fall back to scalar for
- |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
- shift-left by 1, and compare with thresh which was left-shifted offline -
- this is effectively an absolute compare. */
- uint32x4_t special
- = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
- if (unlikely (v_any_u32 (special)))
- x = v_zerofy_f32 (x, special);
-#else
- /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
- uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
-#endif
-
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float32x4_t j = vsubq_f32 (
- vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
- int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float32x4_t p = v_horner_4_f32 (f, d->poly);
- p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
- float32x4_t t = vreinterpretq_f32_s32 (u);
-
- if (unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
- special);
-
- /* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
-}
-
-PL_SIG (V, F, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (V_NAME_F1 (expm1), 1.02)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
-PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
-PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
-PL_TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
-PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
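
Both expm1 variants rebuild t = 2^i with integer arithmetic instead of an
ldexp call. In scalar single precision the step looks like this (a sketch,
valid while i stays in the normal exponent range):

#include <stdint.h>
#include <string.h>

/* asuint(2^i) = (i + 127) << 23 = (i << 23) + asuint(1.0f), so placing i
   in the exponent field yields 2^i directly. */
static float
pow2i_sketch (int32_t i)
{
  uint32_t u = ((uint32_t) i << 23) + 0x3f800000u; /* exponent_bias. */
  float t;
  memcpy (&t, &u, sizeof t);
  return t;
}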
diff --git a/pl/math/v_expm1f_inline.h b/pl/math/v_expm1f_inline.h
deleted file mode 100644
index 6ae94c452de2..000000000000
--- a/pl/math/v_expm1f_inline.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Helper for single-precision routines which calculate exp(x) - 1 and do not
- * need special-case handling
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_V_EXPM1F_INLINE_H
-#define PL_MATH_V_EXPM1F_INLINE_H
-
-#include "v_math.h"
-#include "math_config.h"
-#include "poly_advsimd_f32.h"
-
-struct v_expm1f_data
-{
- float32x4_t poly[5];
- float32x4_t invln2_and_ln2, shift;
- int32x4_t exponent_bias;
-};
-
-/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
- log(2)/2]. Exponent bias is asuint(1.0f).
- invln2_and_ln2 stores constants: invln2, ln2_hi, ln2_lo, 0. */
-#define V_EXPM1F_DATA \
- { \
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
- }
-
-static inline float32x4_t
-expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
-{
- /* Helper routine for calculating exp(x) - 1.
- Copied from v_expm1f_1u6.c, with all special-case handling removed - the
- calling routine should handle special values if required. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float32x4_t j = vsubq_f32 (
- vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
- int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
-
- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
- Horner. */
- float32x4_t f2 = vmulq_f32 (f, f);
- float32x4_t f4 = vmulq_f32 (f2, f2);
- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
- p = vfmaq_f32 (f, f2, p);
-
- /* t = 2^i. */
- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
- float32x4_t t = vreinterpretq_f32_s32 (u);
- /* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
-}
-
-#endif // PL_MATH_V_EXPM1F_INLINE_H
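
This helper was shared by callers that screen special lanes themselves. A
hypothetical sinhf-style consumer, sketched to show the intended pattern
(the identity matches the sinh routine later in this patch):

/* sinh(x) = (t + t/(t+1)) * halfsign, with t = expm1(|x|) and halfsign
   carrying 0.5 with the sign of x. */
static float32x4_t
sinhf_body_sketch (float32x4_t ax, float32x4_t halfsign,
		   const struct v_expm1f_data *d)
{
  float32x4_t t = expm1f_inline (ax, d);
  t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0f))));
  return vmulq_f32 (t, halfsign);
}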
diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c
deleted file mode 100644
index 35dd62fe5e3e..000000000000
--- a/pl/math/v_log10_2u5.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Double-precision vector log10(x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f64.h"
-
-#define N (1 << V_LOG10_TABLE_BITS)
-
-static const struct data
-{
- uint64x2_t min_norm;
- uint32x4_t special_bound;
- float64x2_t poly[5];
- float64x2_t invln10, log10_2, ln2;
- uint64x2_t sign_exp_mask;
-} data = {
- /* Computed from log coefficients divided by log(10) then rounded to double
- precision. */
- .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3),
- V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4),
- V2 (-0x1.287461742fee4p-4) },
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .invln10 = V2 (0x1.bcb7b1526e50ep-2),
- .log10_2 = V2 (0x1.34413509f79ffp-2),
- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
- .sign_exp_mask = V2 (0xfff0000000000000),
-};
-
-#define Off v_u64 (0x3fe6900900000000)
-#define IndexMask (N - 1)
-
-#define T(s, i) __v_log10_data.s[i]
-
-struct entry
-{
- float64x2_t invc;
- float64x2_t log10c;
-};
-
-static inline struct entry
-lookup (uint64x2_t i)
-{
- struct entry e;
- uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
- uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
- float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
- float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
- e.invc = vuzp1q_f64 (e0, e1);
- e.log10c = vuzp2q_f64 (e0, e1);
- return e;
-}
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
- uint32x2_t special)
-{
- return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special));
-}
-
-/* Fast implementation of double-precision vector log10
- is a slight modification of double-precision vector log.
- Max ULP error: < 2.5 ulp (nearest rounding).
- Maximum measured at 2.46 ulp for x in [0.96, 0.97]
- _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
- want 0x1.fff6be3cae4b9p-6. */
-float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
- vget_low_u32 (d->special_bound));
-
- /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- uint64x2_t tmp = vsubq_u64 (ix, Off);
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
- float64x2_t z = vreinterpretq_f64_u64 (iz);
-
- struct entry e = lookup (tmp);
-
- /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
- float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
- float64x2_t kd = vcvtq_f64_s64 (k);
-
- /* hi = r / log(10) + log10(c) + k*log10(2).
- Constants in v_log10_data.c are computed (in extended precision) as
- e.log10c := e.logc * ivln10. */
- float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10);
-
- /* y = log10(1+r) + n * log10(2). */
- float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2);
-
- /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
-
- if (unlikely (v_any_u32h (special)))
- return special_case (x, y, hi, r2, special);
- return vfmaq_f64 (hi, r2, y);
-}
-
-PL_SIG (V, D, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (V_NAME_D1 (log10), 1.97)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log10))
-PL_TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
-PL_TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
-PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)
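
The special-lane test above relies on a narrowing subtract (vsubhn) so the
comparison runs on 32-bit halves. Scalar sketch of the same check:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* True iff x is zero, subnormal, negative, Inf or NaN: the subtraction of
   asuint64(0x1p-1022) wraps for all of those, pushing the top 32 bits to
   or above asuint64(inf) - min_norm. */
static bool
log_is_special_sketch (double x)
{
  uint64_t ix;
  memcpy (&ix, &x, sizeof ix);
  uint32_t top = (uint32_t) ((ix - 0x0010000000000000ull) >> 32);
  return top >= 0x7fe00000u;
}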
diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c
deleted file mode 100644
index 92bc50ba5bd9..000000000000
--- a/pl/math/v_log10f_3u5.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Single-precision vector log10 function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- uint32x4_t min_norm;
- uint16x8_t special_bound;
- float32x4_t poly[8];
- float32x4_t inv_ln10, ln2;
- uint32x4_t off, mantissa_mask;
-} data = {
- /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
- [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
- .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f),
- V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f),
- V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .inv_ln10 = V4 (0x1.bcb7b2p-2f),
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
- .off = V4 (0x3f2aaaab), /* 0.666667. */
- .mantissa_mask = V4 (0x007fffff),
-};
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
- uint16x4_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
-}
-
-/* Fast implementation of AdvSIMD log10f,
- uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and
- an order 9 polynomial.
- Maximum error: 3.305 ulps (nearest rounding).
- _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
- want 0x1.ffe2f4p-4. */
-float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- uint32x4_t u = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
-
- /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
- float32x4_t n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
- float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
-
- /* y = log10(1+r) + n * log10(2). */
- float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly);
- /* y = (r + n*ln2) * inv_ln10 = r/ln(10) + n*log10(2). */
- float32x4_t y = vfmaq_f32 (r, d->ln2, n);
- y = vmulq_f32 (y, d->inv_ln10);
-
- if (unlikely (v_any_u16h (special)))
- return special_case (x, y, poly, r2, special);
- return vfmaq_f32 (y, poly, r2);
-}
-
-PL_SIG (V, F, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (V_NAME_F1 (log10), 2.81)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log10))
-PL_TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100)
-PL_TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100)
-PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000)
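
The reduction x = 2^n * (1+r) used by both log10f and log2f is pure integer
manipulation around the offset 2/3 (asuint 0x3f2aaaab). Scalar sketch:

#include <stdint.h>
#include <string.h>

/* Splits x into 2^n * (1+r) with 1+r in [2/3, 4/3); returns r and writes
   n. Re-biasing around 2/3 instead of 1 centres r on zero. */
static float
log_reduce_sketch (float x, int32_t *n)
{
  const uint32_t Off = 0x3f2aaaabu, MantissaMask = 0x007fffffu;
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  u -= Off;
  *n = (int32_t) u >> 23;	/* arithmetic shift sign-extends n. */
  u = (u & MantissaMask) + Off;
  float z;
  memcpy (&z, &u, sizeof z);
  return z - 1.0f;
}

For example, x = 0.5f gives n = -1 and r = 0, since 0.5 = 2^-1 * 1.0.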
diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c
deleted file mode 100644
index face02ddc6c3..000000000000
--- a/pl/math/v_log1p_2u5.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Double-precision vector log(1+x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-const static struct data
-{
- float64x2_t poly[19], ln2[2];
- uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one;
- int64x2_t one_top;
-} data = {
- /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2),
- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3),
- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3),
- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4),
- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4),
- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4),
- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4),
- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5),
- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4),
- V2 (-0x1.cfa7385bdb37ep-6) },
- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) },
- /* top32(asuint64(sqrt(2)/2)) << 32. */
- .hf_rt2_top = V2 (0x3fe6a09e00000000),
- /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
- .one_m_hf_rt2_top = V2 (0x00095f6200000000),
- .umask = V2 (0x000fffff00000000),
- .one_top = V2 (0x3ff),
- .inf = V2 (0x7ff0000000000000),
- .minus_one = V2 (0xbff0000000000000)
-};
-
-#define BottomMask v_u64 (0xffffffff)
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (log1p, x, y, special);
-}
-
-/* Vector log1p approximation using polynomial on reduced interval. Routine is
- a modification of the algorithm used in scalar log1p, with no shortcut for
- k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
- _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
- want 0x1.fd61d0727429fp+2. */
-VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
- uint64x2_t special = vcgeq_u64 (ia, d->inf);
-
-#if WANT_SIMD_EXCEPT
- special = vorrq_u64 (special,
- vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1))));
- if (unlikely (v_any_u64 (special)))
- x = v_zerofy_f64 (x, special);
-#else
- special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1)));
-#endif
-
- /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
- is in [sqrt(2)/2, sqrt(2)]):
- log1p(x) = k*log(2) + log1p(f).
-
- f may not be representable exactly, so we need a correction term:
- let m = round(1 + x), c = (1 + x) - m.
- c << m: at very small x, log1p(x) ~ x, hence:
- log(1+x) - log(m) ~ c/m.
-
- We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
-
- /* Obtain correctly scaled k by manipulation in the exponent.
- The scalar algorithm casts down to 32-bit at this point to calculate k and
- u_red. We stay in double-width to obtain f and k, using the same constants
- as the scalar algorithm but shifted left by 32. */
- float64x2_t m = vaddq_f64 (x, v_f64 (1));
- uint64x2_t mi = vreinterpretq_u64_f64 (m);
- uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
-
- int64x2_t ki
- = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
- float64x2_t k = vcvtq_f64_s64 (ki);
-
- /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
- uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
- uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
-
- /* Correction term c/m. */
- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
-
- /* Approximate log1p(x) on the reduced input using a polynomial. Because
- log1p(0)=0 we choose an approximation of the form:
- x + C0*x^2 + C1*x^3 + C2x^4 + ...
- Hence approximation has the form f + f^2 * P(f)
- where P(x) = C0 + C1*x + C2x^2 + ...
- Assembling this all correctly is dealt with at the final step. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
-
- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
- float64x2_t y = vaddq_f64 (ylo, yhi);
-
- if (unlikely (v_any_u64 (special)))
- return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p),
- special);
-
- return vfmaq_f64 (y, f2, p);
-}
-
-PL_SIG (V, D, 1, log1p, -0.9, 10.0)
-PL_TEST_ULP (V_NAME_D1 (log1p), 1.97)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000)
-PL_TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500)
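
The correction term is worth one worked line: with m = fl(1 + x) and
c = x - (m - 1), i.e. the rounding error incurred when forming 1 + x,
log(1 + x) = log(m + c) = log(m) + log1p(c/m) ~= log(m) + c/m, since |c/m|
is at most about one ulp. This is the cm term folded into ylo above.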
diff --git a/pl/math/v_log1p_inline.h b/pl/math/v_log1p_inline.h
deleted file mode 100644
index bd57bfc6fe6e..000000000000
--- a/pl/math/v_log1p_inline.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Helper for vector double-precision routines which calculate log(1 + x) and do
- * not need special-case handling
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#ifndef PL_MATH_V_LOG1P_INLINE_H
-#define PL_MATH_V_LOG1P_INLINE_H
-
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
-
-struct v_log1p_data
-{
- float64x2_t poly[19], ln2[2];
- uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
- int64x2_t one_top;
-};
-
-/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
-#define V_LOG1P_CONSTANTS_TABLE \
- { \
- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
- V2 (-0x1.cfa7385bdb37ep-6) }, \
- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
- .hf_rt2_top = V2 (0x3fe6a09e00000000), \
- .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
- .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
- }
-
-#define BottomMask v_u64 (0xffffffff)
-
-static inline float64x2_t
-log1p_inline (float64x2_t x, const struct v_log1p_data *d)
-{
- /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
- modifications:
- - No special-case handling - this should be dealt with by the caller.
- - Pairwise Horner polynomial evaluation for improved accuracy.
- - Optionally simulate the shortcut for k=0, used in the scalar routine,
- using v_sel, for improved accuracy when the argument to log1p is close to
- 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
- the source of the caller before including this file.
- See v_log1pf_2u1.c for details of the algorithm. */
- float64x2_t m = vaddq_f64 (x, v_f64 (1));
- uint64x2_t mi = vreinterpretq_u64_f64 (m);
- uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
-
- int64x2_t ki
- = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
- float64x2_t k = vcvtq_f64_s64 (ki);
-
- /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
- uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
- uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
-
- /* Correction term c/m. */
- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
-
-#ifndef WANT_V_LOG1P_K0_SHORTCUT
-#error \
- "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
-#elif WANT_V_LOG1P_K0_SHORTCUT
- /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
- that the approximation is solely the polynomial. */
- uint64x2_t k0 = vceqzq_f64 (k);
- cm = v_zerofy_f64 (cm, k0);
- f = vbslq_f64 (k0, x, f);
-#endif
-
- /* Approximate log1p(f) on the reduced input using a polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
-
- /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
- return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
-}
-
-#endif // PL_MATH_V_LOG1P_INLINE_H
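
A hypothetical includer, showing the k0-shortcut contract the #error above
enforces (sketch only; the atanh framing is illustrative):

/* Arguments to log1p can be arbitrarily close to 0 here, so opt in to the
   more accurate k=0 path before including the helper. */
#define WANT_V_LOG1P_K0_SHORTCUT 1
#include "v_log1p_inline.h"

static float64x2_t
atanh_core_sketch (float64x2_t y, float64x2_t halfsign,
		   const struct v_log1p_data *d)
{
  /* atanh(x) = 0.5 * log1p(2y/(1-y)) for y = |x|; the halfsign factor
     carries both the 0.5 and the sign of x. */
  float64x2_t arg
      = vdivq_f64 (vmulq_f64 (y, v_f64 (2.0)), vsubq_f64 (v_f64 (1.0), y));
  return vmulq_f64 (halfsign, log1p_inline (arg, d));
}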
diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c
deleted file mode 100644
index 153c88da9c88..000000000000
--- a/pl/math/v_log1pf_2u1.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Single-precision vector log(1+x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f32.h"
-
-const static struct data
-{
- float32x4_t poly[8], ln2;
- uint32x4_t tiny_bound, minus_one, four, thresh;
- int32x4_t three_quarters;
-} data = {
- .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
- (1, -0.5) are not stored as they can be generated more
- efficiently. */
- V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
- .minus_one = V4 (0xbf800000),
- .four = V4 (0x40800000),
- .three_quarters = V4 (0x3f400000)
-};
-
-static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *p)
-{
- /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
- float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
- float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
- float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
- float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
-
- float32x4_t m2 = vmulq_f32 (m, m);
- float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
- float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
- float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
-
- float32x4_t m4 = vmulq_f32 (m2, m2);
- float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
- return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
-}
-
-static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
- return v_call_f32 (log1pf, x, y, special);
-}
-
-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
- is roughly 2.02 ULP:
- log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
-VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- uint32x4_t special_cases
- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
- vcgeq_u32 (ix, d->minus_one));
- float32x4_t special_arg = x;
-
-#if WANT_SIMD_EXCEPT
- if (unlikely (v_any_u32 (special_cases)))
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- x = v_zerofy_f32 (x, special_cases);
-#endif
-
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
-
- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
-
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
- v_s32 (0xff800000));
- uint32x4_t ku = vreinterpretq_u32_s32 (k);
-
- /* Scale x by exponent manipulation. */
- float32x4_t m_scale
- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
-
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
-
- /* Evaluate polynomial on the reduced interval. */
- float32x4_t p = eval_poly (m_scale, d->poly);
-
- /* The scale factor to be applied back at the end - shifting k right by 23
- (equivalently, scaling float(k) by 2^-23) gives the unbiased exponent. */
- float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
-
- /* Apply the scaling back. */
- float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
-
- if (unlikely (v_any_u32 (special_cases)))
- return special_case (special_arg, y, special_cases);
- return y;
-}
-
-PL_SIG (V, F, 1, log1p, -0.9, 10.0)
-PL_TEST_ULP (V_NAME_F1 (log1p), 1.53)
-PL_TEST_EXPECT_FENV (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000)
-PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000)
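
The selection of k above is compact enough to deserve a scalar gloss.
Masking with 0xff800000 keeps only the sign and exponent bits of
asuint(m) - asuint(0.75), which is exactly k << 23 for the k that scales m
into [0.75, 1.5). Sketch:

#include <stdint.h>
#include <string.h>

/* Returns k << 23 such that m * 2^-k lies in [0.75, 1.5). */
static int32_t
log1pf_k_sketch (float m)
{
  int32_t mi;
  memcpy (&mi, &m, sizeof mi);
  return (mi - 0x3f400000) & (int32_t) 0xff800000;
}

For example m = 2.0f gives (0x40000000 - 0x3f400000) & 0xff800000
= 0x00800000, i.e. k = 1, and 2.0 * 2^-1 = 1.0 is inside [0.75, 1.5).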
diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h
deleted file mode 100644
index c654c6bad08f..000000000000
--- a/pl/math/v_log1pf_inline.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Helper for single-precision routines which calculate log(1 + x) and do not
- * need special-case handling
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_V_LOG1PF_INLINE_H
-#define PL_MATH_V_LOG1PF_INLINE_H
-
-#include "v_math.h"
-#include "poly_advsimd_f32.h"
-
-struct v_log1pf_data
-{
- float32x4_t poly[8], ln2;
- uint32x4_t four;
- int32x4_t three_quarters;
-};
-
-/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
- (1, -0.5) are not stored as they can be generated more efficiently. */
-#define V_LOG1PF_CONSTANTS_TABLE \
- { \
- .poly \
- = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
- .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
- .three_quarters = V4 (0x3f400000) \
- }
-
-static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *c)
-{
- /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
- uses split Estrin, but this way reduces register pressure in the calling
- routine). */
- float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
- float32x4_t m2 = vmulq_f32 (m, m);
- q = vfmaq_f32 (m, m2, q);
- float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
- p = vmulq_f32 (m2, p);
- return vfmaq_f32 (q, m2, p);
-}
-
-static inline float32x4_t
-log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
-{
- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
- special-case handling. See that file for details of the algorithm. */
- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
- int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
- v_s32 (0xff800000));
- uint32x4_t ku = vreinterpretq_u32_s32 (k);
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
- float32x4_t m_scale
- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
- float32x4_t p = eval_poly (m_scale, d.poly);
- float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
- return vfmaq_f32 (p, scale_back, d.ln2);
-}
-
-#endif // PL_MATH_V_LOG1PF_INLINE_H
diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c
deleted file mode 100644
index 2dd2c34b7c97..000000000000
--- a/pl/math/v_log2_3u.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Double-precision vector log2 function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-#include "poly_advsimd_f64.h"
-
-#define N (1 << V_LOG2_TABLE_BITS)
-
-static const struct data
-{
- uint64x2_t min_norm;
- uint32x4_t special_bound;
- float64x2_t poly[5];
- float64x2_t invln2;
- uint64x2_t sign_exp_mask;
-} data = {
- /* Each coefficient was generated to approximate log(1+r) for |r| < 0x1.fp-9
- and N = 128, then scaled by log2(e) in extended precision and rounded back
- to double precision. */
- .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2),
- V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2),
- V2 (-0x1.ec738d616fe26p-3) },
- .invln2 = V2 (0x1.71547652b82fep0),
- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
- .sign_exp_mask = V2 (0xfff0000000000000),
-};
-
-#define Off v_u64 (0x3fe6900900000000)
-#define IndexMask (N - 1)
-
-struct entry
-{
- float64x2_t invc;
- float64x2_t log2c;
-};
-
-static inline struct entry
-lookup (uint64x2_t i)
-{
- struct entry e;
- uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
- uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
- float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
- float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
- e.invc = vuzp1q_f64 (e0, e1);
- e.log2c = vuzp2q_f64 (e0, e1);
- return e;
-}
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2,
- uint32x2_t special)
-{
- return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special));
-}
-
-/* Double-precision vector log2 routine. Implements the same algorithm as
- vector log10, with coefficients and table entries scaled in extended
- precision. The maximum observed error is 2.58 ULP:
- _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
- want 0x1.fffb34198d9ddp-5. */
-float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
- vget_low_u32 (d->special_bound));
-
- /* x = 2^k z; where z is in range [Off,2*Off) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- uint64x2_t tmp = vsubq_u64 (ix, Off);
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
- float64x2_t z = vreinterpretq_f64_u64 (iz);
-
- struct entry e = lookup (tmp);
-
- /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
-
- float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
- float64x2_t kd = vcvtq_f64_s64 (k);
- float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2);
-
- float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
- w = vaddq_f64 (kd, w);
-
- if (unlikely (v_any_u32h (special)))
- return special_case (x, y, w, r2, special);
- return vfmaq_f64 (w, r2, y);
-}
-
-PL_SIG (V, D, 1, log2, 0.01, 11.1)
-PL_TEST_ULP (V_NAME_D1 (log2), 2.09)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log2))
-PL_TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000)
-PL_TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000)
diff --git a/pl/math/v_log2f_2u5.c b/pl/math/v_log2f_2u5.c
deleted file mode 100644
index c64d88742136..000000000000
--- a/pl/math/v_log2f_2u5.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Single-precision vector log2 function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "poly_advsimd_f32.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- uint32x4_t min_norm;
- uint16x8_t special_bound;
- uint32x4_t off, mantissa_mask;
- float32x4_t poly[9];
-} data = {
- /* Coefficients generated using the Remez algorithm to approximate
- log2(1+r)/r for r in [ -1/3, 1/3 ].
- rel error: 0x1.c4c4b0cp-26. */
- .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
- V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
- V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
- V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
- .off = V4 (0x3f2aaaab), /* 0.666667. */
- .mantissa_mask = V4 (0x007fffff),
-};
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
- uint16x4_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
-}
-
-/* Fast implementation for single precision AdvSIMD log2,
- relies on same argument reduction as AdvSIMD logf.
- Maximum error: 2.48 ULPs
- _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
- want 0x1.a9be8p-2. */
-float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x)
-{
- const struct data *d = ptr_barrier (&data);
- uint32x4_t u = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
-
- /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
- float32x4_t n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
- float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
-
- /* y = log2(1+r) + n. */
- float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
-
- if (unlikely (v_any_u16h (special)))
- return special_case (x, n, p, r, special);
- return vfmaq_f32 (n, p, r);
-}
-
-PL_SIG (V, F, 1, log2, 0.01, 11.1)
-PL_TEST_ULP (V_NAME_F1 (log2), 1.99)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log2))
-PL_TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000)
-PL_TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000)
diff --git a/pl/math/v_log_data.c b/pl/math/v_log_data.c
deleted file mode 100644
index a26e8a051d97..000000000000
--- a/pl/math/v_log_data.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Lookup table for double-precision log(x) vector function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-const struct v_log_data __v_log_data = {
- /* Worst-case error: 1.17 + 0.5 ulp.
- Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
- .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2,
- 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 },
- .ln2 = 0x1.62e42fefa39efp-1,
- /* Algorithm:
-
- x = 2^k z
- log(x) = k ln2 + log(c) + poly(z/c - 1)
-
- where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
- N=128) and log(c) and 1/c for the ith subinterval comes from two lookup
- tables:
-
- table[i].invc = 1/c
- table[i].logc = (double)log(c)
-
- where c is near the center of the subinterval and is chosen by trying
- several floating point invc candidates around 1/center and selecting one
- for which the error in (double)log(c) is minimized (< 0x1p-74), except that
- the subinterval containing 1 and the one before it were tweaked to avoid
- cancellation. */
- .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
- { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
- { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
- { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
- { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
- { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
- { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
- { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
- { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
- { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
- { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
- { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
- { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
- { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
- { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
- { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
- { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
- { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
- { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
- { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
- { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
- { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
- { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
- { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
- { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
- { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
- { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
- { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
- { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
- { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
- { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
- { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
- { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
- { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
- { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
- { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
- { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
- { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
- { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
- { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
- { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
- { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
- { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
- { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
- { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
- { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
- { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
- { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
- { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
- { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
- { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
- { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
- { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
- { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
- { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
- { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
- { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
- { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
- { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
- { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
- { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
- { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
- { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
- { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
- { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
- { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
- { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
- { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
- { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
- { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
- { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
- { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
- { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
- { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
- { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
- { 1.0, 0.0 },
- { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
- { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
- { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
- { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
- { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
- { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
- { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
- { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
- { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
- { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
- { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
- { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
- { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
- { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
- { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
- { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
- { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
- { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
- { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
- { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
- { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
- { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
- { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
- { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
- { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
- { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
- { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
- { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
- { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
- { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
- { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
- { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
- { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
- { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
- { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
- { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
- { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
- { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
- { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
- { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
- { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
- { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
- { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
- { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
- { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
- { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
- { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
- { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
- { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
- { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
- { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
- { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
-};
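
For reference, a scalar sketch of how these entries were consumed (the
algorithm is stated in the comment above; the helper assumes kd, invc and
logc come from the reduction and table lookup):

/* log(x) = k*ln2 + log(c) + log1p(r) with r = z/c - 1, |r| < 0x1.fp-9. */
static double
log_core_sketch (double z, double kd, double invc, double logc)
{
  const double Ln2 = 0x1.62e42fefa39efp-1;
  double r = z * invc - 1.0;
  double r2 = r * r;
  /* Using the poly above: log1p(r) - r ~= r2 * P(r), P of degree 4. */
  double p = -0x1.ffffffffffff7p-2 + r * 0x1.55555555170d4p-2
	     + r2 * (-0x1.0000000399c27p-2 + r * 0x1.999b2e90e94cap-3
		     + r2 * -0x1.554e550bd501ep-3);
  return kd * Ln2 + logc + r + r2 * p;
}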
diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c
deleted file mode 100644
index a644f54b4a0f..000000000000
--- a/pl/math/v_sinh_3u.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Double-precision vector sinh(x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- float64x2_t poly[11];
- float64x2_t inv_ln2, m_ln2, shift;
- uint64x2_t halff;
- int64x2_t onef;
-#if WANT_SIMD_EXCEPT
- uint64x2_t tiny_bound, thresh;
-#else
- uint64x2_t large_bound;
-#endif
-} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep0),
- .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
- .shift = V2 (0x1.8p52),
-
- .halff = V2 (0x3fe0000000000000),
- .onef = V2 (0x3ff0000000000000),
-#if WANT_SIMD_EXCEPT
- /* 2^-26, below which sinh(x) rounds to x. */
- .tiny_bound = V2 (0x3e50000000000000),
- /* asuint(large_bound) - asuint(tiny_bound). */
- .thresh = V2 (0x0230000000000000),
-#else
- /* 2^9. expm1 helper overflows for large input. */
- .large_bound = V2 (0x4080000000000000),
-#endif
-};
-
-static inline float64x2_t
-expm1_inline (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- /* Reduce argument:
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where i = round(x / ln2)
- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (j);
- float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
- f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
- /* Approximate expm1(f) using polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t f8 = vmulq_f64 (f4, f4);
- float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
- /* t = 2^i. */
- float64x2_t t = vreinterpretq_f64_u64 (
- vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
- /* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
-}
-
-static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x)
-{
- return v_call_f64 (sinh, x, x, v_u64 (-1));
-}
-
-/* Approximation for vector double-precision sinh(x) using expm1.
- sinh(x) = (exp(x) - exp(-x)) / 2.
- The greatest observed error is 2.57 ULP:
- _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
- want 0x1.ab34e59d678d9p-2. */
-float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- float64x2_t ax = vabsq_f64 (x);
- uint64x2_t sign
- = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
-
-#if WANT_SIMD_EXCEPT
- uint64x2_t special = vcgeq_u64 (
- vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
-#else
- uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
-#endif
-
- /* Fall back to scalar variant for all lanes if any of them are special. */
- if (unlikely (v_any_u64 (special)))
- return special_case (x);
-
- /* Up to the point that expm1 overflows, we can use it to calculate sinh
- using a slight rearrangement of the definition of sinh. This allows us to
- retain acceptable accuracy for very small inputs. */
- float64x2_t t = expm1_inline (ax);
- t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
- return vmulq_f64 (t, halfsign);
-}
-
-PL_SIG (V, D, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (sinh), 2.08)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
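The routine deleted above reduces sinh to a single expm1 call: with t = expm1(|x|), e^|x| = 1 + t and e^-|x| = 1/(1 + t), so sinh(|x|) = (t + t/(t + 1)) / 2, and the sign is reattached via halfsign. A minimal scalar sketch of that identity, using libm's expm1 rather than the removed SIMD helper (sinh_sketch is an illustrative name, and it omits the overflow handling the vector code delegates to special_case):

#include <math.h>

/* Scalar sketch of the identity: with t = expm1(|x|),
   e^|x| - e^-|x| = (t^2 + 2t)/(t + 1) = t + t/(t + 1),
   so sinh(x) = copysign(0.5, x) * (t + t/(t + 1)).  Stays accurate
   for tiny |x| because expm1(t) ~ t avoids cancellation.  */
static double
sinh_sketch (double x)
{
  double t = expm1 (fabs (x));
  return copysign (0.5, x) * (t + t / (t + 1.0));
}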
diff --git a/pl/math/v_tanh_3u.c b/pl/math/v_tanh_3u.c
deleted file mode 100644
index 5de85c68da2c..000000000000
--- a/pl/math/v_tanh_3u.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Double-precision vector tanh(x) function.
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
-#include "mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-static const struct data
-{
- float64x2_t poly[11];
- float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64x2_t onef;
- uint64x2_t thresh, tiny_bound;
-} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep0),
- .ln2_hi = V2 (-0x1.62e42fefa39efp-1),
- .ln2_lo = V2 (-0x1.abc9e3b39803fp-56),
- .shift = V2 (0x1.8p52),
-
- .onef = V2 (0x3ff0000000000000),
- .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
- /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
- .thresh = V2 (0x01f241bf835f9d5f),
-};
-
-static inline float64x2_t
-expm1_inline (float64x2_t x, const struct data *d)
-{
- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
- the scalar variant of tanh. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (j);
- float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi);
- f = vfmaq_f64 (f, j, d->ln2_lo);
-
- /* Approximate expm1(f) using polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t p = vfmaq_f64 (
- f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly));
-
- /* t = 2 ^ i. */
- float64x2_t t = vreinterpretq_f64_u64 (
- vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef));
- /* expm1(x) = p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t);
-}
-
-static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
- return v_call_f64 (tanh, x, y, special);
-}
-
-/* Vector approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.77 ULP:
- _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
- want -0x1.bd6a21a163624p-3. */
-float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
-
- float64x2_t u = x;
-
- /* Trigger special-cases for tiny, boring and infinity/NaN. */
- uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
-#if WANT_SIMD_EXCEPT
- /* To trigger fp exceptions correctly, set special lanes to a neutral value.
- They will be fixed up later by the special-case handler. */
- if (unlikely (v_any_u64 (special)))
- u = v_zerofy_f64 (u, special);
-#endif
-
- u = vaddq_f64 (u, u);
-
- /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- float64x2_t q = expm1_inline (u, d);
- float64x2_t qp2 = vaddq_f64 (q, v_f64 (2));
-
- if (unlikely (v_any_u64 (special)))
- return special_case (x, vdivq_f64 (q, qp2), special);
- return vdivq_f64 (q, qp2);
-}
-
-PL_SIG (V, D, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME_D1 (tanh), 2.27)
-PL_TEST_EXPECT_FENV (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
-PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
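As with sinh above, the removed tanh kernel leans on one identity: with q = expm1(2x), tanh(x) = (e^2x - 1)/(e^2x + 1) = q/(q + 2). A scalar sketch under the same caveats (tanh_sketch is illustrative; inputs beyond 0x1.241bf835f9d5fp+4 overflow the expm1 helper and take the scalar fallback in the vector code):

#include <math.h>

/* Scalar sketch of the identity: with q = expm1(2x),
   tanh(x) = (e^(2x) - 1) / (e^(2x) + 1) = q / (q + 2).  */
static double
tanh_sketch (double x)
{
  double q = expm1 (2.0 * x);
  return q / (q + 2.0);
}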
diff --git a/string/Dir.mk b/string/Dir.mk
index 40ff5acc093e..dd8283ec4977 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -13,9 +13,12 @@ all-string bench-string check-string install-string clean-string:
else
string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS])
+string-lib-srcs += $(wildcard $(S)/$(ARCH)/experimental/*.[cS])
string-test-srcs := $(wildcard $(S)/test/*.c)
string-bench-srcs := $(wildcard $(S)/bench/*.c)
+string-arch-include-dir := $(wildcard $(S)/$(ARCH))
+string-arch-includes := $(wildcard $(S)/$(ARCH)/*.h)
string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
string-libs := \
@@ -43,6 +46,7 @@ string-tests := \
string-benches := \
build/bin/bench/memcpy \
+ build/bin/bench/memset \
build/bin/bench/strlen
string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
@@ -64,8 +68,8 @@ string-files := \
all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
-$(string-objs): $(string-includes)
-$(string-objs): CFLAGS_ALL += $(string-cflags)
+$(string-objs): $(string-includes) $(string-arch-includes)
+$(string-objs): CFLAGS_ALL += $(string-cflags) -I$(string-arch-include-dir)
$(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE
@@ -101,6 +105,7 @@ check-string: $(string-tests-out)
bench-string: $(string-benches)
$(EMULATOR) build/bin/bench/strlen
$(EMULATOR) build/bin/bench/memcpy
+ $(EMULATOR) build/bin/bench/memset
install-string: \
$(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
index 207e22950c6d..34b5789240da 100644
--- a/string/aarch64/__mtag_tag_region.S
+++ b/string/aarch64/__mtag_tag_region.S
@@ -27,9 +27,6 @@
#define zva_val x4
ENTRY (__mtag_tag_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count
cmp count, 96
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
index 44b8e0114f42..2fa248e25621 100644
--- a/string/aarch64/__mtag_tag_zero_region.S
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -27,9 +27,6 @@
#define zva_val x4
ENTRY (__mtag_tag_zero_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count
cmp count, 96
diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h
index 131b95e1fea9..90166676977a 100644
--- a/string/aarch64/asmdefs.h
+++ b/string/aarch64/asmdefs.h
@@ -21,19 +21,6 @@
#define FEATURE_1_PAC 2
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
-#ifdef __ILP32__
-#define GNU_PROPERTY(type, value) \
- .section .note.gnu.property, "a"; \
- .p2align 2; \
- .word 4; \
- .word 12; \
- .word 5; \
- .asciz "GNU"; \
- .word type; \
- .word 4; \
- .word value; \
- .text
-#else
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 3; \
@@ -46,7 +33,6 @@
.word value; \
.word 0; \
.text
-#endif
/* If set then the GNU Property Note section will be added to
mark objects to support BTI and PAC-RET. */
@@ -80,27 +66,4 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
#define L(l) .L ## l
-#ifdef __ILP32__
- /* Sanitize padding bits of pointer arguments as per aapcs64 */
-#define PTR_ARG(n) mov w##n, w##n
-#else
-#define PTR_ARG(n)
-#endif
-
-#ifdef __ILP32__
- /* Sanitize padding bits of size arguments as per aapcs64 */
-#define SIZE_ARG(n) mov w##n, w##n
-#else
-#define SIZE_ARG(n)
-#endif
-
-/* Compiler supports SVE instructions */
-#ifndef HAVE_SVE
-# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
-# define HAVE_SVE 1
-# else
-# define HAVE_SVE 0
-# endif
-#endif
-
#endif
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/experimental/memchr-sve.S
index b851cf31f238..b314551f3e0f 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/experimental/memchr-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,8 +16,6 @@
*/
ENTRY (__memchr_aarch64_sve)
- PTR_ARG (0)
- SIZE_ARG (2)
dup z1.b, w1 /* duplicate c to a vector */
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
@@ -59,6 +58,3 @@ ENTRY (__memchr_aarch64_sve)
ret
END (__memchr_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/experimental/memcmp-sve.S
index d52ce4555344..ad3534836d04 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/experimental/memcmp-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,9 +16,6 @@
*/
ENTRY (__memcmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
mov x3, 0 /* initialize off */
0: whilelo p0.b, x3, x2 /* while off < max */
@@ -46,6 +44,3 @@ ENTRY (__memcmp_aarch64_sve)
ret
END (__memcmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/experimental/stpcpy-sve.S
index 5d3f14b86026..5d3f14b86026 100644
--- a/string/aarch64/stpcpy-sve.S
+++ b/string/aarch64/experimental/stpcpy-sve.S
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/experimental/strchr-sve.S
index ff075167bfef..7d74ae9ff232 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/experimental/strchr-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -22,7 +23,6 @@
#endif
ENTRY (FUNC)
- PTR_ARG (0)
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -65,6 +65,3 @@ ENTRY (FUNC)
b 0b
END (FUNC)
-
-#endif
-
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/experimental/strchrnul-sve.S
index 0005f9177514..0005f9177514 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/experimental/strchrnul-sve.S
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/experimental/strcmp-sve.S
index eaf909a378f1..b6c249588534 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/experimental/strcmp-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,8 +16,6 @@
*/
ENTRY (__strcmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
setffr /* initialize FFR */
ptrue p1.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
@@ -54,6 +53,3 @@ ENTRY (__strcmp_aarch64_sve)
b 1b
END (__strcmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/experimental/strcpy-sve.S
index 00e72dce4451..57b77c8a00e7 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/experimental/strcpy-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -22,8 +23,6 @@
#endif
ENTRY (FUNC)
- PTR_ARG (0)
- PTR_ARG (1)
setffr /* initialize FFR */
ptrue p2.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
@@ -66,6 +65,3 @@ ENTRY (FUNC)
ret
END (FUNC)
-
-#endif
-
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/experimental/strlen-sve.S
index 12ebbdba5c93..c83155052c07 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/experimental/strlen-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,7 +16,6 @@
*/
ENTRY (__strlen_aarch64_sve)
- PTR_ARG (0)
setffr /* initialize FFR */
ptrue p2.b /* all ones; loop invariant */
mov x1, 0 /* initialize length */
@@ -50,6 +50,3 @@ ENTRY (__strlen_aarch64_sve)
b 0b
END (__strlen_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/experimental/strncmp-sve.S
index 6a9e9f7b6437..a281e642d8aa 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/experimental/strncmp-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,9 +16,6 @@
*/
ENTRY (__strncmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
@@ -64,6 +62,3 @@ ENTRY (__strncmp_aarch64_sve)
ret
END (__strncmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/experimental/strnlen-sve.S
index 6c43dc427da7..11d835a1b13c 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/experimental/strnlen-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,8 +16,6 @@
*/
ENTRY (__strnlen_aarch64_sve)
- PTR_ARG (0)
- SIZE_ARG (1)
setffr /* initialize FFR */
mov x2, 0 /* initialize len */
b 1f
@@ -69,6 +68,3 @@ ENTRY (__strnlen_aarch64_sve)
ret
END (__strnlen_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/experimental/strrchr-sve.S
index 825a7384cfc1..731edaddf156 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/experimental/strrchr-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,7 +16,6 @@
*/
ENTRY (__strrchr_aarch64_sve)
- PTR_ARG (0)
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -79,6 +79,3 @@ ENTRY (__strrchr_aarch64_sve)
ret
END (__strrchr_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index 948c3cbc7dd4..68bd0af9a8c5 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -40,8 +40,6 @@
exactly which byte matched. */
ENTRY (__memchr_aarch64_mte)
- PTR_ARG (0)
- SIZE_ARG (2)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index fe6cfe2bc0e2..d12a38abbc30 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -47,8 +47,6 @@
*/
ENTRY (__memchr_aarch64)
- PTR_ARG (0)
- SIZE_ARG (2)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
/*
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 35135e72cc8e..43439de4db69 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -30,10 +30,6 @@
ENTRY (__memcmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp limit, 16
b.lo L(less16)
ldp data1, data3, [src1]
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index 9d3027d4d3cd..cbf4c581500e 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -52,9 +52,6 @@
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
add srcend, src, count
cmp count, 128
b.hi L(copy_long)
diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S
index b45c31418717..03ae95570c04 100644
--- a/string/aarch64/memcpy-mops.S
+++ b/string/aarch64/memcpy-mops.S
@@ -8,10 +8,6 @@
#include "asmdefs.h"
ENTRY (__memcpy_aarch64_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */
.inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
index e8a946d7db37..9b05cb2a58ee 100644
--- a/string/aarch64/memcpy-sve.S
+++ b/string/aarch64/memcpy-sve.S
@@ -13,8 +13,6 @@
#include "asmdefs.h"
-#ifdef HAVE_SVE
-
.arch armv8-a+sve
#define dstin x0
@@ -51,10 +49,6 @@
ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp count, 128
b.hi L(copy_long)
cntb vlen
@@ -173,5 +167,3 @@ L(return):
ret
END (__memcpy_aarch64_sve)
-
-#endif
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 7c0606e2104a..351f1a11f097 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -55,9 +55,6 @@
ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S
index 6c73017bb16f..d9839f86e9b4 100644
--- a/string/aarch64/memmove-mops.S
+++ b/string/aarch64/memmove-mops.S
@@ -8,10 +8,6 @@
#include "asmdefs.h"
ENTRY (__memmove_aarch64_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */
.inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index 6418bdf56f41..ed38478a6faa 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -42,7 +42,6 @@
exactly which byte matched. */
ENTRY (__memrchr_aarch64)
- PTR_ARG (0)
add end, srcin, cntin
sub endm1, end, 1
bic src, endm1, 15
diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S
index ec791493bae9..00d8e7d2c05f 100644
--- a/string/aarch64/memset-mops.S
+++ b/string/aarch64/memset-mops.S
@@ -8,9 +8,6 @@
#include "asmdefs.h"
ENTRY (__memset_aarch64_mops)
- PTR_ARG (0)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19c10443 /* setp [x3]!, x2!, x1 */
.inst 0x19c14443 /* setm [x3]!, x2!, x1 */
diff --git a/string/aarch64/memset-sve.S b/string/aarch64/memset-sve.S
new file mode 100644
index 000000000000..efaeaece284e
--- /dev/null
+++ b/string/aarch64/memset-sve.S
@@ -0,0 +1,114 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2024-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+.arch armv8-a+sve
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define vlen x5
+#define off x3
+#define dstend2 x5
+
+ENTRY (__memset_aarch64_sve)
+ dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_16)
+
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+
+ .p2align 4
+L(set_16):
+ whilelo p0.b, xzr, count
+ st1b z0.b, p0, [dstin]
+ ret
+
+ .p2align 4
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ cmp count, 256
+ b.lo L(no_zva)
+ tst valw, 255
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ str q0, [dstin]
+ str q0, [dst, 16]
+ bic dst, dstin, 31
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
+ bic x8, x8, 15
+ stp q0, q0, [x8, -48]
+ str q0, [x8, -16]
+ str q0, [dstend, -16]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
+ ret
+
+L(no_zva):
+ str q0, [dstin]
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 16]
+ stp q0, q0, [dst, 48]
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_aarch64_sve)
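The new SVE memset handles sub-16-byte calls with a single predicated store (whilelo/st1b), so no scalar tail loop is needed, and covers 16..63 bytes with a branchless pattern of overlapping 16-byte stores: off is 16 exactly when count >= 32 (bit 4 of count >> 1), giving stores at [dst], [dst+off], [dstend-off-16] and [dstend-16]. A C sketch of that 16..63 path, with memcpy of a pre-filled block standing in for the str q0 stores (set_16_63 is an illustrative name, not part of the routine):

#include <stdint.h>
#include <string.h>

/* Sketch of the branchless 16..63 byte path: two or four overlapping
   16-byte stores cover the whole range with no loop.  */
static void
set_16_63 (uint8_t *dst, uint8_t v, size_t count)
{
  uint8_t q[16];
  memset (q, v, sizeof q);            /* stands in for the q0 vector */
  uint8_t *dstend = dst + count;
  size_t off = 16 & (count >> 1);     /* 16 iff count >= 32 */
  memcpy (dst, q, 16);
  memcpy (dst + off, q, 16);
  memcpy (dstend - off - 16, q, 16);
  memcpy (dstend - 16, q, 16);
}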
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 553b0fcaefea..906a4dcf46c6 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,7 +1,7 @@
/*
* memset - fill memory with a constant byte
*
- * Copyright (c) 2012-2022, Arm Limited.
+ * Copyright (c) 2012-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -20,93 +20,98 @@
#define dst x3
#define dstend x4
#define zva_val x5
+#define off x3
+#define dstend2 x5
ENTRY (__memset_aarch64)
- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
- add dstend, dstin, count
-
- cmp count, 96
- b.hi L(set_long)
cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ b.lo L(set_small)
- /* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
ret
+
.p2align 4
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+ /* Set 0..15 bytes. */
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
.p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 160
- ccmp valw, 0, 0, hs
+ str q0, [dst, 16]
+ tst valw, 255
b.ne L(no_zva)
-
#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne L(no_zva)
#endif
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
- bic dst, dst, 63
+ bic dst, dstin, 63
sub count, dstend, dst /* Count is now 64 too large. */
- sub count, count, 128 /* Adjust count and bias for loop. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
.p2align 4
-L(zva_loop):
+L(zva64_loop):
add dst, dst, 64
dc zva, dst
subs count, count, 64
- b.hi L(zva_loop)
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
+ b.hi L(zva64_loop)
ret
+ .p2align 3
L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
L(no_zva_loop):
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
+ stp q0, q0, [dst, 64]
+ add dst, dst, 64
subs count, count, 64
b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
@@ -114,4 +119,3 @@ L(no_zva_loop):
ret
END (__memset_aarch64)
-
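The rewritten generic memset applies the same overlapping-store idea to the 0..15 byte case it previously handled with tbz bit tests: four 4-byte stores for 4..15 bytes (off is 4 exactly when count >= 8, matching the lsr/lsl-2 addressing above), and three byte stores at offsets 0, count/2 and count-1 for 1..3 bytes. A hedged C sketch (set_0_15 is an illustrative name):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the rewritten L(set_small) path: overlapping stores,
   no loops, only the two size tests.  */
static void
set_0_15 (uint8_t *dst, uint8_t v, size_t count)
{
  uint8_t *dstend = dst + count;
  if (count >= 4)
    {
      uint32_t v4;
      memset (&v4, v, sizeof v4);
      size_t off = (count >> 3) << 2;   /* 0 or 4, as off, lsl 2 above */
      memcpy (dst, &v4, 4);
      memcpy (dst + off, &v4, 4);
      memcpy (dstend - off - 4, &v4, 4);
      memcpy (dstend - 4, &v4, 4);
    }
  else if (count)
    {
      dst[0] = v;
      dst[count >> 1] = v;
      dstend[-1] = v;
    }
}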
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 6ec08f7acc76..42b747311bc6 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -39,7 +39,6 @@
If it is not a multiple of 4, there was no match. */
ENTRY (__strchr_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 37193bd947a7..c1d01e9635b6 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -51,7 +51,6 @@
/* Locals and temporaries. */
ENTRY (__strchr_aarch64)
- PTR_ARG (0)
/* Magic constant 0xc0300c03 to allow us to identify which lane
matches the requested byte. Even bits are set if the character
matches, odd bits if either the char is NUL or matches. */
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 543ee88bb285..b3180cdf9e2c 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -38,7 +38,6 @@
exactly which byte matched. */
ENTRY (__strchrnul_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 666e8d0304c1..0a32c46c30c5 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -47,7 +47,6 @@
/* Locals and temporaries. */
ENTRY (__strchrnul_aarch64)
- PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 137a9aa06681..7c0d0485a89b 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -51,8 +51,6 @@
ENTRY (__strcmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 97ae37ea4229..5852616e6024 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -52,8 +52,6 @@
exactly which byte matched. */
ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 77235797f7c5..afa72eed9a43 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -33,7 +33,6 @@
identifies the first zero byte. */
ENTRY (__strlen_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
@@ -41,37 +40,50 @@ ENTRY (__strlen_aarch64_mte)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
+L(next16):
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
.p2align 5
L(loop):
- ldr data, [src, 16]
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
+ add src, src, 16
L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
- add result, result, 16
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (__strlen_aarch64_mte)
-
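The reworked strlen entry path keeps the shrn syndrome trick: a zero-compare yields 0x00/0xFF bytes, shrn narrows the 128-bit mask to 64 bits (4 bits per byte), and a bit scan on the scalar syndrome gives the NUL index; the unrolled loop switches to addhn, which is cheaper when only zero/non-zero matters. A NEON-intrinsics sketch of the entry-path step, assuming little-endian as in the #ifndef __AARCH64EB__ branch (first_nul_16 is an illustrative name; the assembly uses rbit+clz where ctz is the equivalent here):

#include <arm_neon.h>
#include <stdint.h>

/* Find the first NUL in a 16-byte chunk, or return -1.  */
static int
first_nul_16 (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint16x8_t cmp = vreinterpretq_u16_u8 (vceqzq_u8 (data)); /* 0xff per NUL */
  uint8x8_t nibbles = vshrn_n_u16 (cmp, 4);                 /* 128 -> 64 bits */
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
  if (synd == 0)
    return -1;                         /* no NUL in these 16 bytes */
  return __builtin_ctzll (synd) >> 2;  /* 4 syndrome bits per byte */
}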
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 6f6f08f636b2..0ebb26be844c 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -75,7 +75,6 @@
character, return the length, if not, continue in the main loop. */
ENTRY (__strlen_aarch64)
- PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
cmp tmp1, MIN_PAGE_SIZE - 32
b.hi L(page_cross)
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 128a10c52bb1..493a0f06ed1d 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -55,9 +55,6 @@
#endif
ENTRY (__strncmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index f2090a7485a5..6a96ec268f1a 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -36,8 +36,6 @@
identifies the first zero byte. */
ENTRY (__strnlen_aarch64)
- PTR_ARG (0)
- SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index bb61ab9ad4e7..8668ce6d2916 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -42,7 +42,6 @@
if the relevant byte matched the NUL end of string. */
ENTRY (__strrchr_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
movi vrepmask.16b, 0x33
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index bf9cb297b6cb..f5713f4260fb 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -55,7 +55,6 @@
identify exactly which byte is causing the termination, and why. */
ENTRY (__strrchr_aarch64)
- PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index b628f9b60d96..583fa505db75 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -20,35 +20,18 @@
#define MIN_SIZE 32768
#define MAX_SIZE (1024 * 1024)
-static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
-static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
-
-#define F(x) {#x, x},
-
-static const struct fun
-{
- const char *name;
- void *(*fun)(void *, const void *, size_t);
-} funtab[] =
-{
-#if __aarch64__
- F(__memcpy_aarch64)
-# if __ARM_NEON
- F(__memcpy_aarch64_simd)
-# endif
-# if __ARM_FEATURE_SVE
- F(__memcpy_aarch64_sve)
-# endif
-# if WANT_MOPS
- F(__memcpy_aarch64_mops)
-# endif
-#elif __arm__
- F(__memcpy_arm)
-#endif
- F(memcpy)
-#undef F
- {0, 0}
-};
+static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096)));
+static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096)));
+
+#define DOTEST(STR,TESTFN) \
+ printf (STR); \
+ RUN (TESTFN, memcpy); \
+ RUNA64 (TESTFN, __memcpy_aarch64); \
+ RUNA64 (TESTFN, __memcpy_aarch64_simd); \
+ RUNSVE (TESTFN, __memcpy_aarch64_sve); \
+ RUNMOPS (TESTFN, __memcpy_aarch64_mops); \
+ RUNA32 (TESTFN, __memcpy_arm); \
+ printf ("\n");
typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@@ -160,183 +143,125 @@ init_copies (size_t max_size)
return total;
}
-int main (void)
+static void inline __attribute ((always_inline))
+memcpy_random (const char *name, void *(*fn)(void *, const void *, size_t))
{
- init_copy_distribution ();
-
- memset (a, 1, sizeof (a));
- memset (b, 2, sizeof (b));
-
- printf("Random memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t total = 0;
- uint64_t tsum = 0;
- printf ("%22s ", funtab[f].name);
- rand32 (0x12345678);
-
- for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
- {
- size_t copy_size = init_copies (size) * ITERS;
-
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
- test_arr[c].len);
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
- test_arr[c].len);
- t = clock_get_ns () - t;
- total += copy_size;
- tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
- }
- printf( "avg %.2f\n", (double)total / tsum);
- }
-
- size_t total = 0;
- uint64_t tsum = 0;
- printf ("%22s ", "memcpy_call");
- rand32 (0x12345678);
-
+ printf ("%22s ", name);
+ uint64_t total = 0, tsum = 0;
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
- size_t copy_size = init_copies (size) * ITERS;
+ uint64_t copy_size = init_copies (size) * ITERS;
for (int c = 0; c < NUM_TESTS; c++)
- memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+ fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
- memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+ fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+ printf ("%dK: %5.2f ", size / 1024, (double)copy_size / t);
}
- printf( "avg %.2f\n", (double)total / tsum);
-
+ printf( "avg %5.2f\n", (double)total / tsum);
+}
- printf ("\nAligned medium memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (b, a, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
+static void inline __attribute ((always_inline))
+memcpy_medium_aligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("%22s ", "memcpy_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
- memcpy (b, a, size);
+ fn (b, a, size);
t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
+}
+static void inline __attribute ((always_inline))
+memcpy_medium_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nUnaligned medium memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (b + 3, a + 1, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memcpy_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
- memcpy (b + 3, a + 1, size);
+ fn (b + 3, a + 1, size);
t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
+}
+static void inline __attribute ((always_inline))
+memcpy_large (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nLarge memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (b, a, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memcpy_call");
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
- memcpy (b, a, size);
+ fn (b, a, size);
t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
+}
+static void inline __attribute ((always_inline))
+memmove_forward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nUnaligned forwards memmove (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
+ for (int size = 1024; size <= 65536; size *= 2)
{
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a, a + 256 + (i & 31), size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ fn (a, a + 256 + (i & 31), size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
}
+ printf ("\n");
+}
+
+static void inline __attribute ((always_inline))
+memmove_backward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nUnaligned backwards memmove (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
+ for (int size = 1024; size <= 65536; size *= 2)
{
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a + 256 + (i & 31), a, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ fn (a + 256 + (i & 31), a, size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
}
+
printf ("\n");
+}
+
+int main (void)
+{
+ init_copy_distribution ();
+
+ memset (a, 1, sizeof (a));
+ memset (b, 2, sizeof (b));
+
+ DOTEST ("Random memcpy (bytes/ns):\n", memcpy_random);
+ DOTEST ("Medium memcpy aligned (bytes/ns):\n", memcpy_medium_aligned);
+ DOTEST ("Medium memcpy unaligned (bytes/ns):\n", memcpy_medium_unaligned);
+ DOTEST ("Large memcpy (bytes/ns):\n", memcpy_large);
+ DOTEST ("Forwards memmove unaligned (bytes/ns):\n", memmove_forward_unaligned);
+ DOTEST ("Backwards memmove unaligned (bytes/ns):\n", memmove_backward_unaligned);
return 0;
}
diff --git a/string/bench/memset.c b/string/bench/memset.c
index 990e23ba9a36..07474e469146 100644
--- a/string/bench/memset.c
+++ b/string/bench/memset.c
@@ -20,25 +20,16 @@
#define MIN_SIZE 32768
#define MAX_SIZE (1024 * 1024)
-static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(4096)));
-#define F(x) {#x, x},
-
-static const struct fun
-{
- const char *name;
- void *(*fun)(void *, int, size_t);
-} funtab[] =
-{
-#if __aarch64__
- F(__memset_aarch64)
-#elif __arm__
- F(__memset_arm)
-#endif
- F(memset)
-#undef F
- {0, 0}
-};
+#define DOTEST(STR,TESTFN) \
+ printf (STR); \
+ RUN (TESTFN, memset); \
+ RUNA64 (TESTFN, __memset_aarch64); \
+ RUNSVE (TESTFN, __memset_aarch64_sve); \
+ RUNMOPS (TESTFN, __memset_aarch64_mops); \
+ RUNA32 (TESTFN, __memset_arm); \
+ printf ("\n");
typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
static memset_test_t test_arr[NUM_TESTS];
@@ -127,117 +118,73 @@ init_memset (size_t max_size)
return total;
}
-
-int main (void)
+static void inline __attribute ((always_inline))
+memset_random (const char *name, void *(*set)(void *, int, size_t))
{
- init_memset_distribution ();
-
- memset (a, 1, sizeof (a));
-
- printf("Random memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t total_size = 0;
- uint64_t tsum = 0;
- printf ("%22s ", funtab[f].name);
- rand32 (0x12345678);
-
- for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
- {
- size_t memset_size = init_memset (size) * ITERS;
-
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
- t = clock_get_ns () - t;
- total_size += memset_size;
- tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
- }
- printf( "avg %.2f\n", (double)total_size / tsum);
- }
-
- size_t total_size = 0;
+ uint64_t total_size = 0;
uint64_t tsum = 0;
- printf ("%22s ", "memset_call");
+ printf ("%22s ", name);
rand32 (0x12345678);
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
- size_t memset_size = init_memset (size) * ITERS;
+ uint64_t memset_size = init_memset (size) * ITERS;
for (int c = 0; c < NUM_TESTS; c++)
- memset (a + test_arr[c].offset, 0, test_arr[c].len);
+ set (a + test_arr[c].offset, 0, test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
- memset (a + test_arr[c].offset, 0, test_arr[c].len);
+ set (a + test_arr[c].offset, 0, test_arr[c].len);
t = clock_get_ns () - t;
total_size += memset_size;
tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+ printf ("%dK: %5.2f ", size / 1024, (double)memset_size / t);
}
- printf( "avg %.2f\n", (double)total_size / tsum);
-
+ printf( "avg %5.2f\n", (double)total_size / tsum);
+}
- printf ("\nMedium memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a, 0, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
+static void inline __attribute ((always_inline))
+memset_medium (const char *name, void *(*set)(void *, int, size_t))
+{
+ printf ("%22s ", name);
- printf ("%22s ", "memset_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
- memset (a, 0, size);
+ set (a, 0, size);
t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
}
+ printf ("\n");
+}
+static void inline __attribute ((always_inline))
+memset_large (const char *name, void *(*set)(void *, int, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nLarge memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a, 0, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memset_call");
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
- memset (a, 0, size);
+ set (a, 0, size);
t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ printf ("%dKB: %6.2f ", size / 1024, (double)size * ITERS3 / t);
}
- printf ("\n\n");
+ printf ("\n");
+}
+
+int main (void)
+{
+ init_memset_distribution ();
+
+ memset (a, 1, sizeof (a));
+ DOTEST ("Random memset (bytes/ns):\n", memset_random);
+ DOTEST ("Medium memset (bytes/ns):\n", memset_medium);
+ DOTEST ("Large memset (bytes/ns):\n", memset_large);
return 0;
}
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
index f05d0d5b89e6..a8dd55cf5fc4 100644
--- a/string/bench/strlen.c
+++ b/string/bench/strlen.c
@@ -14,40 +14,23 @@
#include "benchlib.h"
#define ITERS 5000
-#define ITERS2 20000000
-#define ITERS3 2000000
-#define NUM_TESTS 16384
+#define ITERS2 40000000
+#define ITERS3 4000000
+#define NUM_TESTS 65536
#define MAX_ALIGN 32
-#define MAX_STRLEN 256
+#define MAX_STRLEN 128
static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
-#define F(x, mte) {#x, x, mte},
-
-static const struct fun
-{
- const char *name;
- size_t (*fun) (const char *s);
- int test_mte;
-} funtab[] = {
- // clang-format off
- F(strlen, 0)
-#if __aarch64__
- F(__strlen_aarch64, 0)
- F(__strlen_aarch64_mte, 1)
-# if __ARM_FEATURE_SVE
- F(__strlen_aarch64_sve, 1)
-# endif
-#elif __arm__
-# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
- F(__strlen_armv6t2, 0)
-# endif
-#endif
- {0, 0, 0}
- // clang-format on
-};
-#undef F
+#define DOTEST(STR,TESTFN) \
+ printf (STR); \
+ RUN (TESTFN, strlen); \
+ RUNA64 (TESTFN, __strlen_aarch64); \
+ RUNA64 (TESTFN, __strlen_aarch64_mte); \
+ RUNSVE (TESTFN, __strlen_aarch64_sve); \
+ RUNT32 (TESTFN, __strlen_armv6t2); \
+ printf ("\n");
static uint16_t strlen_tests[NUM_TESTS];
@@ -124,98 +107,119 @@ init_strlen_tests (void)
strlen_tests[n] =
index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+ assert ((strlen_tests[n] & (align - 1)) == 0);
+ assert (strlen (a + strlen_tests[n]) == exp_len);
}
}
static volatile size_t maskv = 0;
-int main (void)
+static void inline __attribute ((always_inline))
+strlen_random (const char *name, size_t (*fn)(const char *))
{
- rand32 (0x12345678);
- init_strlen_distribution ();
- init_strlen_tests ();
+ size_t res = 0, mask = maskv;
+ uint64_t strlen_size = 0;
+ printf ("%22s ", name);
+
+ for (int c = 0; c < NUM_TESTS; c++)
+ strlen_size += fn (a + strlen_tests[c]) + 1;
+ strlen_size *= ITERS;
+
+ /* Measure throughput of strlen. */
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ res += fn (a + strlen_tests[c]);
+ t = clock_get_ns () - t;
+ printf ("tp: %.3f ", (double)strlen_size / t);
+
+ /* Measure latency of strlen result with (res & mask). */
+ t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ res += fn (a + strlen_tests[c] + (res & mask));
+ t = clock_get_ns () - t;
+ printf ("lat: %.3f\n", (double)strlen_size / t);
+ maskv = res & mask;
+}
- printf ("\nRandom strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t res = 0, strlen_size = 0, mask = maskv;
- printf ("%22s ", funtab[f].name);
+static void inline __attribute ((always_inline))
+strlen_small_aligned (const char *name, size_t (*fn)(const char *))
+{
+ printf ("%22s ", name);
- for (int c = 0; c < NUM_TESTS; c++)
- strlen_size += funtab[f].fun (a + strlen_tests[c]);
- strlen_size *= ITERS;
+ size_t res = 0, mask = maskv;
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
- /* Measure latency of strlen result with (res & mask). */
uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+ for (int i = 0; i < ITERS2; i++)
+ res += fn (a + (i & mask));
t = clock_get_ns () - t;
- printf ("%.2f\n", (double)strlen_size / t);
+ printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
}
+ maskv &= res;
+ printf ("\n");
+}
- printf ("\nSmall aligned strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1; size <= 64; size *= 2)
- {
- memset (a, 'x', size);
- a[size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
+static void inline __attribute ((always_inline))
+strlen_small_unaligned (const char *name, size_t (*fn)(const char *))
+{
+ printf ("%22s ", name);
- printf ("\nSmall unaligned strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
+ size_t res = 0, mask = maskv;
+ int align = 9;
+ for (int size = 1; size <= 64; size *= 2)
{
- printf ("%22s ", funtab[f].name);
-
- int align = 9;
- for (int size = 1; size <= 64; size *= 2)
- {
- memset (a + align, 'x', size);
- a[align + size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a + align);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
- }
- printf ("\n");
+ memset (a + align, 'x', size);
+ a[align + size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ res += fn (a + align + (i & mask));
+ t = clock_get_ns () - t;
+ printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
}
+ maskv &= res;
+ printf ("\n");
+}
- printf ("\nMedium strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
+static void inline __attribute ((always_inline))
+strlen_medium (const char *name, size_t (*fn)(const char *))
+{
+ printf ("%22s ", name);
+
+ size_t res = 0, mask = maskv;
+ for (int size = 128; size <= 4096; size *= 2)
{
- printf ("%22s ", funtab[f].name);
-
- for (int size = 128; size <= 4096; size *= 2)
- {
- memset (a, 'x', size);
- a[size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ res += fn (a + (i & mask));
+ t = clock_get_ns () - t;
+ printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ maskv &= res;
printf ("\n");
+}
+
+int main (void)
+{
+ rand32 (0x12345678);
+ init_strlen_distribution ();
+ init_strlen_tests ();
+
+ DOTEST ("Random strlen (bytes/ns):\n", strlen_random);
+ DOTEST ("Small aligned strlen (bytes/ns):\n", strlen_small_aligned);
+ DOTEST ("Small unaligned strlen (bytes/ns):\n", strlen_small_unaligned);
+ DOTEST ("Medium strlen (bytes/ns):\n", strlen_medium);
return 0;
}
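The rewritten strlen benchmark measures throughput and latency from the same loop body; the latency variant threads the previous result into the next address via (res & mask), where mask is loaded from a volatile zero. The value is always 0 at run time, but the compiler cannot prove it, so each call depends on the last and the chain exposes latency instead of throughput. A standalone sketch of the trick:

#include <stdio.h>
#include <string.h>

/* Volatile zero: the compiler must assume any value.  */
static volatile size_t maskv = 0;

int
main (void)
{
  const char *s = "hello";
  size_t res = 0, mask = maskv;
  for (int i = 0; i < 1000; i++)
    res += strlen (s + (res & mask));   /* dependent chain, offset is 0 */
  printf ("%zu\n", res);
  return 0;
}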
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
index f1bbea388cd2..486504e99ddf 100644
--- a/string/include/benchlib.h
+++ b/string/include/benchlib.h
@@ -30,4 +30,35 @@ rand32 (uint32_t seed)
return res;
}
+/* Macros to run a benchmark BENCH using string function FN. */
+#define RUN(BENCH, FN) BENCH(#FN, FN)
+#if __aarch64__
+# define RUNA64(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNA64(BENCH, FN)
+#endif
+
+#if __ARM_FEATURE_SVE
+# define RUNSVE(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNSVE(BENCH, FN)
+#endif
+
+#if WANT_MOPS
+# define RUNMOPS(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNMOPS(BENCH, FN)
+#endif
+
+#if __arm__
+# define RUNA32(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNA32(BENCH, FN)
+#endif
+
+#if __arm__ && __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+# define RUNT32(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNT32(BENCH, FN)
+#endif
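These RUN* helpers are what let each benchmark's DOTEST list every implementation unconditionally: BENCH(#FN, FN) hands the bench body both the stringised name and the function pointer, and a variant whose feature test fails expands to nothing. A standalone demo of the pattern (show and RUNFAKE are illustrative, not part of benchlib.h):

#include <stdio.h>
#include <string.h>

#define RUN(BENCH, FN) BENCH (#FN, FN)
#define RUNFAKE(BENCH, FN)   /* a disabled variant expands to nothing */

static void
show (const char *name, size_t (*fn) (const char *))
{
  printf ("%22s -> %zu\n", name, fn ("example"));
}

int
main (void)
{
  RUN (show, strlen);       /* prints "strlen -> 7" */
  RUNFAKE (show, strlen);   /* compiled out, like RUNSVE without SVE */
  return 0;
}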
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 01da7ebfc18d..bb9db930f132 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -33,13 +33,12 @@ char *__strchr_aarch64_mte (const char *, int);
char * __strchrnul_aarch64_mte (const char *, int );
size_t __strlen_aarch64_mte (const char *);
char *__strrchr_aarch64_mte (const char *, int);
-#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
-#endif
# if __ARM_FEATURE_SVE
void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
+void *__memset_aarch64_sve (void *, int, size_t);
void *__memchr_aarch64_sve (const void *, int, size_t);
int __memcmp_aarch64_sve (const void *, const void *, size_t);
char *__strchr_aarch64_sve (const char *, int);
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index dc95844bd45a..98255e06f31c 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -25,9 +25,7 @@ static const struct fun
F(memcpy, 0)
#if __aarch64__
F(__memcpy_aarch64, 1)
-# if __ARM_NEON
F(__memcpy_aarch64_simd, 1)
-# endif
# if __ARM_FEATURE_SVE
F(__memcpy_aarch64_sve, 1)
# endif
diff --git a/string/test/memmove.c b/string/test/memmove.c
index b85dd1e864ef..ff3f7652f763 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -25,9 +25,7 @@ static const struct fun
F(memmove, 0)
#if __aarch64__
F(__memmove_aarch64, 1)
-# if __ARM_NEON
F(__memmove_aarch64_simd, 1)
-# endif
# if __ARM_FEATURE_SVE
F(__memmove_aarch64_sve, 1)
# endif
diff --git a/string/test/memset.c b/string/test/memset.c
index 7d09c267ffec..a9639f9b28b0 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -25,6 +25,9 @@ static const struct fun
F(memset, 0)
#if __aarch64__
F(__memset_aarch64, 1)
+# if __ARM_FEATURE_SVE
+ F(__memset_aarch64_sve, 1)
+# endif
# if WANT_MOPS
F(__memset_aarch64_mops, 1)
# endif