Diffstat (limited to 'math/aarch64')
250 files changed, 28238 insertions, 834 deletions
diff --git a/math/aarch64/advsimd/acos.c b/math/aarch64/advsimd/acos.c new file mode 100644 index 000000000000..7873a07e6f56 --- /dev/null +++ b/math/aarch64/advsimd/acos.c @@ -0,0 +1,122 @@ +/* + * Double-precision vector acos(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[12]; + float64x2_t pi, pi_over_2; + uint64x2_t abs_mask; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), + V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), + V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), + V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), + V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), + V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, + .pi = V2 (0x1.921fb54442d18p+1), + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .abs_mask = V2 (0x7fffffffffffffff), +}; + +#define AllMask v_u64 (0xffffffffffffffff) +#define Oneu 0x3ff0000000000000 +#define Small 0x3e50000000000000 /* 2^-53. */ + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (acos, x, y, special); +} +#endif + +/* Double-precision implementation of vector acos(x). + + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct + rounding. + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following + approximation. + + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.18 ulps, + _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 + want 0x1.0d54d1985c069p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.52 ulps, + _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 + want 0x1.edbbedf8a7d6cp-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + +#if WANT_SIMD_EXCEPT + /* A single comparison for One, Small and QNaN. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), + v_u64 (Oneu - Small)); + if (unlikely (v_any_u64 (special))) + return special_case (x, x, AllMask); +#endif + + uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x), + vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax)); + float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). 
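+   In scalar terms this finalize step and the reconstruction that follows
+   are (a sketch only, assuming a scalar helper P for the shared
+   polynomial; not part of this file):
+
+     double q = fma (z * z2, P (z2), z);              // Q(|x|)
+     return ax <= 0.5 ? pi / 2 - copysign (q, x)      // pi/2 - sign(x) * Q
+                      : (x < 0 ? pi - 2 * q : 2 * q);
+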
*/ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ + float64x2_t y = vbslq_f64 (d->abs_mask, p, x); + + uint64x2_t is_neg = vcltzq_f64 (x); + float64x2_t off = vreinterpretq_f64_u64 ( + vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi))); + float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0)); + float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off); + + return vfmaq_f64 (add, mul, y); +} + +TEST_SIG (V, D, 1, acos, -1.0, 1.0) +TEST_ULP (V_NAME_D1 (acos), 1.02) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000) +TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000) diff --git a/math/aarch64/advsimd/acosf.c b/math/aarch64/advsimd/acosf.c new file mode 100644 index 000000000000..e200f792c764 --- /dev/null +++ b/math/aarch64/advsimd/acosf.c @@ -0,0 +1,115 @@ +/* + * Single-precision vector acos(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t pi_over_2f, pif; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, + .pi_over_2f = V4 (0x1.921fb6p+0f), + .pif = V4 (0x1.921fb6p+1f), +}; + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Small 0x32800000 /* 2^-26. */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (acosf, x, y, special); +} +#endif + +/* Single-precision implementation of vector acos(x). + + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct + rounding. + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following + approximation. + + For |x| in [Small, 0.5], use order 4 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.26 ulps, + _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.32 ulps, + _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 + want 0x1.feb32ep-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + +#if WANT_SIMD_EXCEPT + /* A single comparison for One, Small and QNaN. 
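+   The subtraction wraps for ia < Small, so one unsigned compare flags
+   |x| < Small, |x| > 1 and NaN at once; a scalar sketch of the same test
+   (asuint is an assumed bit-cast helper):
+
+     uint32_t ia = asuint (fabsf (x));
+     bool is_special = (ia - Small) > (One - Small);
+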
*/ + uint32x4_t special + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (0xffffffff)); +#endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float32x4_t p = v_horner_4_f32 (z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ + float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x); + + uint32x4_t is_neg = vcltzq_f32 (x); + float32x4_t off = vreinterpretq_f32_u32 ( + vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg)); + float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0)); + float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off); + + return vfmaq_f32 (add, mul, y); +} + +HALF_WIDTH_ALIAS_F1 (acos) + +TEST_SIG (V, F, 1, acos, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (acos), 0.82) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000) +TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000) diff --git a/math/aarch64/advsimd/acosh.c b/math/aarch64/advsimd/acosh.c new file mode 100644 index 000000000000..55d8ed5a421e --- /dev/null +++ b/math/aarch64/advsimd/acosh.c @@ -0,0 +1,65 @@ +/* + * Double-precision vector acosh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 1 +#include "v_log1p_inline.h" + +const static struct data +{ + struct v_log1p_data log1p_consts; + uint64x2_t one, thresh; +} data = { + .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), + .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */ +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special, + const struct v_log1p_data *d) +{ + return v_call_f64 (acosh, x, log1p_inline (y, d), special); +} + +/* Vector approximation for double-precision acosh, based on log1p. + The largest observed error is 3.02 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: + _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5 + want 0x1.f2d6d823bc9e2p-5. 
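+   The identity in use, as a scalar sketch of the non-special path:
+
+     double xm1 = x - 1.0;
+     return log1p (xm1 + sqrt (xm1 * (x + 1.0)));
+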
*/ +VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t special + = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh); + float64x2_t special_arg = x; + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (special))) + x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); +#endif + + float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0)); + float64x2_t y = vaddq_f64 (x, v_f64 (1.0)); + y = vmulq_f64 (y, xm1); + y = vsqrtq_f64 (y); + y = vaddq_f64 (xm1, y); + + if (unlikely (v_any_u64 (special))) + return special_case (special_arg, y, special, &d->log1p_consts); + return log1p_inline (y, &d->log1p_consts); +} + +TEST_SIG (V, D, 1, acosh, 1.0, 10.0) +TEST_ULP (V_NAME_D1 (acosh), 2.53) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000) +TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000) +TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000) +TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000) diff --git a/math/aarch64/advsimd/acoshf.c b/math/aarch64/advsimd/acoshf.c new file mode 100644 index 000000000000..029d457cfa8a --- /dev/null +++ b/math/aarch64/advsimd/acoshf.c @@ -0,0 +1,78 @@ +/* + * Single-precision vector acosh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +#define SquareLim 0x1p64 + +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t one; +} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) }; + +#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */ + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint16x4_t special, + const struct v_log1pf_data *d) +{ + return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); +} + +/* Vector approximation for single-precision acosh, based on log1p. Maximum + error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it + is 3.00 ULP: + _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4 + want 0x1.ef0a7cp-4. + With exceptions disabled, we can compute u with a shorter dependency chain, + which gives maximum error of 3.22 ULP: + _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5 + want 0x1.fdcdd2p-5. */ + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh); + +#if WANT_SIMD_EXCEPT + /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use + only xm1 to calculate u, as operating on x will trigger invalid for NaN. + Widening sign-extend special predicate in order to mask with it. 
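+   (vmovl_s16 sign-extends each 16-bit lane, so an all-ones half-width
+   predicate lane widens to an all-ones 32-bit mask lane: 0xffff becomes
+   0xffffffff and 0x0000 stays 0x00000000.)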
*/ + uint32x4_t p + = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special))); + float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); + float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); +#else + float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one)); + float32x4_t u + = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one))); +#endif + + float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, special, &d->log1pf_consts); + return log1pf_inline (y, &d->log1pf_consts); +} + +HALF_WIDTH_ALIAS_F1 (acosh) + +TEST_SIG (V, F, 1, acosh, 1.0, 10.0) +#if WANT_SIMD_EXCEPT +TEST_ULP (V_NAME_F1 (acosh), 2.50) +#else +TEST_ULP (V_NAME_F1 (acosh), 2.78) +#endif +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500) +TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000) +TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000) +TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000) diff --git a/math/aarch64/advsimd/asin.c b/math/aarch64/advsimd/asin.c new file mode 100644 index 000000000000..c751d9264a12 --- /dev/null +++ b/math/aarch64/advsimd/asin.c @@ -0,0 +1,130 @@ +/* + * Double-precision vector asin(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10; + float64x2_t pi_over_2; + uint64x2_t abs_mask; + double c1, c3, c5, c7, c9, c11; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), +}; + +#define AllMask v_u64 (0xffffffffffffffff) +#define One 0x3ff0000000000000 +#define Small 0x3e50000000000000 /* 2^-26. */ + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (asin, x, y, special); +} +#endif + +/* Double-precision implementation of vector asin(x). + + For |x| < Small, approximate asin(x) by x. Small = 2^-26 for correct + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the + following approximation. + + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 1.01 ulps, + _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2 + want 0x1.ed78525a927eep-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, + _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. 
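+   A scalar sketch of the whole scheme (P standing for the shared
+   polynomial helper; illustrative only):
+
+     double z2 = ax < 0.5 ? x * x : (1 - ax) / 2;
+     double z  = ax < 0.5 ? ax : sqrt (z2);
+     double q  = fma (z * z2, P (z2), z);     // Q(|x|)
+     double y  = ax < 0.5 ? q : pi / 2 - 2 * q;
+     return copysign (y, x);
+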
*/ +float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t ax = vabsq_f64 (x); + +#if WANT_SIMD_EXCEPT + /* Special values need to be computed with scalar fallbacks so + that appropriate exceptions are raised. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), + v_u64 (One - Small)); + if (unlikely (v_any_u64 (special))) + return special_case (x, x, AllMask); +#endif + + uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x), + vfmsq_n_f64 (v_f64 (0.5), ax, 0.5)); + float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + + /* order-11 estrin. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + float64x2_t p = vfmaq_f64 (p07, z16, p811); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0)); + + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); +} + +TEST_SIG (V, D, 1, asin, -1.0, 1.0) +TEST_ULP (V_NAME_D1 (asin), 2.20) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000) +TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000) diff --git a/math/aarch64/advsimd/asinf.c b/math/aarch64/advsimd/asinf.c new file mode 100644 index 000000000000..970feb37e1d5 --- /dev/null +++ b/math/aarch64/advsimd/asinf.c @@ -0,0 +1,106 @@ +/* + * Single-precision vector asin(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, + .pi_over_2f = V4 (0x1.921fb6p+0f), +}; + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Small 0x39800000 /* 2^-12. 
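+   (= asuint (0x1p-12); below this the cubic correction x^3 * P(x^2) is
+   under half an ulp of x, so asinf(x) rounds to x.)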
*/ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (asinf, x, y, special); +} +#endif + +/* Single-precision implementation of vector asin(x). + + For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the + following approximation. + + For |x| in [Small, 0.5], use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, + _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.41 ulps, + _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + +#if WANT_SIMD_EXCEPT + /* Special values need to be computed with scalar fallbacks so + that appropriate fp exceptions are raised. */ + uint32x4_t special + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (0xffffffff)); +#endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float32x4_t p = v_horner_4_f32 (z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float32x4_t y + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); + + /* Copy sign. */ + return vbslq_f32 (v_u32 (AbsMask), y, x); +} + +HALF_WIDTH_ALIAS_F1 (asin) + +TEST_SIG (V, F, 1, asin, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (asin), 1.91) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000) +TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000) diff --git a/math/aarch64/advsimd/asinh.c b/math/aarch64/advsimd/asinh.c new file mode 100644 index 000000000000..550302826bd9 --- /dev/null +++ b/math/aarch64/advsimd/asinh.c @@ -0,0 +1,242 @@ +/* + * Double-precision vector asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "test_defs.h" +#include "test_sig.h" +#include "v_math.h" + +const static struct data +{ + uint64x2_t huge_bound, abs_mask, off, mask; +#if WANT_SIMD_EXCEPT + float64x2_t tiny_bound; +#endif + float64x2_t lc0, lc2; + double lc1, lc3, ln2, lc4; + + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17; + double c1, c3, c5, c7, c9, c11, c13, c15; + +} data = { + +#if WANT_SIMD_EXCEPT + .tiny_bound = V2 (0x1p-26), +#endif + /* Even terms of polynomial s.t. asinh(x) is approximated by + asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). + Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */ + + .c0 = V2 (-0x1.55555555554a7p-3), + .c1 = 0x1.3333333326c7p-4, + .c2 = V2 (-0x1.6db6db68332e6p-5), + .c3 = 0x1.f1c71b26fb40dp-6, + .c4 = V2 (-0x1.6e8b8b654a621p-6), + .c5 = 0x1.1c4daa9e67871p-6, + .c6 = V2 (-0x1.c9871d10885afp-7), + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c8 = V2 (-0x1.3ddca533e9f54p-7), + .c9 = 0x1.0becef748dafcp-7, + .c10 = V2 (-0x1.b90c7099dd397p-8), + .c11 = 0x1.541f2bb1ffe51p-8, + .c12 = V2 (-0x1.d217026a669ecp-9), + .c13 = 0x1.0b5c7977aaf7p-9, + .c14 = V2 (-0x1.e0f37daef9127p-11), + .c15 = 0x1.388b5fe542a6p-12, + .c16 = V2 (-0x1.021a48685e287p-14), + .c17 = V2 (0x1.93d4ba83d34dap-18), + + .lc0 = V2 (-0x1.ffffffffffff7p-2), + .lc1 = 0x1.55555555170d4p-2, + .lc2 = V2 (-0x1.0000000399c27p-2), + .lc3 = 0x1.999b2e90e94cap-3, + .lc4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + + .off = V2 (0x3fe6900900000000), + .huge_bound = V2 (0x5fe0000000000000), + .abs_mask = V2 (0x7fffffffffffffff), + .mask = V2 (0xfffULL << 52), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask, + uint64x2_t special) +{ + /* Copy sign. */ + y = vbslq_f64 (abs_mask, y, x); + return v_call_f64 (asinh, x, y, special); +} + +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static inline float64x2_t +log_inline (float64x2_t xm, const struct data *d) +{ + + uint64x2_t u = vreinterpretq_u64_f64 (xm); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2); + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
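+   Written out as scalar operations (a sketch; A0..A4 stand for lc0..lc4):
+
+     double p = A0 + r * A1;
+     double y = A2 + r * A3;
+     y = y + r2 * A4;
+     y = p + r2 * y;
+     return hi + r2 * y;
+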
*/ + float64x2_t odd_coeffs = vld1q_f64 (&d->lc1); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1); + y = vfmaq_f64 (p, r2, y); + return vfmaq_f64 (hi, y, r2); +} + +/* Double-precision implementation of vector asinh(x). + asinh is very sensitive around 1, so it is impractical to devise a single + low-cost algorithm which is sufficiently accurate on a wide range of input. + Instead we use two different algorithms: + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial + shared with the scalar routine. The greatest observed error is 2.79 ULP, in + |x| >= 1: + _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1 + want 0x1.ffffd003219ddp-1. */ +VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t ax = vabsq_f64 (x); + + uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); + +#if WANT_SIMD_EXCEPT + uint64x2_t iax = vreinterpretq_u64_f64 (ax); + uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound)); + uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); + special = vorrq_u64 (special, tiny); +#else + uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound)); +#endif + + /* Option 1: |x| >= 1. + Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). + If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will + overflow, by setting special lanes to 1. These will be fixed later. */ + float64x2_t option_1 = v_f64 (0); + if (likely (v_any_u64 (gt1))) + { +#if WANT_SIMD_EXCEPT + float64x2_t xm = v_zerofy_f64 (ax, special); +#else + float64x2_t xm = ax; +#endif + option_1 = log_inline ( + vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d); + } + + /* Option 2: |x| < 1. + Compute asinh(x) using a polynomial. + If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will + overflow, and tiny lanes, which will underflow, by setting them to 0. They + will be fixed later, either by selecting x or falling back to the scalar + special-case. The largest observed error in this region is 1.47 ULPs: + _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. */ + float64x2_t option_2 = v_f64 (0); + + if (likely (v_any_u64 (vceqzq_u64 (gt1)))) + { + +#if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); +#endif + float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2); + /* Order-17 Pairwise Horner scheme. 
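+   Adjacent coefficients are paired as p_ij = c_i + x2 * c_j, and Horner
+   then runs on the pairs in z2 = x2^2 (a sketch):
+
+     p = p01 + z2 * (p23 + z2 * (p45 + ... + z2 * (p1415 + z2 * p1617)));
+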
*/ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1); + float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17); + + float64x2_t p = vfmaq_f64 (p1415, z2, p1617); + p = vfmaq_f64 (p1213, z2, p); + p = vfmaq_f64 (p1011, z2, p); + p = vfmaq_f64 (p89, z2, p); + + p = vfmaq_f64 (p67, z2, p); + p = vfmaq_f64 (p45, z2, p); + + p = vfmaq_f64 (p23, z2, p); + + p = vfmaq_f64 (p01, z2, p); + option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2)); +#if WANT_SIMD_EXCEPT + option_2 = vbslq_f64 (tiny, x, option_2); +#endif + } + + /* Choose the right option for each lane. */ + float64x2_t y = vbslq_f64 (gt1, option_1, option_2); + if (unlikely (v_any_u64 (special))) + { + return special_case (x, y, d->abs_mask, special); + } + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); +} + +TEST_SIG (V, D, 1, asinh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (asinh), 2.29) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. */ +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5) +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2) +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600) diff --git a/math/aarch64/advsimd/asinhf.c b/math/aarch64/advsimd/asinhf.c new file mode 100644 index 000000000000..6a96f6ee9f4b --- /dev/null +++ b/math/aarch64/advsimd/asinhf.c @@ -0,0 +1,89 @@ +/* + * Single-precision vector asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +const static struct data +{ + struct v_log1pf_data log1pf_consts; + float32x4_t one; + uint32x4_t big_bound; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .one = V4 (1), + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */ +#endif +}; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t sign, float32x4_t y, + uint32x4_t special, const struct data *d) +{ + return v_call_f32 ( + asinhf, x, + vreinterpretq_f32_u32 (veorq_u32 ( + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))), + special); +} + +/* Single-precision implementation of vector asinh(x), using vector log1p. + Worst-case error is 2.59 ULP: + _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3 + want 0x1.d449c4p-3. 
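+   For x >= 0 the rewrite used below, which stays accurate for small x, is
+   asinh(x) = log1p (x + x^2 / (1 + sqrt (x^2 + 1))); a scalar sketch:
+
+     float d = 1.0f + sqrtf (fmaf (x, x, 1.0f));
+     return log1pf (x + x * x / d);
+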
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); + uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax); + float32x4_t special_arg = x; + +#if WANT_SIMD_EXCEPT + /* Sidestep tiny and large values to avoid inadvertently triggering + under/overflow. */ + special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound)); + if (unlikely (v_any_u32 (special))) + { + ax = v_zerofy_f32 (ax, special); + x = v_zerofy_f32 (x, special); + } +#endif + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ + float32x4_t d + = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax))); + float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)); + + if (unlikely (v_any_u32 (special))) + return special_case (special_arg, sign, y, special, dat); + return vreinterpretq_f32_u32 (veorq_u32 ( + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts)))); +} + +HALF_WIDTH_ALIAS_F1 (asinh) + +TEST_SIG (V, F, 1, asinh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (asinh), 2.10) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000) diff --git a/math/aarch64/advsimd/atan.c b/math/aarch64/advsimd/atan.c new file mode 100644 index 000000000000..26d264321068 --- /dev/null +++ b/math/aarch64/advsimd/atan.c @@ -0,0 +1,135 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), +}; + +#define SignMask v_u64 (0x8000000000000000) +#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */ +#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */ + +/* Fast implementation of vector atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. 
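+   In scalar terms, for a = |x| (a sketch, with P the polynomial below):
+
+     double z  = a > 1 ? 1 / a : a;
+     double az = a > 1 ? -z : z;
+     double y  = (a > 1 ? pi / 2 : 0) + fma (az * z * z, P (z * z), az);
+     return copysign (y, x);
+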
Maximum observed error is 2.27 ulps: + _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. */ + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t sign = vandq_u64 (ix, SignMask); + +#if WANT_SIMD_EXCEPT + uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000)); + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)), + v_u64 (BigBound - TinyBound)); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1)); +#endif + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); + /* Use absolute value only when needed (odd powers of z). */ + float64x2_t az = vbslq_f64 ( + SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); + + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t y = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); + y = vaddq_f64 (y, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); + return y; +} + +TEST_SIG (V, D, 1, atan, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (atan), 1.78) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000) +TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000) +TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000) +TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000) +TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000) +TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000) diff --git a/math/aarch64/advsimd/atan2.c b/math/aarch64/advsimd/atan2.c new file mode 100644 index 000000000000..18c4b70b92f6 --- /dev/null +++ b/math/aarch64/advsimd/atan2.c @@ -0,0 +1,171 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + uint64x2_t zeroinfnan, minustwo; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .c0 = V2 (-0x1.5555555555555p-2), + .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), + .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), + .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), + .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), + .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), + .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), + .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), + .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), + .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), + .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), + .minustwo = V2 (0xc000000000000000), +}; + +#define SignMask v_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, + uint64x2_t sign_xy, uint64x2_t cmp) +{ + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + return v_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline uint64x2_t +zeroinfnan (uint64x2_t i, const struct data *d) +{ + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); +} + +/* Fast implementation of vector atan2. + Maximum observed error is 2.8 ulps: + _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) + got 0x1.92d628ab678ccp-1 + want 0x1.92d628ab678cfp-1. 
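+   The quadrant handling in scalar form (a sketch; atan_reduced stands for
+   z + z^3 * P(z^2), and the last line abbreviates the sign-bit XOR):
+
+     double n = ay > ax ? -ax : ay, q = ay > ax ? ay : ax;
+     double shift = (x < 0 ? -2.0 : 0.0) + (ay > ax ? 1.0 : 0.0);
+     double r = shift * (pi / 2) + atan_reduced (n / q);
+     return sign (x) * sign (y) * r;
+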
*/ +float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t iy = vreinterpretq_u64_f64 (y); + + uint64x2_t special_cases + = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint64x2_t sign_x = vandq_u64 (ix, SignMask); + uint64x2_t sign_y = vandq_u64 (iy, SignMask); + uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t pred_xlt0 = vcltzq_f64 (x); + uint64x2_t pred_aygtax = vcagtq_f64 (y, x); + + /* Set up z for call to atan. */ + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (n, q); + + /* Work out the correct shift. */ + float64x2_t shift + = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); + shift = vmulq_f64 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t ret = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); + ret = vaddq_f64 (ret, shift); + + if (unlikely (v_any_u64 (special_cases))) + return special_case (y, x, ret, sign_xy, special_cases); + + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + + return ret; +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +TEST_SIG (V, D, 2, atan2) +// TODO tighten this once __v_atan2 is fixed +TEST_ULP (V_NAME_D2 (atan2), 2.9) +TEST_DISABLE_FENV (V_NAME_D2 (atan2)) +TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000) +TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000) diff --git a/math/aarch64/advsimd/atan2f.c b/math/aarch64/advsimd/atan2f.c new file mode 100644 index 000000000000..632014249ab0 --- /dev/null +++ b/math/aarch64/advsimd/atan2f.c @@ -0,0 +1,127 @@ +/* + * Single-precision vector atan2(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, pi_over_2, c4, c6, c2; + float c1, c3, c5, c7; + uint32x4_t comp_const; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ + .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, + .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, + .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, + .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, + .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), +}; + +#define SignMask v_u32 (0x80000000) + +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, + uint32x4_t sign_xy, uint32x4_t cmp) +{ + /* Account for the sign of y. */ + ret = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); + return v_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline uint32x4_t +zeroinfnan (uint32x4_t i, const struct data *d) +{ + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); +} + +/* Fast implementation of vector atan2f. Maximum observed error is + 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t iy = vreinterpretq_u32_f32 (y); + + uint32x4_t special_cases + = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint32x4_t sign_x = vandq_u32 (ix, SignMask); + uint32x4_t sign_y = vandq_u32 (iy, SignMask); + uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y); + + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t pred_xlt0 = vcltzq_f32 (x); + uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); + + /* Set up z for call to atanf. */ + float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); + float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (n, q); + + /* Work out the correct shift. */ + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); + shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); + shift = vmulq_f32 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. 
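+   That is, with pairs p_ij = c_i + z2 * c_j:
+
+     p = (p01 + z4 * p23) + z8 * (p45 + z4 * p67);
+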
However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + + float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); + + /* y = shift + z * P(z^2). */ + ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); + + if (unlikely (v_any_u32 (special_cases))) + { + return special_case (y, x, ret, sign_xy, special_cases); + } + + /* Account for the sign of y. */ + return vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); +} + +HALF_WIDTH_ALIAS_F2 (atan2) + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +TEST_SIG (V, F, 2, atan2) +TEST_DISABLE_FENV (V_NAME_F2 (atan2)) +TEST_ULP (V_NAME_F2 (atan2), 2.46) +TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000) +TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000) diff --git a/math/aarch64/advsimd/atanf.c b/math/aarch64/advsimd/atanf.c new file mode 100644 index 000000000000..61927c9b261a --- /dev/null +++ b/math/aarch64/advsimd/atanf.c @@ -0,0 +1,109 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f32.h" + +static const struct data +{ + float32x4_t poly[8]; + float32x4_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ + .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), + V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), + V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, + .pi_over_2 = V4 (0x1.921fb6p+0f), +}; + +#define SignMask v_u32 (0x80000000) + +#define P(i) d->poly[i] + +#define TinyBound 0x30800000 /* asuint(0x1p-30). */ +#define BigBound 0x4e800000 /* asuint(0x1p30). */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (atanf, x, y, special); +} +#endif + +/* Fast implementation of vector atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: + _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. 
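+   The range check is a single wrapping compare (scalar sketch, with
+   asuint as an assumed bit-cast helper):
+
+     uint32_t ia = asuint (x) & 0x7ff00000;
+     bool is_special = (ia - TinyBound) > (BigBound - TinyBound);
+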
*/ + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t sign = vandq_u32 (ix, SignMask); + +#if WANT_SIMD_EXCEPT + uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); + uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), + v_u32 (BigBound - TinyBound)); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (-1)); +#endif + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); + /* Use absolute value only when needed (odd powers of z). */ + float32x4_t az = vbslq_f32 ( + SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); + + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + + float32x4_t y = vfmaq_f32 ( + v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); + + /* y = shift + z * P(z^2). */ + y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); + + return y; +} + +HALF_WIDTH_ALIAS_F1 (atan) + +TEST_SIG (V, F, 1, atan, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (atan), 2.5) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000) diff --git a/math/aarch64/advsimd/atanh.c b/math/aarch64/advsimd/atanh.c new file mode 100644 index 000000000000..c2f9585dd29b --- /dev/null +++ b/math/aarch64/advsimd/atanh.c @@ -0,0 +1,75 @@ +/* + * Double-precision vector atanh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 0 +#include "v_log1p_inline.h" + +const static struct data +{ + struct v_log1p_data log1p_consts; + uint64x2_t one; + uint64x2_t sign_mask; +} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), + .sign_mask = V2 (0x8000000000000000) }; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y, + uint64x2_t special, const struct data *d) +{ + y = log1p_inline (y, &d->log1p_consts); + return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x), + vmulq_f64 (halfsign, y), special); +} + +/* Approximation for vector double-precision atanh(x) using modified log1p. + The greatest observed error is 3.31 ULP: + _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. 
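+   The rewrite used is atanh(x) = 0.5 * log1p (2|x| / (1 - |x|)), with the
+   sign of x folded into the 0.5 factor; a scalar sketch:
+
+     double ax = fabs (x);
+     return copysign (0.5, x) * log1p (2 * ax / (1 - ax));
+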
*/ +VPCS_ATTR +float64x2_t V_NAME_D1 (atanh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5)); + float64x2_t ax = vabsq_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (ax); + uint64x2_t special = vcgeq_u64 (ia, d->one); + +#if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, special); +#endif + + float64x2_t y; + y = vaddq_f64 (ax, ax); + y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax)); + + if (unlikely (v_any_u64 (special))) +#if WANT_SIMD_EXCEPT + return special_case (x, halfsign, y, special, d); +#else + return special_case (ax, halfsign, y, special, d); +#endif + + y = log1p_inline (y, &d->log1p_consts); + return vmulq_f64 (y, halfsign); +} + +TEST_SIG (V, D, 1, atanh, -1.0, 1.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT) +TEST_ULP (V_NAME_D1 (atanh), 3.32) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0) diff --git a/math/aarch64/advsimd/atanhf.c b/math/aarch64/advsimd/atanhf.c new file mode 100644 index 000000000000..313d15ca6391 --- /dev/null +++ b/math/aarch64/advsimd/atanhf.c @@ -0,0 +1,90 @@ +/* + * Single-precision vector atanh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t one; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .one = V4 (0x3f800000), +#if WANT_SIMD_EXCEPT + /* 0x1p-12, below which atanhf(x) rounds to x. */ + .tiny_bound = V4 (0x39800000), +#endif +}; + +#define AbsMask v_u32 (0x7fffffff) +#define Half v_u32 (0x3f000000) + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y, + uint32x4_t special) +{ + return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign), + vmulq_f32 (halfsign, y), special); +} + +/* Approximation for vector single-precision atanh(x) using modified log1p. + The maximum error is 2.93 ULP: + _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5 + want 0x1.f4dcf8p-5. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + +#if WANT_SIMD_EXCEPT + uint32x4_t special + = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound)); + /* Side-step special cases by setting those lanes to 0, which will trigger no + exceptions. These will be fixed up later. */ + if (unlikely (v_any_u32 (special))) + ax = v_zerofy_f32 (ax, special); +#else + uint32x4_t special = vcgeq_u32 (iax, d->one); +#endif + + float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), + vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax)); + y = log1pf_inline (y, &d->log1pf_consts); + + /* If exceptions not required, pass ax to special-case for shorter dependency + chain. 
If exceptions are required ax will have been zerofied, so have to + pass x. */ + if (unlikely (v_any_u32 (special))) +#if WANT_SIMD_EXCEPT + return special_case (x, halfsign, y, special); +#else + return special_case (ax, halfsign, y, special); +#endif + return vmulq_f32 (halfsign, y); +} + +HALF_WIDTH_ALIAS_F1 (atanh) + +TEST_SIG (V, F, 1, atanh, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (atanh), 2.44) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0) diff --git a/math/aarch64/advsimd/cbrt.c b/math/aarch64/advsimd/cbrt.c new file mode 100644 index 000000000000..8e72e5b566fc --- /dev/null +++ b/math/aarch64/advsimd/cbrt.c @@ -0,0 +1,127 @@ +/* + * Double-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f64.h" + +const static struct data +{ + float64x2_t poly[4], one_third, shift; + int64x2_t exp_bias; + uint64x2_t abs_mask, tiny_bound; + uint32x4_t thresh; + double table[5]; +} data = { + .shift = V2 (0x1.8p52), + .poly = { /* Generated with fpminimax in [0.5, 1]. */ + V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1), + V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) }, + .exp_bias = V2 (1022), + .abs_mask = V2(0x7fffffffffffffff), + .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */ + .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */ + .one_third = V2(0x1.5555555555555p-2), + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 } +}; + +#define MantissaMask v_u64 (0x000fffffffffffff) + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint32x2_t special) +{ + return v_call_f64 (cbrt, x, y, vmovl_u32 (special)); +} + +/* Approximation for double-precision vector cbrt(x), using low-order + polynomial and two Newton iterations. + + The vector version of frexp does not handle subnormals + correctly. As a result these need to be handled by the scalar + fallback, where accuracy may be worse than that of the vector code + path. + + Greatest observed error in the normal range is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + _ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + /* Subnormal, +/-0 and special values. */ + uint32x2_t special + = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. 
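+
+     A scalar sketch of the decomposition for normal inputs (asuint64 and
+     asdouble are hypothetical bit-cast helpers):
+
+       uint64_t ix = asuint64 (x);
+       double m = asdouble ((ix & 0x000fffffffffffff) | 0x3fe0000000000000);
+       int e = (int) (ix >> 52 & 0x7ff) - 1022;   /* |x| = m * 2^e.  */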
*/
+  float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5));
+  int64x2_t exp_bias = d->exp_bias;
+  uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
+  int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
+
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+     for Newton iterations.  */
+  float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
+  float64x2_t one_third = d->one_third;
+  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
+  float64x2_t m_by_3 = vmulq_f64 (m, one_third);
+  float64x2_t two_thirds = vaddq_f64 (one_third, one_third);
+  float64x2_t a
+      = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p);
+  a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a);
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+     is an integer in [-2, 2], and can be looked up in the table T. Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+
+  float64x2_t ef = vcvtq_f64_s64 (e);
+  float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third));
+  int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3)));
+  int64x2_t ey = vcvtq_s64_f64 (eb3f);
+
+  float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] };
+  my = vmulq_f64 (my, a);
+
+  /* Vector version of ldexp.  */
+  float64x2_t y = vreinterpretq_f64_s64 (
+      vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52));
+  y = vmulq_f64 (y, my);
+
+  if (unlikely (v_any_u32h (special)))
+    return special_case (x, vbslq_f64 (d->abs_mask, y, x), special);
+
+  /* Copy sign.  */
+  return vbslq_f64 (d->abs_mask, y, x);
+}
+
+/* Worst-case ULP error assumes that scalar fallback is GLIBC 2.40 cbrt, which
+   has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error
+   in the vector path is 1.79 ULP.
+   [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
+   Functions in Single, Double, Double Extended, and Quadruple Precision.  */
+TEST_ULP (V_NAME_D1 (cbrt), 3.17)
+TEST_SIG (V, D, 1, cbrt, -10.0, 10.0)
+TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
diff --git a/math/aarch64/advsimd/cbrtf.c b/math/aarch64/advsimd/cbrtf.c
new file mode 100644
index 000000000000..4e76feb2dd8b
--- /dev/null
+++ b/math/aarch64/advsimd/cbrtf.c
@@ -0,0 +1,117 @@
+/*
+ * Single-precision vector cbrt(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+
+const static struct data
+{
+  float32x4_t poly[4], one_third;
+  float table[5];
+} data = {
+  .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with
+               FPMinimax.  */
+            V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1),
+            V4 (0x1.2c74c2p-3) },
+  .table = { /* table[i] = 2^((i - 2) / 3).  */
+             0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+  .one_third = V4 (0x1.555556p-2f),
+};
+
+#define SignMask v_u32 (0x80000000)
+#define SmallestNormal v_u32 (0x00800000)
+#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal.
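+   Compared against the high halves produced by vsubhn_u32, hence a 16-bit
+   constant: 0x7f800000 - 0x00800000 = 0x7f000000, whose top half is 0x7f00.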
*/ +#define MantissaMask v_u32 (0x007fffff) +#define HalfExp v_u32 (0x3f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint16x4_t special) +{ + return v_call_f32 (cbrtf, x, y, vmovl_u16 (special)); +} + +static inline float32x4_t +shifted_lookup (const float *table, int32x4_t i) +{ + return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2], + table[i[3] + 2] }; +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); + + /* Subnormal, +/-0 and special values. */ + uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5)); + int32x4_t e + = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126)); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly); + + float32x4_t one_third = d->one_third; + float32x4_t two_thirds = vaddq_f32 (one_third, one_third); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + float32x4_t m_by_3 = vmulq_f32 (m, one_third); + float32x4_t a + = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third); + int32x4_t ey = vcvtq_s32_f32 (ef); + int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3))); + + float32x4_t my = shifted_lookup (d->table, em3); + my = vmulq_f32 (my, a); + + /* Vector version of ldexpf. */ + float32x4_t y + = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23)); + y = vmulq_f32 (y, my); + + if (unlikely (v_any_u16h (special))) + return special_case (x, vbslq_f32 (SignMask, x, y), special); + + /* Copy sign. */ + return vbslq_f32 (SignMask, x, y); +} + +HALF_WIDTH_ALIAS_F1 (cbrt) + +TEST_SIG (V, F, 1, cbrt, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (cbrt), 1.15) +TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000) diff --git a/math/aarch64/advsimd/cexpi.c b/math/aarch64/advsimd/cexpi.c new file mode 100644 index 000000000000..40ba5ff31f20 --- /dev/null +++ b/math/aarch64/advsimd/cexpi.c @@ -0,0 +1,47 @@ +/* + * Double-precision vector sincos function - return-by-value interface. 
+ * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincos_common.h" +#include "v_math.h" +#include "test_defs.h" + +static float64x2x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y) +{ + return (float64x2x2_t){ v_call_f64 (sin, x, y.val[0], special), + v_call_f64 (cos, x, y.val[1], special) }; +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +VPCS_ATTR float64x2x2_t +_ZGVnN2v_cexpi (float64x2_t x) +{ + const struct v_sincos_data *d = ptr_barrier (&v_sincos_data); + uint64x2_t special = check_ge_rangeval (x, d); + + float64x2x2_t sc = v_sincos_inline (x, d); + + if (unlikely (v_any_u64 (special))) + return special_case (x, special, sc); + return sc; +} + +TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos) +TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin) +TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73) +TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73) +#define V_CEXPI_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n) +V_CEXPI_INTERVAL (0, 0x1p23, 500000) +V_CEXPI_INTERVAL (-0, -0x1p23, 500000) +V_CEXPI_INTERVAL (0x1p23, inf, 10000) +V_CEXPI_INTERVAL (-0x1p23, -inf, 10000) diff --git a/math/aarch64/advsimd/cexpif.c b/math/aarch64/advsimd/cexpif.c new file mode 100644 index 000000000000..e55d99653a66 --- /dev/null +++ b/math/aarch64/advsimd/cexpif.c @@ -0,0 +1,49 @@ +/* + * Single-precision vector cexpi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincosf_common.h" +#include "v_math.h" +#include "test_defs.h" + +static float32x4x2_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y) +{ + return (float32x4x2_t){ v_call_f32 (sinf, x, y.val[0], special), + v_call_f32 (cosf, x, y.val[1], special) }; +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. 
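+
+   Usage sketch (hypothetical caller):
+
+     float32x4x2_t sc = _ZGVnN4v_cexpif (theta);
+     float32x4_t s = sc.val[0];  /* sin(theta) per lane.  */
+     float32x4_t c = sc.val[1];  /* cos(theta) per lane.  */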
*/ +VPCS_ATTR float32x4x2_t +_ZGVnN4v_cexpif (float32x4_t x) +{ + const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data); + uint32x4_t special = check_ge_rangeval (x, d); + + float32x4x2_t sc = v_sincosf_inline (x, d); + + if (unlikely (v_any_u32 (special))) + return special_case (x, special, sc); + return sc; +} + +TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin) +TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos) +TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17) +TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31) +#define V_CEXPIF_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n) +V_CEXPIF_INTERVAL (0, 0x1p20, 500000) +V_CEXPIF_INTERVAL (-0, -0x1p20, 500000) +V_CEXPIF_INTERVAL (0x1p20, inf, 10000) +V_CEXPIF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/math/aarch64/v_cos.c b/math/aarch64/advsimd/cos.c index 9a73575bce89..9f3de4dd5c36 100644 --- a/math/aarch64/v_cos.c +++ b/math/aarch64/advsimd/cos.c @@ -1,17 +1,19 @@ /* * Double-precision vector cos function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { float64x2_t poly[7]; - float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), @@ -19,11 +21,9 @@ static const struct data V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), V2 (-0x1.9e9540300a1p-41) }, .inv_pi = V2 (0x1.45f306dc9c883p-2), - .half_pi = V2 (0x1.921fb54442d18p+0), .pi_1 = V2 (0x1.921fb54442d18p+1), .pi_2 = V2 (0x1.1a62633145c06p-53), .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), .range_val = V2 (0x1p23) }; @@ -57,10 +57,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) #endif /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); - n = vsubq_f64 (n, v_f64 (0.5)); + n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); + n = vsubq_f64 (n, v_f64 (0.5f)); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = vfmsq_f64 (r, d->pi_1, n); @@ -85,3 +84,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) return special_case (x, y, odd, cmp); return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); } + +TEST_SIG (V, D, 1, cos, -3.1, 3.1) +TEST_ULP (V_NAME_D1 (cos), 3.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000) diff --git a/math/aarch64/v_cosf.c b/math/aarch64/advsimd/cosf.c index b9890b2998ad..d2844e44e196 100644 --- a/math/aarch64/v_cosf.c +++ b/math/aarch64/advsimd/cosf.c @@ -1,17 +1,19 @@ /* * Single-precision vector cos function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { float32x4_t poly[4]; - float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* 1.886 ulp error. */ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), @@ -22,8 +24,6 @@ static const struct data .pi_3 = V4 (-0x1.ee59dap-49f), .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), - .half_pi = V4 (0x1.921fb6p0f), .range_val = V4 (0x1p20f) }; @@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) return v_call_f32 (cosf, x, y, cmp); } -float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x) { const struct data *d = ptr_barrier (&data); float32x4_t n, r, r2, r3, y; @@ -58,9 +58,8 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) #endif /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); + n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); n = vsubq_f32 (n, v_f32 (0.5f)); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ @@ -80,3 +79,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) return special_case (x, y, odd, cmp); return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); } + +HALF_WIDTH_ALIAS_F1 (cos) + +TEST_SIG (V, F, 1, cos, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (cos), 1.4) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000) diff --git a/math/aarch64/advsimd/cosh.c b/math/aarch64/advsimd/cosh.c new file mode 100644 index 000000000000..54407b23aa9d --- /dev/null +++ b/math/aarch64/advsimd/cosh.c @@ -0,0 +1,107 @@ +/* + * Double-precision vector cosh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[3]; + float64x2_t inv_ln2; + double ln2[2]; + float64x2_t shift, thres; + uint64x2_t index_mask, special_bound; +} data = { + .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), + V2 (0x1.5555576a59599p-5), }, + + .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */ + /* -ln2/N. */ + .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64}, + .shift = V2 (0x1.8p+52), + .thres = V2 (704.0), + + .index_mask = V2 (0xff), + /* 0x1.6p9, above which exp overflows. */ + .special_bound = V2 (0x4086000000000000), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from v_exp_tail, with no + special-case handling or tail. */ +static inline float64x2_t +exp_inline (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* n = round(x/(ln2/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n*ln2/N. 
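+     ln2/N is stored negated in two limbs so both steps can use fused
+     multiply-adds; in scalar form:
+
+       r = fma (n, ln2_hi, x);  /* ln2_hi = -0x1.62e42fefa39efp-9.  */
+       r = fma (n, ln2_lo, r);  /* ln2_lo = -0x1.abc9e3b39803f3p-64.  */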
*/ + float64x2_t ln2 = vld1q_f64 (d->ln2); + float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0); + r = vfmaq_laneq_f64 (r, n, ln2, 1); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, d->index_mask); + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r); + y = vfmaq_f64 (d->poly[0], y, r); + y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r); + + /* s = 2^(n/N). */ + u = v_lookup_u64 (__v_exp_tail_data, i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + return vfmaq_f64 (s, y, s); +} + +/* Approximation for vector double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 + want 0x1.f711dcb0c77b1p+7. */ +float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + uint64x2_t special + = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound); + + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + float64x2_t t = exp_inline (ax); + float64x2_t half_t = vmulq_n_f64 (t, 0.5); + float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t); + + /* Fall back to scalar for any special cases. */ + if (unlikely (v_any_u64 (special))) + return special_case (x, vaddq_f64 (half_t, half_over_t), special); + + return vaddq_f64 (half_t, half_over_t); +} + +TEST_SIG (V, D, 1, cosh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (cosh), 1.43) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000) diff --git a/math/aarch64/advsimd/coshf.c b/math/aarch64/advsimd/coshf.c new file mode 100644 index 000000000000..f1ed3e5161fd --- /dev/null +++ b/math/aarch64/advsimd/coshf.c @@ -0,0 +1,92 @@ +/* + * Single-precision vector cosh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_expf_inline.h" +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + struct v_expf_data expf_consts; + uint32x4_t tiny_bound; + float32x4_t bound; +#if WANT_SIMD_EXCEPT + uint32x4_t special_bound; +#endif +} data = { + .expf_consts = V_EXPF_DATA, + .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .bound = V4 (0x1.5a92d8p+6), +#if WANT_SIMD_EXCEPT + .special_bound = V4 (0x42ad496c), +#endif +}; + +#if !WANT_SIMD_EXCEPT +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t, + uint32x4_t special) +{ + return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special); +} +#endif + +/* Single-precision vector cosh, using vector expf. + Maximum error is 2.38 ULP: + _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4 + want 0x1.6a4922p+4. 
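+
+   Scalar sketch of the same construction (hypothetical, ignoring special
+   cases):
+
+     float t = expf (fabsf (x));
+     return 0.5f * t + 0.5f / t;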
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all inputs if any input is a special value or above the bound + at which expf overflows. */ + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); + if (unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, x, v_u32 (-1)); + + uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound); + /* If any input is tiny, avoid underflow exception by fixing tiny lanes of + input to 0, which will generate no exceptions. */ + if (unlikely (v_any_u32 (tiny))) + ax = v_zerofy_f32 (ax, tiny); + float32x4_t t = v_expf_inline (ax, &d->expf_consts); +#else + uint32x4_t special = vcageq_f32 (x, d->bound); + float32x4_t t = v_expf_inline (x, &d->expf_consts); +#endif + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ + float32x4_t half_t = vmulq_n_f32 (t, 0.5); + float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (tiny))) + return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); +#else + if (unlikely (v_any_u32 (special))) + return special_case (x, half_t, half_over_t, special); +#endif + + return vaddq_f32 (half_t, half_over_t); +} + +HALF_WIDTH_ALIAS_F1 (cosh) + +TEST_SIG (V, F, 1, cosh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (cosh), 1.89) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) diff --git a/math/aarch64/advsimd/cospi.c b/math/aarch64/advsimd/cospi.c new file mode 100644 index 000000000000..e63201a55786 --- /dev/null +++ b/math/aarch64/advsimd/cospi.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector cospi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[10]; + float64x2_t range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, + .range_val = V2 (0x1p63), +}; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (arm_math_cospi, x, y, cmp); +} + +/* Approximation for vector double-precision cospi(x). + Maximum Error 3.06 ULP: + _ZGVnN2v_cospi(0x1.7dd4c0b03cc66p-5) got 0x1.fa854babfb6bep-1 + want 0x1.fa854babfb6c1p-1. 
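+
+   Scalar sketch of the reduction used below (hypothetical):
+
+     double r = x - round (x);       /* r in [-1/2, 1/2].  */
+     double s = 0.5 - fabs (r);      /* cospi(x) = +/- sinpi(s).  */
+     int odd = (int) round (x) & 1;  /* sign flips when round(x) is odd.  */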
*/
+float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  float64x2_t r = vabsq_f64 (x);
+  uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x);
+
+  /* When WANT_SIMD_EXCEPT = 1, special lanes should be zeroed
+     to avoid them overflowing and throwing exceptions.  */
+  r = v_zerofy_f64 (r, cmp);
+  uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63);
+
+#else
+  float64x2_t r = x;
+  uint64x2_t cmp = vcageq_f64 (r, d->range_val);
+  uint64x2_t odd
+      = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+#endif
+
+  r = vsubq_f64 (r, vrndaq_f64 (r));
+
+  /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2.  */
+  r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r));
+
+  /* y = sin(r).  */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t r4 = vmulq_f64 (r2, r2);
+  float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+  /* Fallback to scalar.  */
+  if (unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, odd, cmp);
+
+  /* Reintroduce the sign bit for inputs which round to odd.  */
+  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (cospi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
+#endif
diff --git a/math/aarch64/advsimd/cospif.c b/math/aarch64/advsimd/cospif.c
new file mode 100644
index 000000000000..62f4b8122b2c
--- /dev/null
+++ b/math/aarch64/advsimd/cospif.c
@@ -0,0 +1,86 @@
+/*
+ * Single-precision vector cospi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float32x4_t poly[6];
+  float32x4_t range_val;
+} data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+            V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+  .range_val = V4 (0x1p31f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (arm_math_cospif, x, y, cmp);
+}
+
+/* Approximation for vector single-precision cospi(x)
+   Maximum error: 3.17 ULP:
+   _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
+                                 want 0x1.f7cd5p-1.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  float32x4_t r = vabsq_f32 (x);
+  uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x);
+
+  /* When WANT_SIMD_EXCEPT = 1, special lanes should be zeroed
+     to avoid them overflowing and throwing exceptions.  */
+  r = v_zerofy_f32 (r, cmp);
+  uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31);
+
+#else
+  float32x4_t r = x;
+  uint32x4_t cmp = vcageq_f32 (r, d->range_val);
+
+  uint32x4_t odd
+      = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+#endif
+
+  /* r = x - rint(x).  */
+  r = vsubq_f32 (r, vrndaq_f32 (r));
+
+  /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2.
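+     (cosine is even and cos(t) = sin(pi/2 - t), so on this interval
+     cos(pi x) = sin(pi (1/2 - |x|)).)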
*/ + r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + + /* Fallback to scalar. */ + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + + /* Reintroduce the sign bit for inputs which round to odd. */ + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +HALF_WIDTH_ALIAS_F1 (cospi) + +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_F1 (cospi), 2.67) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000) +#endif diff --git a/math/aarch64/advsimd/erf.c b/math/aarch64/advsimd/erf.c new file mode 100644 index 000000000000..40717a660ce2 --- /dev/null +++ b/math/aarch64/advsimd/erf.c @@ -0,0 +1,166 @@ +/* + * Double-precision vector erf(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t third; + float64x2_t tenth, two_over_five, two_over_nine; + double two_over_fifteen, two_over_fortyfive; + float64x2_t max, shift; + uint64x2_t max_idx; +#if WANT_SIMD_EXCEPT + float64x2_t tiny_bound, huge_bound, scale_minus_one; +#endif +} data = { + .max_idx = V2 (768), + .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */ + .two_over_fifteen = 0x1.1111111111111p-3, + .tenth = V2 (-0x1.999999999999ap-4), + .two_over_five = V2 (-0x1.999999999999ap-2), + .two_over_nine = V2 (-0x1.c71c71c71c71cp-3), + .two_over_fortyfive = 0x1.6c16c16c16c17p-5, + .max = V2 (5.9921875), /* 6 - 1/128. */ + .shift = V2 (0x1p45), +#if WANT_SIMD_EXCEPT + .huge_bound = V2 (0x1p205), + .tiny_bound = V2 (0x1p-226), + .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */ +#endif +}; + +#define AbsMask 0x7fffffffffffffff + +struct entry +{ + float64x2_t erf; + float64x2_t scale; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf), + e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf); + e.erf = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +} + +/* Double-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [ + + 1 + - r d + + 1/3 (2 r^2 - 1) d^2 + - 1/6 (r (2 r^2 - 3)) d^3 + + 1/30 (4 r^4 - 12 r^2 + 3) d^4 + - 1/90 (4 r^4 - 20 r^2 + 15) d^5 + ] + + Maximum measure error: 2.29 ULP + V_NAME_D1 (erf)(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8 + want -0x1.20dd59132ebafp-8. */ +float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + + float64x2_t a = vabsq_f64 (x); + /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs + to return expected results. */ + uint64x2_t a_le_max = vcaleq_f64 (x, dat->max); + uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max); + +#if WANT_SIMD_EXCEPT + /* |x| huge or tiny. 
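+     For |x| below the tiny bound, erf(x) is dominated by its linear term
+     2/sqrt(pi) * x; the fixup at the end reconstructs this as
+     x + (2/sqrt(pi) - 1) * x so rounding and underflow match the scalar
+     routine.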
*/ + uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound); + uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound); + uint64x2_t cmp = vorrq_u64 (cmp1, cmp2); + /* If any lanes are special, mask them with 1 for small x or 8 for large + values and retain a copy of a to allow special case handler to fix special + lanes later. This is only necessary if fenv exceptions are to be triggered + correctly. */ + if (unlikely (v_any_u64 (cmp))) + { + a = vbslq_f64 (cmp1, v_f64 (8.0), a); + a = vbslq_f64 (cmp2, v_f64 (1.0), a); + } +#endif + + /* Set r to multiple of 1/128 nearest to |x|. */ + float64x2_t shift = dat->shift; + float64x2_t z = vaddq_f64 (a, shift); + + /* Lookup erf(r) and scale(r) in table, without shortcut for small values, + but with saturated indices for large values and NaNs in order to avoid + segfault. */ + uint64x2_t i + = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift)); + i = vbslq_u64 (a_le_max, i, dat->max_idx); + struct entry e = lookup (i); + + float64x2_t r = vsubq_f64 (z, shift); + + /* erf(x) ~ erf(r) + scale * d * poly (r, d). */ + float64x2_t d = vsubq_f64 (a, r); + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + + float64x2_t two_over_fifteen_and_fortyfive + = vld1q_f64 (&dat->two_over_fifteen); + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ + float64x2_t p1 = r; + float64x2_t p2 + = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third)); + float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2, + two_over_fifteen_and_fortyfive, 0); + p4 = vfmsq_f64 (dat->tenth, r2, p4); + float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2, + two_over_fifteen_and_fortyfive, 1); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5)); + + float64x2_t p34 = vfmaq_f64 (p3, d, p4); + float64x2_t p12 = vfmaq_f64 (p1, d, p2); + float64x2_t y = vfmaq_f64 (p34, d2, p5); + y = vfmaq_f64 (p12, d2, y); + + y = vfmaq_f64 (e.erf, e.scale, vfmsq_f64 (d, d2, y)); + + /* Solves the |x| = inf and NaN cases. */ + y = vbslq_f64 (a_gt_max, v_f64 (1.0), y); + + /* Copy sign. */ + y = vbslq_f64 (v_u64 (AbsMask), y, x); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp2))) + { + /* Neutralise huge values of x before fixing small values. */ + x = vbslq_f64 (cmp1, v_f64 (1.0), x); + /* Fix tiny values that trigger spurious underflow. */ + return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y); + } +#endif + return y; +} + +TEST_SIG (V, D, 1, erf, -6.0, 6.0) +TEST_ULP (V_NAME_D1 (erf), 1.79) +/* WANT_SIMD_EXCEPT blocks miss some cases. */ +TEST_DISABLE_FENV (V_NAME_D1 (erf)) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000) diff --git a/math/aarch64/advsimd/erfc.c b/math/aarch64/advsimd/erfc.c new file mode 100644 index 000000000000..97ef09ecc113 --- /dev/null +++ b/math/aarch64/advsimd/erfc.c @@ -0,0 +1,205 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64x2_t offset, table_scale; + float64x2_t max, shift; + float64x2_t p20, p40, p41, p51; + double p42, p52; + double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; +#if WANT_SIMD_EXCEPT + float64x2_t uflow_bound; +#endif +} data = { + /* Set an offset so the range of the index used for lookup is 3487, and it + can be clamped using a saturated add on an offset index. + Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */ + .offset = V2 (0xbd3ffffffffff260), + .table_scale = V2 (0x37f0000000000000 << 1), /* asuint64 (2^-128) << 1. */ + .max = V2 (0x1.b3ep+4), /* 3487/128. */ + .shift = V2 (0x1p45), + .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */ + .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */ + .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */ + .p42 = 0x1.1111111111111p-3, /* 2/15. */ + .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */ + .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */ + /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */ + .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 }, + .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 }, + .qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 }, + .qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 }, + .qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 }, +#if WANT_SIMD_EXCEPT + .uflow_bound = V2 (0x1.a8b12fc6e4892p+4), +#endif +}; + +#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */ +#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */ + +struct entry +{ + float64x2_t erfc; + float64x2_t scale; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + float64x2_t e1 + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); + float64x2_t e2 + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); + e.erfc = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +} + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + return v_call_f64 (erfc, x, y, cmp); +} +#endif + +/* Optimized double-precision vector erfc(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5 + + p6(r) d^6 + ... + p10(r) d^10 + + Polynomials p6(r) to p10(r) are computed using recurrence relation + + 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0, + with p0 = 1, and p1(r) = -r. + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum measured error: 1.71 ULP + V_NAME_D1 (erfc)(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608 + want 0x1.e15fcbea3e7adp-608. */ +VPCS_ATTR +float64x2_t V_NAME_D1 (erfc) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */ + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound)); + /* x >= ~26.54 (into subnormal case and uflow case). 
Comparison is done in + integer domain to avoid raising exceptions in presence of nans. */ + uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x), + vreinterpretq_s64_f64 (dat->uflow_bound)); + cmp = vorrq_u64 (cmp, uflow); + float64x2_t xm = x; + /* If any lanes are special, mask them with 0 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u64 (cmp))) + x = v_zerofy_f64 (x, cmp); +#endif + + float64x2_t a = vabsq_f64 (x); + a = vminq_f64 (a, dat->max); + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float64x2_t shift = dat->shift; + float64x2_t z = vaddq_f64 (a, shift); + + /* Clamp index to a range of 3487. A naive approach would use a subtract and + min. Instead we offset the table address and the index, then use a + saturating add. */ + uint64x2_t i = vqaddq_u64 (vreinterpretq_u64_f64 (z), dat->offset); + + struct entry e = lookup (i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + float64x2_t r = vsubq_f64 (z, shift); + float64x2_t d = vsubq_f64 (a, r); + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + + float64x2_t p1 = r; + float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20)); + float64x2_t p42_p52 = vld1q_f64 (&dat->p42); + float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0); + p4 = vfmsq_f64 (dat->p40, r2, p4); + float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); + /* Compute p_i using recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ + float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6), + qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8), + qr9 = vld1q_f64 (dat->qr9); + float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0)); + p6 = vmulq_laneq_f64 (p6, qr5, 1); + float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0)); + p7 = vmulq_laneq_f64 (p7, qr6, 1); + float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0)); + p8 = vmulq_laneq_f64 (p8, qr7, 1); + float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0)); + p9 = vmulq_laneq_f64 (p9, qr8, 1); + float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0)); + p10 = vmulq_laneq_f64 (p10, qr9, 1); + /* Compute polynomial in d using pairwise Horner scheme. */ + float64x2_t p90 = vfmaq_f64 (p9, d, p10); + float64x2_t p78 = vfmaq_f64 (p7, d, p8); + float64x2_t p56 = vfmaq_f64 (p5, d, p6); + float64x2_t p34 = vfmaq_f64 (p3, d, p4); + float64x2_t p12 = vfmaq_f64 (p1, d, p2); + float64x2_t y = vfmaq_f64 (p78, d2, p90); + y = vfmaq_f64 (p56, d2, y); + y = vfmaq_f64 (p34, d2, y); + y = vfmaq_f64 (p12, d2, y); + + y = vfmsq_f64 (e.erfc, e.scale, vfmsq_f64 (d, d2, y)); + + /* Offset equals 2.0 if sign, else 0.0. */ + uint64x2_t sign = vshrq_n_u64 (vreinterpretq_u64_f64 (x), 63); + float64x2_t off = vreinterpretq_f64_u64 (vshlq_n_u64 (sign, 62)); + /* Copy sign and scale back in a single fma. Since the bit patterns do not + overlap, then logical or and addition are equivalent here. 
*/ + float64x2_t fac = vreinterpretq_f64_u64 ( + vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp))) + return special_case (xm, vfmaq_f64 (off, fac, y), cmp); +#endif + + return vfmaq_f64 (off, fac, y); +} + +TEST_SIG (V, D, 1, erfc, -6.0, 28.0) +TEST_ULP (V_NAME_D1 (erfc), 1.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000) diff --git a/math/aarch64/advsimd/erfcf.c b/math/aarch64/advsimd/erfcf.c new file mode 100644 index 000000000000..f420439ef8a3 --- /dev/null +++ b/math/aarch64/advsimd/erfcf.c @@ -0,0 +1,174 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint32x4_t offset, table_scale; + float32x4_t max, shift; + float coeffs[4]; + float32x4_t third, two_over_five, tenth; +#if WANT_SIMD_EXCEPT + float32x4_t uflow_bound; +#endif + +} data = { + /* Set an offset so the range of the index used for lookup is 644, and it can + be clamped using a saturated add. */ + .offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */ + .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */ + .max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */ + .shift = V4 (0x1p17f), + /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and + fmas. */ + .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, + .third = V4 (0x1.555556p-2f), + .two_over_five = V4 (-0x1.99999ap-2f), + .tenth = V4 (-0x1.99999ap-4f), +#if WANT_SIMD_EXCEPT + .uflow_bound = V4 (0x1.2639cp+3f), +#endif +}; + +#define TinyBound 0x41000000 /* 0x1p-62f << 1. */ +#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */ +#define Off 0xfffffd7b /* 0xffffffff - 644. */ + +struct entry +{ + float32x4_t erfc; + float32x4_t scale; +}; + +static inline struct entry +lookup (uint32x4_t i) +{ + struct entry e; + float32x2_t t0 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); + float32x2_t t1 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); + float32x2_t t2 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); + float32x2_t t3 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); + e.erfc = vuzp1q_f32 (e1, e2); + e.scale = vuzp2q_f32 (e1, e2); + return e; +} + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + return v_call_f32 (erfcf, x, y, cmp); +} +#endif + +/* Optimized single-precision vector erfcf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/64. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). 
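+
+   In scalar terms the final assembly below is (hypothetical sketch, sign
+   being the sign bit of x):
+
+     float off = sign ? 2.0f : 0.0f;
+     float fac = sign ? -0x1p-47f : 0x1p-47f;  /* undo table scaling.  */
+     return off + fac * y;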
+ Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). + _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 + want 0x1.f51216p-120. */ +NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */ + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound)); + /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in + integer domain to avoid raising exceptions in presence of nans. */ + uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x), + vreinterpretq_s32_f32 (dat->uflow_bound)); + cmp = vorrq_u32 (cmp, uflow); + float32x4_t xm = x; + /* If any lanes are special, mask them with 0 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_zerofy_f32 (x, cmp); +#endif + + float32x4_t a = vabsq_f32 (x); + a = vminq_f32 (a, dat->max); + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float32x4_t shift = dat->shift; + float32x4_t z = vaddq_f32 (a, shift); + + /* Clamp index to a range of 644. A naive approach would use a subtract and + min. Instead we offset the table address and the index, then use a + saturating add. */ + uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset); + + struct entry e = lookup (i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + float32x4_t r = vsubq_f32 (z, shift); + float32x4_t d = vsubq_f32 (a, r); + float32x4_t d2 = vmulq_f32 (d, d); + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t p1 = r; + float32x4_t coeffs = vld1q_f32 (dat->coeffs); + float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1); + float32x4_t p3 + = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0)); + float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2); + p4 = vfmsq_f32 (dat->tenth, r2, p4); + + float32x4_t y = vfmaq_f32 (p3, d, p4); + y = vfmaq_f32 (p2, d, y); + y = vfmaq_f32 (p1, d, y); + y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y)); + + /* Offset equals 2.0f if sign, else 0.0f. */ + uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31); + float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30)); + /* Copy sign and scale back in a single fma. Since the bit patterns do not + overlap, then logical or and addition are equivalent here. */ + float32x4_t fac = vreinterpretq_f32_u32 ( + vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return special_case (xm, vfmaq_f32 (off, fac, y), cmp); +#endif + + return vfmaq_f32 (off, fac, y); +} + +HALF_WIDTH_ALIAS_F1 (erfc) + +TEST_SIG (V, F, 1, erfc, -4.0, 10.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT) +TEST_ULP (V_NAME_F1 (erfc), 1.14) +TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000) diff --git a/math/aarch64/advsimd/erff.c b/math/aarch64/advsimd/erff.c new file mode 100644 index 000000000000..508bc4c2f5e2 --- /dev/null +++ b/math/aarch64/advsimd/erff.c @@ -0,0 +1,120 @@ +/* + * Single-precision vector erf(x) function. 
+ * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t max, shift, third; +#if WANT_SIMD_EXCEPT + float32x4_t tiny_bound, scale_minus_one; +#endif +} data = { + .max = V4 (3.9375), /* 4 - 8/128. */ + .shift = V4 (0x1p16f), + .third = V4 (0x1.555556p-2f), /* 1/3. */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x1p-62f), + .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */ +#endif +}; + +#define AbsMask 0x7fffffff + +struct entry +{ + float32x4_t erf; + float32x4_t scale; +}; + +static inline struct entry +lookup (uint32x4_t i) +{ + struct entry e; + float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf); + float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf); + float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf); + float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); + e.erf = vuzp1q_f32 (e1, e2); + e.scale = vuzp2q_f32 (e1, e2); + return e; +} + +/* Single-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2] + + Values of erf(r) and scale are read from lookup tables. + For |x| > 3.9375, erf(|x|) rounds to 1.0f. + + Maximum error: 1.93 ULP + _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 + want 0x1.fd6868p-9. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-62. */ + uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + float32x4_t a = vabsq_f32 (x); + uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max); + + /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float32x4_t shift = dat->shift; + float32x4_t z = vaddq_f32 (a, shift); + + uint32x4_t i + = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift)); + i = vminq_u32 (i, v_u32 (512)); + struct entry e = lookup (i); + + float32x4_t r = vsubq_f32 (z, shift); + + /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */ + float32x4_t d = vsubq_f32 (a, r); + float32x4_t d2 = vmulq_f32 (d, d); + float32x4_t y = vfmaq_f32 (r, dat->third, d); + y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y)); + + /* Solves the |x| = inf case. */ + y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y); + + /* Copy sign. 
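+     vbslq_f32 (AbsMask, y, x) takes the magnitude bits from y and the sign
+     bit from x - a vector copysignf (y, x).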
*/ + y = vbslq_f32 (v_u32 (AbsMask), y, x); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y); +#endif + return y; +} + +HALF_WIDTH_ALIAS_F1 (erf) + +TEST_SIG (V, F, 1, erf, -4.0, 4.0) +TEST_ULP (V_NAME_F1 (erf), 1.43) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000) diff --git a/math/aarch64/v_exp.c b/math/aarch64/advsimd/exp.c index bc5609faf4fc..a928c35c9418 100644 --- a/math/aarch64/v_exp.c +++ b/math/aarch64/advsimd/exp.c @@ -1,12 +1,14 @@ /* * Double-precision vector e^x function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" #define N (1 << V_EXP_TABLE_BITS) #define IndexMask (N - 1) @@ -123,3 +125,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) return vfmaq_f64 (s, y, s); } + +TEST_SIG (V, D, 1, exp, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (exp), 1.9) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000) diff --git a/math/aarch64/advsimd/exp10.c b/math/aarch64/advsimd/exp10.c new file mode 100644 index 000000000000..24fdd1c7d257 --- /dev/null +++ b/math/aarch64/advsimd/exp10.c @@ -0,0 +1,147 @@ +/* + * Double-precision vector 10^x function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include "mathlib.h" +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */ +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 163840.0 /* 1280.0 * N. */ + +const static struct data +{ + float64x2_t poly[4]; + float64x2_t log10_2, log2_10_hi, log2_10_lo, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t special_bound, scale_thresh; +#endif +} data = { + /* Coefficients generated using Remez algorithm. + rel error: 0x1.5ddf8f28p-54 + abs error: 0x1.5ed266c8p-54 in [ -log10(2)/256, log10(2)/256 ] + maxerr: 1.14432 +0.5 ulp. */ + .poly = { V2 (0x1.26bb1bbb5524p1), V2 (0x1.53524c73cecdap1), + V2 (0x1.047060efb781cp1), V2 (0x1.2bd76040f0d16p0) }, + .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log2(10). */ + .log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log2(10)/N. */ + .log2_10_lo = V2 (-0x1.9dc1da994fd21p-66), + .shift = V2 (0x1.8p+52), +#if !WANT_SIMD_EXCEPT + .scale_thresh = V2 (ScaleBound), + .special_bound = V2 (SpecialBound), +#endif +}; + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask v_u64 (N - 1) + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ +# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */ +# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */ + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. 
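+     v_call_f64 re-evaluates scalar exp10 only on the lanes flagged in cmp,
+     keeping the fast-path result in the remaining lanes.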
*/
+  return v_call_f64 (exp10, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+              const struct data *d)
+{
+  /* 2^(n/N) may overflow, break it up into s1*s2. */
+  uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+  float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+  float64x2_t s2 = vreinterpretq_f64_u64 (
+      vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+  uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh);
+  float64x2_t r1 = vmulq_f64 (s1, s1);
+  float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+  return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp10.
+   Maximum measured error is 1.64 ulp.
+   _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5
+                                       want 0x1.f8dab6d7fed0ap+5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     special_case to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly. */
+  float64x2_t xm = x;
+  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres);
+  if (unlikely (v_any_u64 (cmp)))
+    x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+  cmp = vcageq_f64 (x, d->special_bound);
+#endif
+
+  /* n = round(x/(log10(2)/N)). */
+  float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
+  uint64x2_t u = vreinterpretq_u64_f64 (z);
+  float64x2_t n = vsubq_f64 (z, d->shift);
+
+  /* r = x - n*log10(2)/N. */
+  float64x2_t r = x;
+  r = vfmsq_f64 (r, d->log2_10_hi, n);
+  r = vfmsq_f64 (r, d->log2_10_lo, n);
+
+  uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+  uint64x2_t i = vandq_u64 (u, IndexMask);
+
+  /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t p = vfmaq_f64 (d->poly[0], r, d->poly[1]);
+  float64x2_t y = vfmaq_f64 (d->poly[2], r, d->poly[3]);
+  p = vfmaq_f64 (p, y, r2);
+  y = vmulq_f64 (r, p);
+
+  /* s = 2^(n/N). */
+  u = v_lookup_u64 (__v_exp_data, i);
+  float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+  if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+    return special_case (s, y, n, d);
+#endif
+
+  return vfmaq_f64 (s, y, s);
+}
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, D, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, D, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_D1 (exp10), 1.15)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
+#endif
diff --git a/math/aarch64/advsimd/exp10f.c b/math/aarch64/advsimd/exp10f.c
new file mode 100644
index 000000000000..eb0d5dd0d57c
--- /dev/null
+++ b/math/aarch64/advsimd/exp10f.c
@@ -0,0 +1,147 @@
+/*
+ * Single-precision vector 10^x function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+
+#define ScaleBound 192.0f
+
+static const struct data
+{
+  float32x4_t c0, c1, c3;
+  float log10_2_high, log10_2_low, c2, c4;
+  float32x4_t inv_log10_2, special_bound;
+  uint32x4_t exponent_bias, special_offset, special_bias;
+#if !WANT_SIMD_EXCEPT
+  float32x4_t scale_thresh;
+#endif
+} data = {
+  /* Coefficients generated using Remez algorithm with minimisation of relative
+     error.
+     rel error: 0x1.89dafa3p-24
+     abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+     maxerr: 1.85943 +0.5 ulp.  */
+  .c0 = V4 (0x1.26bb16p+1f),
+  .c1 = V4 (0x1.5350d2p+1f),
+  .c2 = 0x1.04744ap+1f,
+  .c3 = V4 (0x1.2d8176p+0f),
+  .c4 = 0x1.12b41ap-1f,
+  .inv_log10_2 = V4 (0x1.a934fp+1),
+  .log10_2_high = 0x1.344136p-2,
+  .log10_2_low = 0x1.ec10cp-27,
+  /* rint (log2 (2^127 / (1 + sqrt (2)))).  */
+  .special_bound = V4 (126.0f),
+  .exponent_bias = V4 (0x3f800000),
+  .special_offset = V4 (0x82000000),
+  .special_bias = V4 (0x7f000000),
+#if !WANT_SIMD_EXCEPT
+  .scale_thresh = V4 (ScaleBound)
+#endif
+};
+
+#if WANT_SIMD_EXCEPT
+
+# define SpecialBound 38.0f	      /* rint(log10(2^127)).  */
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63).  */
+# define BigBound v_u32 (0x42180000)  /* asuint (SpecialBound).  */
+# define Thres v_u32 (0x22180000)     /* BigBound - TinyBound.  */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes.  */
+  return v_call_f32 (exp10f, x, y, cmp);
+}
+
+#else
+
+# define SpecialBound 126.0f
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+	      float32x4_t scale, const struct data *d)
+{
+  /* 2^n may overflow, break it up into s1*s2.  */
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+  uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+  float32x4_t r2 = vmulq_f32 (s1, s1);
+  float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+  /* Similar to r1 but avoids double rounding in the subnormal range.  */
+  float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+  float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+  return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+/* Fast vector implementation of single-precision exp10.
+   Algorithm is accurate to 2.36 ULP.
+   _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
+				 want 0x1.7e79cp+11.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+#if WANT_SIMD_EXCEPT
+  /* asuint(x) - TinyBound >= BigBound - TinyBound.  */
+  uint32x4_t cmp = vcgeq_u32 (
+      vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
+  float32x4_t xm = x;
+  /* If any lanes are special, mask them with 0 and retain a copy of x to allow
+     the special-case handler to fix special lanes later.  This is only
+     necessary if fenv exceptions are to be triggered correctly.  */
+  if (unlikely (v_any_u32 (cmp)))
+    x = v_zerofy_f32 (x, cmp);
+#endif
+
+  /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
+     with 1 + poly(r) in [1/sqrt(2), sqrt(2)] and
+     x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2].  */
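
The first few statements of the function body implement r = x - n*log10(2) with the constant split so that log10_2_high - log10_2_low == log10(2); packing the two parts (plus c2 and c4) behind one vld1q_f32 lets each FMA select its scalar by lane. A standalone sketch of just that reduction, with the constants copied from the struct above (the function name is illustrative):

#include <arm_neon.h>

static float32x4_t
exp10f_reduce (float32x4_t x, float32x4_t *n_out)
{
  /* { hi, lo, -, - }: only lanes 0 and 1 are used here.  */
  const float log10_2[4] = { 0x1.344136p-2f, 0x1.ec10cp-27f, 0.0f, 0.0f };
  float32x4_t c = vld1q_f32 (log10_2);
  /* n = round(x / log10(2)); 0x1.a934fp+1f is 1/log10(2).  */
  float32x4_t n = vrndaq_f32 (vmulq_f32 (x, vdupq_n_f32 (0x1.a934fp+1f)));
  float32x4_t r = vfmsq_laneq_f32 (x, n, c, 0); /* x - n*hi.  */
  r = vfmaq_laneq_f32 (r, n, c, 1); /* + n*lo, since hi - lo == log10(2).  */
  *n_out = n;
  return r;
}
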
+  float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+  float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+  float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+  r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
+
+  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+  uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+  float32x4_t r2 = vmulq_f32 (r, r);
+  float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+  float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+  float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+  float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
+
+  if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+    return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+  return vfmaq_f32 (scale, poly, scale);
+}
+
+HALF_WIDTH_ALIAS_F1 (exp10)
+
+#if WANT_EXP10_TESTS
+TEST_SIG (S, F, 1, exp10, -9.9, 9.9)
+TEST_SIG (V, F, 1, exp10, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (exp10), 1.86)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
+#endif
diff --git a/math/aarch64/advsimd/exp2.c b/math/aarch64/advsimd/exp2.c
new file mode 100644
index 000000000000..63448d806b82
--- /dev/null
+++ b/math/aarch64/advsimd/exp2.c
@@ -0,0 +1,128 @@
+/*
+ * Double-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+#define BigBound 1022.0
+#define UOFlowBound 1280.0
+#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511).  */
+
+static const struct data
+{
+  float64x2_t poly[4];
+  float64x2_t shift, scale_big_bound, scale_uoflow_bound;
+} data = {
+  /* Coefficients are computed using Remez algorithm with
+     minimisation of the absolute error.  */
+  .poly = { V2 (0x1.62e42fefa3686p-1), V2 (0x1.ebfbdff82c241p-3),
+	    V2 (0x1.c6b09b16de99ap-5), V2 (0x1.3b2abf5571ad8p-7) },
+  .shift = V2 (0x1.8p52 / N),
+  .scale_big_bound = V2 (BigBound),
+  .scale_uoflow_bound = V2 (UOFlowBound),
+};
+
+static inline uint64x2_t
lookup_sbits (uint64x2_t i)
+{
+  return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
+		       __v_exp_data[i[1] & IndexMask] };
+}
+
+#if WANT_SIMD_EXCEPT
+
+# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound.  */
+
+/* Call scalar exp2 as a fallback.  */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
+{
+  return v_call_f64 (exp2, x, y, is_special);
+}
+
+#else
+
+# define SpecialOffset 0x6000000000000000 /* 0x1p513.  */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0).  */
+# define SpecialBias1 0x7000000000000000 /* 0x1p769.  */
+# define SpecialBias2 0x3010000000000000 /* 0x1p-254.  */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+	      const struct data *d)
+{
+  /* 2^(n/N) may overflow, break it up into s1*s2.
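
The bias constants make the split exact: subtracting b from SpecialBias1 and adding it back to the bits of s moves a huge power of two between the two factors without changing the product. The same arithmetic in scalar form (a sketch only; s_bits stands for the raw bits of the scale whose exponent field may have over- or underflowed):

#include <stdint.h>
#include <string.h>

static inline double
double_of_bits (uint64_t u)
{
  double x;
  memcpy (&x, &u, sizeof x);
  return x;
}

static double
split_scale (uint64_t s_bits, int n_is_negative)
{
  uint64_t b = n_is_negative ? 0x6000000000000000ull : 0; /* SpecialOffset.  */
  /* s1 is 2^769 when n >= 0, 2^-767 when n < 0.  */
  double s1 = double_of_bits (0x7000000000000000ull - b);
  double s2 = double_of_bits (s_bits - 0x3010000000000000ull + b);
  return s1 * s2; /* mathematically equal to the unsplit scale.  */
}
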
*/ + uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset)); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b)); + float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b)); + uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +/* Fast vector implementation of exp2. + Maximum measured error is 1.65 ulp. + _ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 + want 0x1.f8db0d4df721dp-1. */ +VPCS_ATTR +float64x2_t V_NAME_D1 (exp2) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t cmp; +#if WANT_SIMD_EXCEPT + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres)); + /* Mask special lanes and retain a copy of x for passing to special-case + handler. */ + float64x2_t xc = x; + x = v_zerofy_f64 (x, cmp); +#else + cmp = vcagtq_f64 (x, d->scale_big_bound); +#endif + + /* n = round(x/N). */ + float64x2_t z = vaddq_f64 (d->shift, x); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n/N. */ + float64x2_t r = vsubq_f64 (x, n); + + /* s = 2^(n/N). */ + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + u = lookup_sbits (u); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + /* y ~ exp2(r) - 1. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly); + y = vmulq_f64 (r, y); + + if (unlikely (v_any_u64 (cmp))) +#if !WANT_SIMD_EXCEPT + return special_case (s, y, n, d); +#else + return special_case (xc, vfmaq_f64 (s, s, y), cmp); +#endif + return vfmaq_f64 (s, s, y); +} + +TEST_SIG (V, D, 1, exp2, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (exp2), 1.15) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000) diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/advsimd/exp2f.c index e402205e98e6..40f6170d3702 100644 --- a/math/aarch64/v_exp2f.c +++ b/math/aarch64/advsimd/exp2f.c @@ -1,33 +1,38 @@ /* * Single-precision vector 2^x function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { - float32x4_t poly[5]; - uint32x4_t exponent_bias; + float32x4_t c1, c3; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; + float32x4_t scale_thresh, special_bound; #endif + float c0, c2, c4, zero; } data = { /* maxerr: 1.962 ulp. 
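
lookup_sbits above returns precomputed bit patterns for 2^(i/N); the remaining floor(k/N) is folded into the exponent field with a plain integer add. A transparent scalar restatement, using libm calls in place of the table (the real __v_exp_data entries are laid out so that the integer add lands only in the exponent field; N == 128 is an assumption here):

#include <math.h>
#include <stdint.h>

static double
pow2_from_parts (int64_t k)
{
  int64_t e = k >> 7;		   /* floor(k/N), N == 128 assumed.  */
  uint64_t i = (uint64_t) k & 127; /* k mod N, the table index.  */
  double frac = exp2 ((double) i / 128.0); /* what the table stores, roughly.  */
  return ldexp (frac, (int) e);	   /* 2^(k/N) = 2^e * 2^(i/N).  */
}
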
*/ - .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), - V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .c0 = 0x1.59977ap-10f, + .c1 = V4 (0x1.3ce9e4p-7f), + .c2 = 0x1.c6bd32p-5f, + .c3 = V4 (0x1.ebf9bcp-3f), + .c4 = 0x1.62e422p-1f, .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), #endif }; -#define C(i) d->poly[i] - #if WANT_SIMD_EXCEPT # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ @@ -44,16 +49,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); @@ -66,16 +68,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, #endif -float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly; - uint32x4_t cmp, e; #if WANT_SIMD_EXCEPT /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); float32x4_t xm = x; /* If any lanes are special, mask them with 1 and retain a copy of x to allow special_case to fix special lanes later. This is only necessary if fenv @@ -84,23 +84,24 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) x = vbslq_f32 (cmp, v_f32 (1), x); #endif - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ - n = vrndaq_f32 (x); - r = vsubq_f32 (x, n); - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = vsubq_f32 (x, n); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t c024 = vld1q_f32 (&d->c0); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_laneq_f32 (r, c024, 2); + float32x4_t poly = vfmaq_f32 (p, q, r2); if (unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT @@ -111,3 +112,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) return vfmaq_f32 (scale, poly, scale); } + +HALF_WIDTH_ALIAS_F1 (exp2) + +TEST_SIG (V, F, 1, exp2, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (exp2), 1.49) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000) diff --git a/math/aarch64/advsimd/exp2f_1u.c b/math/aarch64/advsimd/exp2f_1u.c new file mode 100644 index 000000000000..1f8e89ab658f --- /dev/null +++ b/math/aarch64/advsimd/exp2f_1u.c @@ -0,0 +1,73 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, c1, c2, c3, c4, c5, shift; + uint32x4_t exponent_bias; + float32x4_t special_bound, scale_thresh; + uint32x4_t special_offset, special_bias; +} data = { + .shift = V4 (0x1.8p23f), + .exponent_bias = V4 (0x3f800000), + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), + /* maxerr: 0.878 ulp. */ + .c0 = V4 (0x1.416b5ep-13f), + .c1 = V4 (0x1.5f082ep-10f), + .c2 = V4 (0x1.3b2dep-7f), + .c3 = V4 (0x1.c6af7cp-5f), + .c4 = V4 (0x1.ebfbdcp-3f), + .c5 = V4 (0x1.62e43p-1f), +}; + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r1 = vmulq_f32 (s1, s1); + float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2); + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) +{ + /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
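
The body that follows evaluates this polynomial as a straight Horner chain of FMAs; the tighter 0.878-ulp bound relative to the mainline exp2f comes from the extra term and the fully serial evaluation. The same chain in scalar form, with the coefficients copied from the struct above:

#include <math.h>

static float
exp2f_1u_poly (float r)
{
  float p = fmaf (0x1.416b5ep-13f, r, 0x1.5f082ep-10f); /* c0*r + c1.  */
  p = fmaf (p, r, 0x1.3b2dep-7f);  /* c2.  */
  p = fmaf (p, r, 0x1.c6af7cp-5f); /* c3.  */
  p = fmaf (p, r, 0x1.ebfbdcp-3f); /* c4.  */
  p = fmaf (p, r, 0x1.62e43p-1f);  /* c5.  */
  return fmaf (p, r, 1.0f);	   /* poly(r) ~= 2^r for r in [-1/2, 1/2].  */
}
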
*/ + const struct data *d = ptr_barrier (&data); + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = x - n; + uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; + float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + + float32x4_t p = vfmaq_f32 (d->c1, d->c0, r); + p = vfmaq_f32 (d->c2, p, r); + p = vfmaq_f32 (d->c3, p, r); + p = vfmaq_f32 (d->c4, p, r); + p = vfmaq_f32 (d->c5, p, r); + p = vfmaq_f32 (v_f32 (1.0f), p, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (p, n, e, d); + return scale * p; +} + +TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4) +TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u) +TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000) diff --git a/math/aarch64/v_expf.c b/math/aarch64/advsimd/expf.c index 34e8b6081bcd..e5b1f020d1a0 100644 --- a/math/aarch64/v_expf.c +++ b/math/aarch64/advsimd/expf.c @@ -1,30 +1,34 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ - -#include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { - float32x4_t poly[5]; - float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; - uint32x4_t exponent_bias; + float32x4_t c1, c3, c4, inv_ln2; + float ln2_hi, ln2_lo, c0, c2; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT float32x4_t special_bound, scale_thresh; #endif } data = { /* maxerr: 1.45358 +0.5 ulp. */ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, - .shift = V4 (0x1.8p23f), + .c0 = 0x1.0e4020p-7f, + .c1 = V4 (0x1.573e2ep-5f), + .c2 = 0x1.555e66p-3f, + .c3 = V4 (0x1.fffdb6p-2f), + .c4 = V4 (0x1.ffffecp-1f), .inv_ln2 = V4 (0x1.715476p+0f), - .ln2_hi = V4 (0x1.62e4p-1f), - .ln2_lo = V4 (0x1.7f7d1cp-20f), + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), @@ -49,19 +53,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); + // (s2 + p*s2)*s1 = s2(p+1)s1 float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ float32x4_t r0 = vfmaq_f32 (scale, poly, scale); @@ -71,15 +73,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, #endif -float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly, z; - uint32x4_t cmp, e; + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); #if WANT_SIMD_EXCEPT /* asuint(x) - TinyBound >= BigBound - TinyBound. */ - cmp = vcgeq_u32 ( + uint32x4_t cmp = vcgeq_u32 ( vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), TinyBound), SpecialBound); @@ -93,23 +94,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - z = vfmaq_f32 (d->shift, x, d->inv_ln2); - n = vsubq_f32 (z, d->shift); - r = vfmsq_f32 (x, n, d->ln2_hi); - r = vfmsq_f32 (r, n, d->ln2_lo); - e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); if (unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT @@ -120,3 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) return vfmaq_f32 (scale, poly, scale); } + +HALF_WIDTH_ALIAS_F1 (exp) + +TEST_SIG (V, F, 1, exp, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (exp), 1.49) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000) diff --git a/math/aarch64/advsimd/expf_1u.c b/math/aarch64/advsimd/expf_1u.c new file mode 100644 index 000000000000..4e114d810e08 --- /dev/null +++ b/math/aarch64/advsimd/expf_1u.c @@ -0,0 +1,79 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t shift, inv_ln2; + uint32x4_t exponent_bias; + float32x4_t c1, c2, c3, c4; + float32x4_t special_bound, scale_thresh; + uint32x4_t special_offset, special_bias; + float ln2_hi, ln2_lo, c0, nothing; +} data = { + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, + .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .exponent_bias = V4 (0x3f800000), + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), + .special_offset = V4 (0x83000000), + .special_bias = V4 (0x7f000000), + /* maxerr: 0.36565 +0.5 ulp. 
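
Note the reduction rewrite in the expf hunk above: the old shift-constant round trip is replaced by an explicit vrndaq_f32 plus an integer convert. Since n has no fractional part after rounding, the truncating vcvtq_s32_f32 is exact, and the pair reproduces what the shift trick produced, up to tie-breaking behaviour. A small sketch of the idiom:

#include <arm_neon.h>

static int32x4_t
round_to_int (float32x4_t x, float32x4_t inv_ln2, float32x4_t *n_out)
{
  float32x4_t n = vrndaq_f32 (vmulq_f32 (x, inv_ln2)); /* ties away from 0.  */
  *n_out = n;
  return vcvtq_s32_f32 (n); /* truncation is exact: n is integral.  */
}
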
+  .c0 = 0x1.6a6000p-10f,
+  .c1 = V4 (0x1.12718ep-7f),
+  .c2 = V4 (0x1.555af0p-5f),
+  .c3 = V4 (0x1.555430p-3f),
+  .c4 = V4 (0x1.fffff4p-2f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d)
+{
+  /* 2^n may overflow, break it up into s1*s2.  */
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+  uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh);
+  float32x4_t r1 = vmulq_f32 (s1, s1);
+  float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2);
+  return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+				| (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_expf_1u (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi);
+
+  /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+  float32x4_t z = vmulq_f32 (x, d->inv_ln2);
+  float32x4_t n = vrndaq_f32 (z);
+  float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0);
+  r = vfmsq_laneq_f32 (r, n, ln2_c0, 1);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23);
+  float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias);
+  uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+  float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2);
+  p = vfmaq_f32 (d->c2, p, r);
+  p = vfmaq_f32 (d->c3, p, r);
+  p = vfmaq_f32 (d->c4, p, r);
+  p = vfmaq_f32 (v_f32 (1.0f), p, r);
+  p = vfmaq_f32 (v_f32 (1.0f), p, r);
+  if (unlikely (v_any_u32 (cmp)))
+    return specialcase (p, n, e, d);
+  return scale * p;
+}
+
+TEST_ULP (_ZGVnN4v_expf_1u, 0.4)
+TEST_DISABLE_FENV (_ZGVnN4v_expf_1u)
+TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000)
+TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000)
diff --git a/math/aarch64/advsimd/expm1.c b/math/aarch64/advsimd/expm1.c
new file mode 100644
index 000000000000..7535a1830427
--- /dev/null
+++ b/math/aarch64/advsimd/expm1.c
@@ -0,0 +1,77 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1_inline.h"
+
+static const struct data
+{
+  struct v_expm1_data d;
+#if WANT_SIMD_EXCEPT
+  uint64x2_t thresh, tiny_bound;
+#else
+  float64x2_t oflow_bound;
+#endif
+} data = {
+  .d = V_EXPM1_DATA,
+#if WANT_SIMD_EXCEPT
+  /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
+     compare.  */
+  .thresh = V2 (0x78c56fa6d34b552),
+  /* asuint64(0x1p-51) << 1.  */
+  .tiny_bound = V2 (0x3cc0000000000000 << 1),
+#else
+  /* Value above which expm1(x) should overflow.  Absolute value of the
+     underflow bound is greater than this, so it catches both cases - there is
+     a small window where fallbacks are triggered unnecessarily.  */
+  .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
+#endif
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
+{
+  return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
+		     special);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+   The maximum observed error is 2.05 ULP:
+   _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
+					want 0x1.a8897eef87b32p-2.
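
The reconstruction behind these expm1 kernels is worth spelling out: with x = k*ln2 + r, t = 2^k and p ~= exp(r) - 1 from a polynomial, expm1(x) = t*(1 + p) - 1 = p*t + (t - 1), and grouping (t - 1) keeps the cancellation for k == 0 exact. The kernel itself lives in v_expm1_inline.h, which this diff does not show; an illustrative scalar analogue:

#include <math.h>

static double
expm1_reconstruct (double p, int k)
{
  double t = ldexp (1.0, k);  /* t = 2^k.  */
  return p * t + (t - 1.0);   /* expm1(x) ~= p*t + (t - 1).  */
}
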
*/ +float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint64x2_t ix = vreinterpretq_u64_f64 (x); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint64x2_t special + = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh); +#else + /* Large input, NaNs and Infs. */ + uint64x2_t special = vcageq_f64 (x, d->oflow_bound); +#endif + + if (unlikely (v_any_u64 (special))) + return special_case (x, special, d); + + /* expm1(x) ~= p * t + (t - 1). */ + return expm1_inline (x, &d->d); +} + +TEST_SIG (V, D, 1, expm1, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (expm1), 1.56) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100) diff --git a/math/aarch64/advsimd/expm1f.c b/math/aarch64/advsimd/expm1f.c new file mode 100644 index 000000000000..6d4431dcd8a5 --- /dev/null +++ b/math/aarch64/advsimd/expm1f.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1f_inline.h" + +static const struct data +{ + struct v_expm1f_data d; +#if WANT_SIMD_EXCEPT + uint32x4_t thresh; +#else + float32x4_t oflow_bound; +#endif +} data = { + .d = V_EXPM1F_DATA, +#if !WANT_SIMD_EXCEPT + /* Value above which expm1f(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V4 (0x1.5ebc4p+6), +#else + /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute + compare. */ + .thresh = V4 (0x1d5ebc40), +#endif +}; + +/* asuint(0x1p-23), shifted by 1 for abs compare. */ +#define TinyBound v_u32 (0x34000000 << 1) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, const struct data *d) +{ + return v_call_f32 ( + expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special); +} + +/* Single-precision vector exp(x) - 1 function. + The maximum error is 1.62 ULP: + _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2 + want 0x1.da9f44p-2. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint32x4_t ix = vreinterpretq_u32_f32 (x); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint32x4_t special + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); +#else + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); +#endif + + if (unlikely (v_any_u32 (special))) + return special_case (x, special, d); + + /* expm1(x) ~= p * t + (t - 1). 
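
Both expm1 variants use the same doubled-bits trick described above: adding ix to itself shifts the sign bit out, so a single unsigned compare against a pre-doubled threshold flags |x| < tiny, |x| > oflow_bound, and NaN at once. Scalar form, with the constants from the expm1f data (the function name is illustrative):

#include <stdint.h>
#include <string.h>

static int
expm1f_is_special (float x)
{
  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);
  /* 2*|bits| - 2*asuint(0x1p-23) >= 2*(asuint(oflow) - asuint(0x1p-23)),
     with unsigned wraparound catching the tiny inputs.  */
  return (uint32_t) (ix + ix - (0x34000000u << 1)) >= 0x1d5ebc40u;
}
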
+  return expm1f_inline (x, &d->d);
+}
+
+HALF_WIDTH_ALIAS_F1 (expm1)
+
+TEST_SIG (V, F, 1, expm1, -9.9, 9.9)
+TEST_ULP (V_NAME_F1 (expm1), 1.13)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
+TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
+TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
diff --git a/math/aarch64/advsimd/finite_pow.h b/math/aarch64/advsimd/finite_pow.h
new file mode 100644
index 000000000000..0c8350a1a77b
--- /dev/null
+++ b/math/aarch64/advsimd/finite_pow.h
@@ -0,0 +1,361 @@
+/*
+ * Double-precision x^y function.
+ *
+ * Copyright (c) 2018-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Scalar version of pow used for fallbacks in vector implementations.  */
+
+/* Data is defined in v_pow_log_data.c.  */
+#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
+#define Off 0x3fe6955500000000
+#define As __v_pow_log_data.poly
+
+/* Data is defined in v_pow_exp_data.c.  */
+#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
+#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
+#define SmallExp 0x3c9 /* top12(0x1p-54).  */
+#define BigExp 0x408   /* top12(512.0).  */
+#define ThresExp 0x03f /* BigExp - SmallExp.  */
+#define InvLn2N __v_pow_exp_data.n_over_ln2
+#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi
+#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo
+#define SBits __v_pow_exp_data.sbits
+#define Cs __v_pow_exp_data.poly
+
+/* Constants associated with pow.  */
+#define SmallPowX 0x001 /* top12(0x1p-126).  */
+#define BigPowX 0x7ff	/* top12(INFINITY).  */
+#define ThresPowX 0x7fe /* BigPowX - SmallPowX.  */
+#define SmallPowY 0x3be /* top12(0x1.e7b6p-65).  */
+#define BigPowY 0x43e	/* top12(0x1.749p62).  */
+#define ThresPowY 0x080 /* BigPowY - SmallPowY.  */
+
+/* Top 12 bits of a double (sign and exponent bits).  */
+static inline uint32_t
+top12 (double x)
+{
+  return asuint64 (x) >> 52;
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+   15 additional bits of precision.  IX is the bit representation of x, but
+   normalized in the subnormal range using the sign bit for the exponent.  */
+static inline double
+log_inline (uint64_t ix, double *tail)
+{
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  uint64_t tmp = ix - Off;
+  int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1);
+  int k = (int64_t) tmp >> 52; /* arithmetic shift.  */
+  uint64_t iz = ix - (tmp & 0xfffULL << 52);
+  double z = asdouble (iz);
+  double kd = (double) k;
+
+  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
+  double invc = __v_pow_log_data.invc[i];
+  double logc = __v_pow_log_data.logc[i];
+  double logctail = __v_pow_log_data.logctail[i];
+
+  /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable.  */
+  double r = fma (z, invc, -1.0);
+
+  /* k*Ln2 + log(c) + r.  */
+  double t1 = kd * __v_pow_log_data.ln2_hi + logc;
+  double t2 = t1 + r;
+  double lo1 = kd * __v_pow_log_data.ln2_lo + logctail;
+  double lo2 = t1 - t2 + r;
+
+  /* Evaluation is optimized assuming superscalar pipelined execution.  */
+  double ar = As[0] * r;
+  double ar2 = r * ar;
+  double ar3 = r * ar2;
+  /* k*Ln2 + log(c) + r + A[0]*r*r.
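
The lo terms computed just below are standard compensation idioms: fma (ar, r, -ar2) recovers the rounding error of the product ar*r, and (t2 - hi) + ar2 is the Fast2Sum correction for hi = t2 + ar2. Scalar sketches of both (the sum form assumes |a| >= |b| or suitably aligned exponents):

#include <math.h>

static double
sum_with_error (double a, double b, double *err)
{
  double s = a + b;
  *err = (a - s) + b; /* exact rounding error of the addition.  */
  return s;
}

static double
prod_error (double a, double r)
{
  return fma (a, r, -(a * r)); /* exact rounding error of a*r.  */
}
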
*/ + double hi = t2 + ar2; + double lo3 = fma (ar, r, -ar2); + double lo4 = t2 - hi + ar2; + /* p = log1p(r) - r - A[0]*r*r. */ + double p = (ar3 + * (As[1] + r * As[2] + + ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6])))); + double lo = lo1 + lo2 + lo3 + lo4 + p; + double y = hi + lo; + *tail = hi - y + lo; + return y; +} + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +special_case (double tmp, uint64_t sbits, uint64_t ki) +{ + double scale, y; + + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + y = 0x1p1009 * (scale + scale * tmp); + return y; + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. */ + scale = asdouble (sbits); + y = scale + scale * tmp; +#if WANT_SIMD_EXCEPT + if (fabs (y) < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double hi, lo, one = 1.0; + if (y < 0.0) + one = -1.0; + lo = scale - y + scale * tmp; + hi = one + y; + lo = one - hi + y + lo; + y = (hi + lo) - one; + /* Fix the sign of 0. */ + if (y == 0.0) + y = asdouble (sbits & 0x8000000000000000); + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } +#endif + y = 0x1p-1022 * y; + return y; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline double +exp_inline (double x, double xtail, uint32_t sign_bias) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - SmallExp >= ThresExp)) + { + if (abstop - SmallExp >= 0x80000000) + { + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return sign_bias ? -1.0 : 1.0; + } + if (abstop >= top12 (1024.0)) + { + /* Note: inf and nan are already handled. */ + /* Skip errno handling. */ +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (sign_bias) + : __math_oflow (sign_bias); +#else + double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY; + return sign_bias ? -res_uoflow : res_uoflow; +#endif + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. 
*/ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return scale + scale * tmp; +} + +/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + A version of exp_inline that is not inlined and for which sign_bias is + equal to 0. */ +static double NOINLINE +exp_nosignbias (double x, double xtail) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - SmallExp >= ThresExp)) + { + /* Avoid spurious underflow for tiny x. */ + if (abstop - SmallExp >= 0x80000000) + return 1.0; + /* Note: inf and nan are already handled. */ + if (abstop >= top12 (1024.0)) +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0); +#else + return asuint64 (x) >> 63 ? 0.0 : INFINITY; +#endif + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return scale + scale * tmp; +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline int +zeroinfnan (uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; +} + +static double NOINLINE +pow_scalar_special_case (double x, double y) +{ + uint32_t sign_bias = 0; + uint64_t ix, iy; + uint32_t topx, topy; + + ix = asuint64 (x); + iy = asuint64 (y); + topx = top12 (x); + topy = top12 (y); + if (unlikely (topx - SmallPowX >= ThresPowX + || (topy & 0x7ff) - SmallPowY >= ThresPowY)) + { + /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0 + and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */ + /* Special cases: (x < 0x1p-126 or inf or nan) or + (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */ + if (unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? 
x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) + || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + double x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + { + x2 = -x2; + sign_bias = 1; + } +#if WANT_SIMD_EXCEPT + if (2 * ix == 0 && iy >> 63) + return __math_divzero (sign_bias); +#endif + return iy >> 63 ? 1 / x2 : x2; + } + /* Here x and y are non-zero finite. */ + if (ix >> 63) + { + /* Finite x < 0. */ + int yint = checkint (iy); + if (yint == 0) +#if WANT_SIMD_EXCEPT + return __math_invalid (x); +#else + return __builtin_nan (""); +#endif + if (yint == 1) + sign_bias = SignBias; + ix &= 0x7fffffffffffffff; + topx &= 0x7ff; + } + if ((topy & 0x7ff) - SmallPowY >= ThresPowY) + { + /* Note: sign_bias == 0 here because y is not odd. */ + if (ix == asuint64 (1.0)) + return 1.0; + /* |y| < 2^-65, x^y ~= 1 + y*log(x). */ + if ((topy & 0x7ff) < SmallPowY) + return 1.0; +#if WANT_SIMD_EXCEPT + return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0) + : __math_uflow (0); +#else + return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0; +#endif + } + if (topx == 0) + { + /* Normalize subnormal x so exponent becomes negative. */ + ix = asuint64 (x * 0x1p52); + ix &= 0x7fffffffffffffff; + ix -= 52ULL << 52; + } + } + + double lo; + double hi = log_inline (ix, &lo); + double ehi = y * hi; + double elo = y * lo + fma (y, hi, -ehi); + return exp_inline (ehi, elo, sign_bias); +} diff --git a/math/aarch64/advsimd/hypot.c b/math/aarch64/advsimd/hypot.c new file mode 100644 index 000000000000..dc01ed5bac93 --- /dev/null +++ b/math/aarch64/advsimd/hypot.c @@ -0,0 +1,95 @@ +/* + * Double-precision vector hypot(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint64x2_t tiny_bound, thres; +} data = { + .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */ + .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */ +}; +#else +static const struct data +{ + uint64x2_t tiny_bound; + uint32x4_t thres; +} data = { + .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */ + .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum, + uint32x2_t special) +{ + return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special)); +} + +/* Vector implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222) + got 0x1.6a1b19400964ep-204 + want 0x1.6a1b19400964dp-204. */ +#if WANT_SIMD_EXCEPT + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t ix = vreinterpretq_u64_f64 (ax); + uint64x2_t iy = vreinterpretq_u64_f64 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. 
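
The special-lane handling that follows narrows as it combines: vaddhn_u64 keeps only the high halves of specialx + specialy, and because each input lane is all-ones or all-zeros, an output lane is nonzero exactly when either input lane was set. A sketch, assuming the library's v_any_u32h helper reduces in roughly this way:

#include <arm_neon.h>

static int
any_special (uint64x2_t specialx, uint64x2_t specialy)
{
  /* High halves of the sums: all-ones + all-ones still yields all-ones.  */
  uint32x2_t m = vaddhn_u64 (specialx, specialy);
  return vget_lane_u64 (vreinterpret_u64_u32 (m), 0) != 0;
}
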
*/ + uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres); + uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f64 (ax, specialx); + ay = v_zerofy_f64 (ay, specialy); + uint32x2_t special = vaddhn_u64 (specialx, specialy); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#else + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y); + + uint32x2_t special + = vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound), + vget_low_u32 (d->thres)); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#endif + +TEST_SIG (V, D, 2, hypot, -10.0, 10.0) +TEST_ULP (V_NAME_D2 (hypot), 1.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT) +TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/math/aarch64/advsimd/hypotf.c b/math/aarch64/advsimd/hypotf.c new file mode 100644 index 000000000000..69634875be5a --- /dev/null +++ b/math/aarch64/advsimd/hypotf.c @@ -0,0 +1,96 @@ +/* + * Single-precision vector hypot(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint32x4_t tiny_bound, thres; +} data = { + .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */ + .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ +}; +#else +static const struct data +{ + uint32x4_t tiny_bound; + uint16x8_t thres; +} data = { + .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */ + .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, + uint16x4_t special) +{ + return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special)); +} + +/* Vector implementation of single-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13 + want 0x1.6a41dp-13. */ +#if WANT_SIMD_EXCEPT + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t ix = vreinterpretq_u32_f32 (ax); + uint32x4_t iy = vreinterpretq_u32_f32 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. 
+  uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
+  uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
+  ax = v_zerofy_f32 (ax, specialx);
+  ay = v_zerofy_f32 (ay, specialy);
+  uint16x4_t special = vaddhn_u32 (specialx, specialy);
+
+  float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
+
+  if (unlikely (v_any_u16h (special)))
+    return special_case (x, y, sqsum, special);
+
+  return vsqrtq_f32 (sqsum);
+}
+#else
+
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
+
+  uint16x4_t special
+      = vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+		  vget_low_u16 (d->thres));
+
+  if (unlikely (v_any_u16h (special)))
+    return special_case (x, y, sqsum, special);
+
+  return vsqrtq_f32 (sqsum);
+}
+#endif
+
+HALF_WIDTH_ALIAS_F2 (hypot)
+
+TEST_SIG (V, F, 2, hypot, -10.0, 10.0)
+TEST_ULP (V_NAME_F2 (hypot), 1.21)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/math/aarch64/advsimd/log.c b/math/aarch64/advsimd/log.c
new file mode 100644
index 000000000000..94e3f4482079
--- /dev/null
+++ b/math/aarch64/advsimd/log.c
@@ -0,0 +1,118 @@
+/*
+ * Double-precision vector log(x) function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+static const struct data
+{
+  uint64x2_t off, sign_exp_mask, offset_lower_bound;
+  uint32x4_t special_bound;
+  float64x2_t c0, c2;
+  double c1, c3, ln2, c4;
+} data = {
+  /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9, 0x1.009p-8 ].  */
+  .c0 = V2 (-0x1.ffffffffffff7p-2),
+  .c1 = 0x1.55555555170d4p-2,
+  .c2 = V2 (-0x1.0000000399c27p-2),
+  .c3 = 0x1.999b2e90e94cap-3,
+  .c4 = -0x1.554e550bd501ep-3,
+  .ln2 = 0x1.62e42fefa39efp-1,
+  .sign_exp_mask = V2 (0xfff0000000000000),
+  .off = V2 (0x3fe6900900000000),
+  /* Lower bound is 0x0010000000000000.  For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound - offset (which wraps around).  */
+  .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+  .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022).  */
+};
+
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  /* Since N is a power of 2, n % N = n & (N - 1).
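
The gather below extracts two scalar indices, loads each {invc, logc} pair with one vld1q_f64, and de-interleaves with vuzp1q/vuzp2q. The index arithmetic itself is just the top table bits of the offset mantissa; in scalar terms:

#include <stdint.h>

static unsigned
log_table_index (uint64_t u_off, unsigned table_bits)
{
  /* Which of the N subintervals of [Off, 2*Off) the significand fell in.  */
  unsigned n = 1u << table_bits;
  return (unsigned) (u_off >> (52 - table_bits)) & (n - 1);
}
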
*/ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Double-precision vector log routine. + The maximum observed error is 2.17 ULP: + _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2); + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1); + y = vfmaq_f64 (p, r2, y); + + if (unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); +} + +TEST_SIG (V, D, 1, log, 0.01, 11.1) +TEST_ULP (V_NAME_D1 (log), 1.67) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000) +TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000) +TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000) diff --git a/math/aarch64/advsimd/log10.c b/math/aarch64/advsimd/log10.c new file mode 100644 index 000000000000..c2b8f1c54f0e --- /dev/null +++ b/math/aarch64/advsimd/log10.c @@ -0,0 +1,132 @@ +/* + * Double-precision vector log10(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; + double invln10, log10_2; + double c1, c3; + float64x2_t c0, c2, c4; +} data = { + /* Computed from log coefficients divided by log(10) then rounded to double + precision. 
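
The entry sequence of the log routine above is worth restating: a single subtraction of Off recentres the significand so that one arithmetic shift yields k and one mask recovers z, with x == 2^k * z exactly and z in [Off, 2*Off) ~= [0.705, 1.41). A scalar equivalent (constants copied from the data struct):

#include <stdint.h>
#include <string.h>

static double
log_split (double x, int64_t *k)
{
  uint64_t u, iz;
  double z;
  memcpy (&u, &x, sizeof u);
  uint64_t u_off = u - 0x3fe6900900000000ull; /* Off.  */
  *k = (int64_t) u_off >> 52;		      /* arithmetic shift.  */
  iz = u - (u_off & 0xfff0000000000000ull);   /* sign_exp_mask.  */
  memcpy (&z, &iz, sizeof z);
  return z; /* z in [Off, 2*Off), exact.  */
}
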
*/ + .c0 = V2 (-0x1.bcb7b1526e506p-3), + .c1 = 0x1.287a7636be1d1p-3, + .c2 = V2 (-0x1.bcb7b158af938p-4), + .c3 = 0x1.63c78734e6d07p-4, + .c4 = V2 (-0x1.287461742fee4p-4), + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */ +}; + +#define N (1 << V_LOG10_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t log10c; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + uint64_t i0 + = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + uint64_t i1 + = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log10c = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Fast implementation of double-precision vector log10 + is a slight modification of double-precision vector log. + Max ULP error: < 2.5 ulp (nearest rounding.) + Maximum measured at 2.46 ulp for x in [0.96, 0.97] + _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 + want 0x1.fff6be3cae4b9p-6. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r / log(10) + log10(c) + k*log10(2). + Constants in v_log10_data.c are computed (in extended precision) as + e.log10c := e.logc * invln10. */ + float64x2_t cte = vld1q_f64 (&d->invln10); + float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0); + + /* y = log10(1+r) + n * log10(2). */ + hi = vfmaq_laneq_f64 (hi, kd, cte, 1); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
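
For reference, the pairwise scheme in the polynomial comment below, spelled out serially in scalar form (A0..A4 correspond to the c0..c4 constants; the vector code interleaves the same operations for instruction-level parallelism):

static double
log_poly (double r, double hi, const double A[5])
{
  double r2 = r * r;
  double p01 = A[0] + r * A[1];
  double p23 = A[2] + r * A[3];
  double q = p23 + r2 * A[4];
  return r2 * (p01 + r2 * q) + hi; /* r2*(A0 + r*A1 + r2*(...)) + hi.  */
}
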
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+  float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+  float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+  y = vfmaq_f64 (y, d->c4, r2);
+  y = vfmaq_f64 (p, y, r2);
+
+  if (unlikely (v_any_u32h (special)))
+    return special_case (hi, u_off, y, r2, special, d);
+  return vfmaq_f64 (hi, y, r2);
+}
+
+TEST_SIG (V, D, 1, log10, 0.01, 11.1)
+TEST_ULP (V_NAME_D1 (log10), 1.97)
+TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)
diff --git a/math/aarch64/advsimd/log10f.c b/math/aarch64/advsimd/log10f.c
new file mode 100644
index 000000000000..907c1051e086
--- /dev/null
+++ b/math/aarch64/advsimd/log10f.c
@@ -0,0 +1,106 @@
+/*
+ * Single-precision vector log10 function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
+  uint32x4_t off, offset_lower_bound;
+  uint16x8_t special_bound;
+  uint32x4_t mantissa_mask;
+  float c1, c3, c5, c7;
+} data = {
+  /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+     [-1/3, 1/3] (offset=2/3).  Max. relative error: 0x1.068ee468p-25.  */
+  .c0 = V4 (-0x1.bcb79cp-3f),
+  .c1 = 0x1.2879c8p-3f,
+  .c2 = V4 (-0x1.bcd472p-4f),
+  .c3 = 0x1.6408f8p-4f,
+  .c4 = V4 (-0x1.246f8p-4f),
+  .c5 = 0x1.f0e514p-5f,
+  .c6 = V4 (-0x1.0fc92cp-4f),
+  .c7 = 0x1.f5f76ap-5f,
+  .ln2 = V4 (0x1.62e43p-1f),
+  .inv_ln10 = V4 (0x1.bcb7b2p-2f),
+  /* Lower bound is the smallest positive normal float 0x00800000.  For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+  .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000).  */
+  .off = V4 (0x3f2aaaab),	/* 0.666667.  */
+  .mantissa_mask = V4 (0x007fffff),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+	      uint16x4_t cmp, const struct data *d)
+{
+  /* Fall back to scalar code.  */
+  return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+		     vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
+}
+
+/* Fast implementation of AdvSIMD log10f,
+   uses a similar approach to AdvSIMD logf with the same offset (i.e., 2/3)
+   and an order 9 polynomial.
+   Maximum error: 3.305 ULP (nearest rounding).
+   _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+				 want 0x1.ffe2f4p-4.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t c1357 = vld1q_f32 (&d->c1);
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint32x4_t u_off = vreinterpretq_u32_f32 (x);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  u_off = vsubq_u32 (u_off, d->off);
+  float32x4_t n = vcvtq_f32_s32 (
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.
*/ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log10(1+r) + n * log10(2). */ + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + + float32x4_t p47 = vfmaq_f32 (c45, r2, c67); + float32x4_t p27 = vfmaq_f32 (c23, r2, p47); + float32x4_t poly = vfmaq_f32 (c01, r2, p27); + + /* y = Log10(2) * n + poly * InvLn(10). */ + float32x4_t y = vfmaq_f32 (r, d->ln2, n); + y = vmulq_f32 (y, d->inv_ln10); + + if (unlikely (v_any_u16h (special))) + return special_case (y, u_off, poly, r2, special, d); + return vfmaq_f32 (y, poly, r2); +} + +HALF_WIDTH_ALIAS_F1 (log10) + +TEST_SIG (V, F, 1, log10, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log10), 2.81) +TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100) +TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100) +TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000) diff --git a/math/aarch64/advsimd/log1p.c b/math/aarch64/advsimd/log1p.c new file mode 100644 index 000000000000..42a0c5793920 --- /dev/null +++ b/math/aarch64/advsimd/log1p.c @@ -0,0 +1,61 @@ +/* + * Double-precision vector log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 0 +#include "v_log1p_inline.h" + +const static struct data +{ + struct v_log1p_data d; + uint64x2_t inf, minus_one; +} data = { .d = V_LOG1P_CONSTANTS_TABLE, + .inf = V2 (0x7ff0000000000000), + .minus_one = V2 (0xbff0000000000000) }; + +#define BottomMask v_u64 (0xffffffff) + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, uint64x2_t cmp, const struct data *d) +{ + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + float64x2_t x_nospecial = v_zerofy_f64 (x, cmp); + return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp); +} + +/* Vector log1p approximation using polynomial on reduced interval. Routine is + a modification of the algorithm used in scalar log1p, with no shortcut for + k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP: + _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2 + want 0x1.fd61d0727429fp+2 . 
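+
+   As a reasoning aid (illustrative): writing 1 + x = 2^k * (1 + f),
+   log1p(x) = k*ln2 + log1p(f), and the inline helper keeps both k and f
+   in double precision before evaluating the polynomial in f.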
*/ +VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + uint64x2_t special_cases + = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one)); + + if (unlikely (v_any_u64 (special_cases))) + return special_case (x, special_cases, d); + + return log1p_inline (x, &d->d); +} + +TEST_SIG (V, D, 1, log1p, -0.9, 10.0) +TEST_ULP (V_NAME_D1 (log1p), 1.95) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000) +TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500) diff --git a/math/aarch64/advsimd/log1pf.c b/math/aarch64/advsimd/log1pf.c new file mode 100644 index 000000000000..94b90249128f --- /dev/null +++ b/math/aarch64/advsimd/log1pf.c @@ -0,0 +1,92 @@ +/* + * Single-precision vector log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +#if WANT_SIMD_EXCEPT + +const static struct data +{ + uint32x4_t minus_one, thresh; + struct v_log1pf_data d; +} data = { + .d = V_LOG1PF_CONSTANTS_TABLE, + .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */ + .minus_one = V4 (0xbf800000), +}; + +/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ +# define TinyBound v_u32 (0x34000000) + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t cmp, const struct data *d) +{ + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + float32x4_t x_nospecial = v_zerofy_f32 (x, cmp); + return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.69 ULP: + _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3 + want 0x1.cfcbdcp-3. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + + uint32x4_t special_cases + = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh), + vcgeq_u32 (ix, d->minus_one)); + + if (unlikely (v_any_u32 (special_cases))) + return special_case (x, special_cases, d); + + return log1pf_inline (x, &d->d); +} + +#else + +const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t cmp) +{ + return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.63 ULP: + _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3 + want 0x1.fdcb16p-3. 
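+
+   The special-case mask below is built with vorn: lanes with x <= -1.0 are
+   combined with every lane that fails |x| <= 0x1p127 (false for huge finite
+   values, inf and NaN), and all of them take the scalar fallback.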
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x) +{ + uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)), + vcaleq_f32 (x, v_f32 (0x1p127f))); + + if (unlikely (v_any_u32 (special_cases))) + return special_case (x, special_cases); + + return log1pf_inline (x, ptr_barrier (&data)); +} + +#endif + +HALF_WIDTH_ALIAS_F1 (log1p) + +TEST_SIG (V, F, 1, log1p, -0.9, 10.0) +TEST_ULP (V_NAME_F1 (log1p), 1.20) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000) +TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000) +TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000) +TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000) diff --git a/math/aarch64/advsimd/log2.c b/math/aarch64/advsimd/log2.c new file mode 100644 index 000000000000..7d2e44dad2c9 --- /dev/null +++ b/math/aarch64/advsimd/log2.c @@ -0,0 +1,123 @@ +/* + * Double-precision vector log2 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; + float64x2_t c0, c2; + double c1, c3, invln2, c4; +} data = { + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. */ + .c0 = V2 (-0x1.71547652b8300p-1), + .c1 = 0x1.ec709dc340953p-2, + .c2 = V2 (-0x1.71547651c8f35p-2), + .c3 = 0x1.2777ebe12dda5p-2, + .c4 = -0x1.ec738d616fe26p-3, + .invln2 = 0x1.71547652b82fep0, + .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ +}; + +#define N (1 << V_LOG2_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t log2c; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + uint64_t i0 + = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + uint64_t i1 + = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log2c = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Double-precision vector log2 routine. Implements the same algorithm as + vector log10, with coefficients and table entries scaled in extended + precision. The maximum observed error is 2.58 ULP: + _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. 
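+     Concretely (illustrative): u_off = asuint64 (x) - Off also wraps
+     around for subnormal and huge inputs, which is what the
+     offset_lower_bound test keys on, and the handler recovers x exactly
+     as asdouble (u_off + Off).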
*/ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2); + float64x2_t hi + = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0); + + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1); + y = vfmaq_f64 (p, r2, y); + + if (unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); +} + +TEST_SIG (V, D, 1, log2, 0.01, 11.1) +TEST_ULP (V_NAME_D1 (log2), 2.09) +TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000) diff --git a/math/aarch64/advsimd/log2f.c b/math/aarch64/advsimd/log2f.c new file mode 100644 index 000000000000..3053c64bc552 --- /dev/null +++ b/math/aarch64/advsimd/log2f.c @@ -0,0 +1,102 @@ +/* + * Single-precision vector log2 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, c2, c4, c6, c8; + uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; + uint32x4_t mantissa_mask; + float c1, c3, c5, c7; +} data = { + /* Coefficients generated using Remez algorithm approximate + log2(1+r)/r for r in [ -1/3, 1/3 ]. + rel error: 0x1.c4c4b0cp-26. */ + .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ + .c1 = -0x1.715458p-1f, + .c2 = V4 (0x1.ec701cp-2f), + .c3 = -0x1.7171a4p-2f, + .c4 = V4 (0x1.27a0b8p-2f), + .c5 = -0x1.e5143ep-3f, + .c6 = V4 (0x1.9d8ecap-3f), + .c7 = -0x1.c675bp-3f, + .c8 = V4 (0x1.9e495p-3f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, + uint16x4_t cmp, const struct data *d) +{ + /* Fall back to scalar code. 
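+     v_call_f32 re-evaluates only the lanes flagged in cmp with the scalar
+     routine and keeps the vector result in all other lanes.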
*/ + return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); +} + +/* Fast implementation for single precision AdvSIMD log2, + relies on same argument reduction as AdvSIMD logf. + Maximum error: 2.48 ULPs + _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log2(1+r) + n. */ + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8); + float32x4_t p48 = vfmaq_f32 (c45, r2, p68); + float32x4_t p28 = vfmaq_f32 (c23, r2, p48); + float32x4_t p = vfmaq_f32 (c01, r2, p28); + + if (unlikely (v_any_u16h (special))) + return special_case (n, u_off, p, r, special, d); + return vfmaq_f32 (n, p, r); +} + +HALF_WIDTH_ALIAS_F1 (log2) + +TEST_SIG (V, F, 1, log2, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log2), 1.99) +TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000) diff --git a/math/aarch64/advsimd/logf.c b/math/aarch64/advsimd/logf.c new file mode 100644 index 000000000000..84705fad05ee --- /dev/null +++ b/math/aarch64/advsimd/logf.c @@ -0,0 +1,88 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + float32x4_t c2, c4, c6, ln2; + uint32x4_t off, offset_lower_bound, mantissa_mask; + uint16x8_t special_bound; + float c1, c3, c5, c0; +} data = { + /* 3.34 ulp error. */ + .c0 = -0x1.3e737cp-3f, + .c1 = 0x1.5a9aa2p-3f, + .c2 = V4 (-0x1.4f9934p-3f), + .c3 = 0x1.961348p-3f, + .c4 = V4 (-0x1.00187cp-2f), + .c5 = 0x1.555d7cp-2f, + .c6 = V4 (-0x1.ffffc8p-2f), + .ln2 = V4 (0x1.62e43p-1f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. 
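+     More precisely, the nearest float to 2/3, i.e. 0x1.555556p-1.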
*/ + .mantissa_mask = V4 (0x007fffff) +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, + uint16x4_t cmp, const struct data *d) +{ + /* Fall back to scalar code. */ + return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); +} + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t c1350 = vld1q_f32 (&d->c1); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + float32x4_t r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); + float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2); + p = vfmaq_laneq_f32 (p, r2, c1350, 3); + + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + if (unlikely (v_any_u16h (cmp))) + return special_case (p, u_off, y, r2, cmp, d); + return vfmaq_f32 (p, y, r2); +} + +HALF_WIDTH_ALIAS_F1 (log) + +TEST_SIG (V, F, 1, log, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log), 2.9) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000) +TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000) +TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000) diff --git a/math/aarch64/advsimd/modf.c b/math/aarch64/advsimd/modf.c new file mode 100644 index 000000000000..da2fcbff8514 --- /dev/null +++ b/math/aarch64/advsimd/modf.c @@ -0,0 +1,33 @@ +/* + * Double-precision vector modf(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modf algorithm. Produces exact values in all rounding modes. */ +float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int) +{ + /* Get integer component of x. */ + float64x2_t rounded = vrndq_f64 (x); + vst1q_f64 (out_int, rounded); + + /* Subtract integer component from input. */ + uint64x2_t remaining = vreinterpretq_u64_f64 (vsubq_f64 (x, rounded)); + + /* Return +0 for integer x. */ + uint64x2_t is_integer = vceqq_f64 (x, rounded); + return vreinterpretq_f64_u64 (vbicq_u64 (remaining, is_integer)); +} + +TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000) + +TEST_ULP (_ZGVnN2vl8_modf_int, 0.0) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000) diff --git a/math/aarch64/advsimd/modff.c b/math/aarch64/advsimd/modff.c new file mode 100644 index 000000000000..0a646b24cb1a --- /dev/null +++ b/math/aarch64/advsimd/modff.c @@ -0,0 +1,34 @@ +/* + * Single-precision vector modf(x, *y) function. 
+ * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modff algorithm. Produces exact values in all rounding modes. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x, + float *out_int) +{ + /* Get integer component of x. */ + float32x4_t rounded = vrndq_f32 (x); + vst1q_f32 (out_int, rounded); + + /* Subtract integer component from input. */ + uint32x4_t remaining = vreinterpretq_u32_f32 (vsubq_f32 (x, rounded)); + + /* Return +0 for integer x. */ + uint32x4_t is_integer = vceqq_f32 (x, rounded); + return vreinterpretq_f32_u32 (vbicq_u32 (remaining, is_integer)); +} + +TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000) + +TEST_ULP (_ZGVnN4vl4_modff_int, 0.0) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000) diff --git a/math/aarch64/advsimd/pow.c b/math/aarch64/advsimd/pow.c new file mode 100644 index 000000000000..db9d6e9ba14b --- /dev/null +++ b/math/aarch64/advsimd/pow.c @@ -0,0 +1,284 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Defines parameters of the approximation and scalar fallback. */ +#include "finite_pow.h" + +#define VecSmallPowX v_u64 (SmallPowX) +#define VecThresPowX v_u64 (ThresPowX) +#define VecSmallPowY v_u64 (SmallPowY) +#define VecThresPowY v_u64 (ThresPowY) + +static const struct data +{ + uint64x2_t inf; + float64x2_t small_powx; + uint64x2_t offset, mask; + uint64x2_t mask_sub_0, mask_sub_1; + float64x2_t log_c0, log_c2, log_c4, log_c5; + double log_c1, log_c3; + double ln2_lo, ln2_hi; + uint64x2_t small_exp, thres_exp; + double ln2_lo_n, ln2_hi_n; + double inv_ln2_n, exp_c2; + float64x2_t exp_c0, exp_c1; +} data = { + /* Power threshold. */ + .inf = V2 (0x7ff0000000000000), + .small_powx = V2 (0x1p-126), + .offset = V2 (Off), + .mask = V2 (0xfffULL << 52), + .mask_sub_0 = V2 (1ULL << 52), + .mask_sub_1 = V2 (52ULL << 52), + /* Coefficients copied from v_pow_log_data.c + relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. */ + .log_c0 = V2 (0x1.555555555556p-2 * -2), + .log_c1 = -0x1.0000000000006p-2 * -2, + .log_c2 = V2 (0x1.999999959554ep-3 * 4), + .log_c3 = -0x1.555555529a47ap-3 * 4, + .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8), + .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8), + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 + (0.550 without fma) if |x| < ln2/512. */ + .exp_c0 = V2 (0x1.fffffffffffd4p-2), + .exp_c1 = V2 (0x1.5555571d6ef9p-3), + .exp_c2 = 0x1.5555576a5adcep-5, + .small_exp = V2 (0x3c90000000000000), + .thres_exp = V2 (0x03f0000000000000), + .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */ + .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */ + .ln2_lo_n = -0x1.c610ca86c3899p-45, +}; + +/* This version implements an algorithm close to scalar pow but + - does not implement the trick in the exp's specialcase subroutine to avoid + double-rounding, + - does not use a tail in the exponential core computation, + - and pow's exp polynomial order and table bits might differ. 
+ + Maximum measured error is 1.04 ULPs: + _ZGVnN2vv_pow(0x1.024a3e56b3c3p-136, 0x1.87910248b58acp-13) + got 0x1.f71162f473251p-1 + want 0x1.f71162f473252p-1. */ + +static inline float64x2_t +v_masked_lookup_f64 (const double *table, uint64x2_t i) +{ + return (float64x2_t){ + table[(i[0] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)], + table[(i[1] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)] + }; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline float64x2_t +v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) +{ + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + uint64x2_t tmp = vsubq_u64 (ix, d->offset); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + float64x2_t kd = vcvtq_f64_s64 (k); + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + float64x2_t invc = v_masked_lookup_f64 (__v_pow_log_data.invc, tmp); + float64x2_t logc = v_masked_lookup_f64 (__v_pow_log_data.logc, tmp); + float64x2_t logctail = v_masked_lookup_f64 (__v_pow_log_data.logctail, tmp); + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc); + /* k*Ln2 + log(c) + r. */ + float64x2_t ln2 = vld1q_f64 (&d->ln2_lo); + float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1); + float64x2_t t2 = vaddq_f64 (t1, r); + float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0); + float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); + /* Evaluation is optimized assuming superscalar pipelined execution. */ + float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r); + float64x2_t ar2 = vmulq_f64 (r, ar); + float64x2_t ar3 = vmulq_f64 (r, ar2); + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + float64x2_t hi = vaddq_f64 (t2, ar2); + float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); + float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); + /* p = log1p(r) - r - A[0]*r*r. */ + float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1); + float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5); + float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1); + float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0); + float64x2_t p = vfmaq_f64 (a34, ar2, a56); + p = vfmaq_f64 (a12, ar2, p); + p = vmulq_f64 (ar3, p); + float64x2_t lo + = vaddq_f64 (vaddq_f64 (vaddq_f64 (vaddq_f64 (lo1, lo2), lo3), lo4), p); + float64x2_t y = vaddq_f64 (hi, lo); + *tail = vaddq_f64 (vsubq_f64 (hi, y), lo); + return y; +} + +static float64x2_t VPCS_ATTR NOINLINE +exp_special_case (float64x2_t x, float64x2_t xtail) +{ + return (float64x2_t){ exp_nosignbias (x[0], xtail[0]), + exp_nosignbias (x[1], xtail[1]) }; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */ +static inline float64x2_t +v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d) +{ + /* Fallback to scalar exp_inline for all lanes if any lane + contains value of x s.t. |x| <= 2^-54 or >= 512. 
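+     A single unsigned compare covers both ends: asuint64 (|x|) minus
+     small_exp (asuint64 (0x1p-54)) wraps around for |x| < 0x1p-54, so it
+     is >= thres_exp (asuint64 (512) - small_exp) exactly for the
+     out-of-range lanes.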
*/ + uint64x2_t uoflowx = vcgeq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp), + d->thres_exp); + if (unlikely (v_any_u64 (uoflowx))) + return exp_special_case (x, vnegq_f64 (neg_xtail)); + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n); + float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0); + float64x2_t kd = vrndnq_f64 (z); + uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z)); + float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n); + float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1); + r = vfmsq_laneq_f64 (r, kd, ln2_n, 0); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = vsubq_f64 (r, neg_xtail); + /* 2^(k/N) ~= scale. */ + uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1)); + uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64x2_t sbits = v_lookup_u64 (SBits, idx); + sbits = vaddq_u64 (sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1); + tmp = vfmaq_f64 (d->exp_c0, r, tmp); + tmp = vfmaq_f64 (r, r2, tmp); + float64x2_t scale = vreinterpretq_f64_u64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return vfmaq_f64 (scale, scale, tmp); +} + +static float64x2_t NOINLINE VPCS_ATTR +scalar_fallback (float64x2_t x, float64x2_t y) +{ + return (float64x2_t){ pow_scalar_special_case (x[0], y[0]), + pow_scalar_special_case (x[1], y[1]) }; +} + +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + /* Case of x <= 0 is too complicated to be vectorised efficiently here, + fallback to scalar pow for all lanes if any x < 0 detected. */ + if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x)))) + return scalar_fallback (x, y); + + uint64x2_t vix = vreinterpretq_u64_f64 (x); + uint64x2_t viy = vreinterpretq_u64_f64 (y); + uint64x2_t iay = vandq_u64 (viy, d->inf); + + /* Special cases of x or y. */ +#if WANT_SIMD_EXCEPT + /* Small or large. */ + uint64x2_t vtopx = vshrq_n_u64 (vix, 52); + uint64x2_t vabstopy = vshrq_n_u64 (iay, 52); + uint64x2_t specialx + = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX); + uint64x2_t specialy + = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY); +#else + /* The case y==0 does not trigger a special case, since in this case it is + necessary to fix the result only if x is a signalling nan, which already + triggers a special case. We test y==0 directly in the scalar fallback. */ + uint64x2_t iax = vandq_u64 (vix, d->inf); + uint64x2_t specialx = vcgeq_u64 (iax, d->inf); + uint64x2_t specialy = vcgeq_u64 (iay, d->inf); +#endif + uint64x2_t special = vorrq_u64 (specialx, specialy); + /* Fallback to scalar on all lanes if any lane is inf or nan. */ + if (unlikely (v_any_u64 (special))) + return scalar_fallback (x, y); + + /* Small cases of x: |x| < 0x1p-126. */ + uint64x2_t smallx = vcaltq_f64 (x, d->small_powx); + if (unlikely (v_any_u64 (smallx))) + { + /* Update ix if top 12 bits of x are 0. */ + uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52)); + if (unlikely (v_any_u64 (sub_x))) + { + /* Normalize subnormal x so exponent becomes negative. 
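+	     Multiplying by 0x1p52 (mask_sub_0 converted to double) makes
+	     the value normal; subtracting 52 from the biased exponent
+	     field (mask_sub_1) then rebiases it, e.g. a hypothetical
+	     x = 0x1p-1060 is rescaled to 0x1p-1008 before the exponent
+	     field is corrected.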
*/ + uint64x2_t vix_norm = vreinterpretq_u64_f64 ( + vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0)))); + vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1); + vix = vbslq_u64 (sub_x, vix_norm, vix); + } + } + + /* Vector Log(ix, &lo). */ + float64x2_t vlo; + float64x2_t vhi = v_log_inline (vix, &vlo, d); + + /* Vector Exp(y_loghi, y_loglo). */ + float64x2_t vehi = vmulq_f64 (y, vhi); + float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); + float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo); + return v_exp_inline (vehi, neg_velo, d); +} + +TEST_SIG (V, D, 2, pow) +TEST_ULP (V_NAME_D2 (pow), 0.55) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT) +/* Wide intervals spanning the whole domain but shared between x and y. */ +#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define EXPAND(str) str##000000000 +#define SHL52(str) EXPAND (str) +V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) +V_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000) +V_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000) +V_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000) +V_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000) +V_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000) +V_POW_INTERVAL2 (0, inf, 0, inf, 1000) +/* x~1 or y~1. */ +V_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000) +V_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) +V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) +/* around argmaxs of ULP error. */ +V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) +V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) +/* x is negative, y is odd or even integer, or y is real not integer. */ +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +/* 1.0^y. */ +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) diff --git a/math/aarch64/advsimd/powf.c b/math/aarch64/advsimd/powf.c new file mode 100644 index 000000000000..47f74cf38ab0 --- /dev/null +++ b/math/aarch64/advsimd/powf.c @@ -0,0 +1,209 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. */ +#define MantissaMask v_u32 (0x007fffff) + +#define A d->log2_poly +#define C d->exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
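+   With the relative errors quoted alongside the tables below
+   (relerr_log2 ~ 1.5*2^-30 and relerr_exp2 ~ 1.69*2^-34), this evaluates
+   to roughly 0.5 + 2^24 * 1.24e-7 ~ 2.6, matching the bound above.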
*/ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct data +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + float64x2_t log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + float64x2_t exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. */ + V2 (-0x1.6ff5daa3b3d7cp-2 * Scale), + V2 (0x1.ec81d03c01aebp-2 * Scale), + V2 (-0x1.71547bb43f101p-1 * Scale), + V2 (0x1.7154764a815cbp0 * Scale)}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
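+     Each coefficient is pre-divided by a power of Scale because ylogx is
+     computed scaled up by Scale; the polynomial in powf_core can then be
+     evaluated directly in r = ylogx - round(ylogx).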
*/ + V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale), + V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale), + V2 (0x1.62e42ff0c52d6p-1 / Scale)}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +static inline float64x2_t +ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k, + float64x2_t invc, float64x2_t logc, float64x2_t y) +{ + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc); + float64x2_t y0 = vaddq_f64 (logc, k); + + /* Polynomial to approximate log1p(r)/ln2. */ + float64x2_t logx = vfmaq_f64 (A[1], r, A[0]); + logx = vfmaq_f64 (A[2], logx, r); + logx = vfmaq_f64 (A[3], logx, r); + logx = vfmaq_f64 (y0, logx, r); + + return vmulq_f64 (logx, y); +} + +static inline float64x2_t +log2_lookup (const struct data *d, uint32_t i) +{ + return vld1q_f64 ( + &d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc); +} + +static inline uint64x1_t +exp2f_lookup (const struct data *d, uint64_t i) +{ + return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]); +} + +static inline float32x2_t +powf_core (const struct data *d, float64x2_t ylogx) +{ + /* N*x = k + r with r in [-1/2, 1/2]. */ + float64x2_t kd = vrndnq_f64 (ylogx); + int64x2_t ki = vcvtaq_s64_f64 (ylogx); + float64x2_t r = vsubq_f64 (ylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)), + exp2f_lookup (d, vgetq_lane_s64 (ki, 1))); + t = vaddq_u64 ( + t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS))); + float64x2_t s = vreinterpretq_f64_u64 (t); + float64x2_t p = vfmaq_f64 (C[1], r, C[0]); + p = vfmaq_f64 (C[2], r, p); + p = vfmaq_f64 (s, p, vmulq_f64 (s, r)); + return vcvt_f32_f64 (p); +} + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top)); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + /* Use double precision for each lane: split input vectors into lo and hi + halves and promote. 
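+     Each float32x4_t argument therefore yields a low and a high
+     float64x2_t half; the four per-lane table lookups below are separated
+     into invc/logc vectors with vzip1q/vzip2q afterwards.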
*/ + float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)), + tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)), + tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)), + tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3)); + + float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)), + iz_hi = vcvt_high_f64_f32 (iz); + + float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))), + k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k)); + + float64x2_t invc_lo = vzip1q_f64 (tab0, tab1), + invc_hi = vzip1q_f64 (tab2, tab3), + logc_lo = vzip2q_f64 (tab0, tab1), + logc_hi = vzip2q_f64 (tab2, tab3); + + float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)), + y_hi = vcvt_high_f64_f32 (y); + + float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo); + float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi); + + uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo), + vreinterpretq_u32_f64 (ylogx_hi)); + + cmp = vorrq_u32 ( + cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)), + vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) + >> 47))); + + float32x2_t p_lo = powf_core (d, ylogx_lo); + float32x2_t p_hi = powf_core (d, ylogx_hi); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp); + return vcombine_f32 (p_lo, p_hi); +} + +HALF_WIDTH_ALIAS_F2 (pow) + +TEST_SIG (V, F, 2, pow) +TEST_ULP (V_NAME_F2 (pow), 2.1) +TEST_DISABLE_FENV (V_NAME_F2 (pow)) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000) diff --git a/math/aarch64/v_sin.c b/math/aarch64/advsimd/sin.c index 04129c31133d..0461bbb99405 100644 --- a/math/aarch64/v_sin.c +++ b/math/aarch64/advsimd/sin.c @@ -1,17 +1,19 @@ /* * Double-precision vector sin function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "test_defs.h" +#include "test_sig.h" #include "mathlib.h" #include "v_math.h" static const struct data { float64x2_t poly[7]; - float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), @@ -23,12 +25,13 @@ static const struct data .pi_1 = V2 (0x1.921fb54442d18p+1), .pi_2 = V2 (0x1.1a62633145c06p-53), .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), }; #if WANT_SIMD_EXCEPT -# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ -# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ +/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */ +# define TinyBound v_u64 (0x3020000000000000) +/* RangeVal - TinyBound. */ +# define Thresh v_u64 (0x1160000000000000) #endif #define C(i) d->poly[i] @@ -61,16 +64,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) fenv). These lanes will be fixed by special-case handler later. 
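+     (With the bic form used here, special lanes are cleared to +0.0
+     instead, which is likewise neutral w.r.t. fenv and avoids
+     materialising the comparison mask as a floating-point value.)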
*/ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); - r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); + r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp)); #else r = x; cmp = vcageq_f64 (x, d->range_val); #endif /* n = rint(|x|/pi). */ - n = vfmaq_f64 (d->shift, d->inv_pi, r); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); + n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = vfmsq_f64 (r, d->pi_1, n); @@ -95,3 +97,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) return special_case (x, y, odd, cmp); return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); } + +TEST_SIG (V, D, 1, sin, -3.1, 3.1) +TEST_ULP (V_NAME_D1 (sin), 3.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000) diff --git a/math/aarch64/advsimd/sincos.c b/math/aarch64/advsimd/sincos.c new file mode 100644 index 000000000000..83bfa45efa98 --- /dev/null +++ b/math/aarch64/advsimd/sincos.c @@ -0,0 +1,67 @@ +/* + * Double-precision vector sincos function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincos declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/. */ +#define _GNU_SOURCE +#include <math.h> + +#include "v_math.h" +#include "test_defs.h" +#include "v_sincos_common.h" + +/* sincos not available for all scalar libm implementations. */ +#if defined(_MSC_VER) || !defined(__GLIBC__) +static void +sincos (double x, double *out_sin, double *out_cos) +{ + *out_sin = sin (x); + *out_cos = cos (x); +} +#endif + +static void VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, double *out_sin, + double *out_cos) +{ + if (special[0]) + sincos (x[0], out_sin, out_cos); + if (special[1]) + sincos (x[1], out_sin + 1, out_cos + 1); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. 
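*/
+
+/* Hypothetical usage sketch (illustrative only, not part of this patch):
+   both results are written through the output pointers, two lanes each. */
+#if 0
+static void
+sincos_usage_example (void)
+{
+  double s[2], c[2];
+  float64x2_t x = { 0.25, -3.0 };
+  _ZGVnN2vl8l8_sincos (x, s, c);
+  /* Now s[i] ~= sin (x[i]) and c[i] ~= cos (x[i]). */
+}
+#endif
+
+/* Vector entry point: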
*/ +VPCS_ATTR void +_ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos) +{ + const struct v_sincos_data *d = ptr_barrier (&v_sincos_data); + uint64x2_t special = check_ge_rangeval (x, d); + + float64x2x2_t sc = v_sincos_inline (x, d); + + vst1q_f64 (out_sin, sc.val[0]); + vst1q_f64 (out_cos, sc.val[1]); + + if (unlikely (v_any_u64 (special))) + special_case (x, special, out_sin, out_cos); +} + +TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos) +TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin) +TEST_ULP (_ZGVnN2v_sincos_sin, 2.73) +TEST_ULP (_ZGVnN2v_sincos_cos, 2.73) +#define V_SINCOS_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n) +V_SINCOS_INTERVAL (0, 0x1p-31, 50000) +V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000) +V_SINCOS_INTERVAL (0x1p23, inf, 10000) diff --git a/math/aarch64/advsimd/sincosf.c b/math/aarch64/advsimd/sincosf.c new file mode 100644 index 000000000000..cd482f38d5f6 --- /dev/null +++ b/math/aarch64/advsimd/sincosf.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector sincos function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincosf declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/. */ +#define _GNU_SOURCE +#include <math.h> + +#include "v_sincosf_common.h" +#include "v_math.h" +#include "test_defs.h" + +/* sincos not available for all scalar libm implementations. */ +#if defined(_MSC_VER) || !defined(__GLIBC__) +static void +sincosf (float x, float *out_sin, float *out_cos) +{ + *out_sin = sinf (x); + *out_cos = cosf (x); +} +#endif + +static void VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, float *out_sin, + float *out_cos) +{ + for (int i = 0; i < 4; i++) + if (special[i]) + sincosf (x[i], out_sin + i, out_cos + i); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +VPCS_ATTR void +_ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos) +{ + const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data); + uint32x4_t special = check_ge_rangeval (x, d); + + float32x4x2_t sc = v_sincosf_inline (x, d); + + vst1q_f32 (out_sin, sc.val[0]); + vst1q_f32 (out_cos, sc.val[1]); + + if (unlikely (v_any_u32 (special))) + special_case (x, special, out_sin, out_cos); +} + +TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin) +TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos) +TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17) +TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31) +#define V_SINCOSF_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n) +V_SINCOSF_INTERVAL (0, 0x1p-31, 50000) +V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000) +V_SINCOSF_INTERVAL (0x1p20, inf, 10000) diff --git a/math/aarch64/advsimd/sincospi.c b/math/aarch64/advsimd/sincospi.c new file mode 100644 index 000000000000..fd425202ce67 --- /dev/null +++ b/math/aarch64/advsimd/sincospi.c @@ -0,0 +1,44 @@ +/* + * Double-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_sincospi_common.h" +#include "v_math.h" +#include "test_defs.h" + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using separate argument reduction and shared low-order + polynomials. + Approximation for vector double-precision sincospi(x). + Maximum Error 3.09 ULP: + _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1 + want 0x1.fd54d0b327cf4p-1 + Maximum Error 3.16 ULP: + _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1 + want 0x1.fd2da484ff402p-1. */ +VPCS_ATTR void +_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos) +{ + const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data); + + float64x2x2_t sc = v_sincospi_inline (x, d); + + vst1q_f64 (out_sin, sc.val[0]); + vst1q_f64 (out_cos, sc.val[1]); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos) +TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin) +TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59) +TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66) +# define V_SINCOSPI_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n) +V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000) +V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000) +V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000) +V_SINCOSPI_INTERVAL (0x1p63, inf, 10000) +#endif diff --git a/math/aarch64/advsimd/sincospif.c b/math/aarch64/advsimd/sincospif.c new file mode 100644 index 000000000000..760ea3d4f5e1 --- /dev/null +++ b/math/aarch64/advsimd/sincospif.c @@ -0,0 +1,43 @@ +/* + * Single-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincospif_common.h" +#include "v_math.h" +#include "test_defs.h" +#include "mathlib.h" + +/* Single-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.04 ULP: + _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. + Worst-case error for cos is 3.18 ULP: + _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. + */ +VPCS_ATTR void +_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos) +{ + const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data); + + float32x4x2_t sc = v_sincospif_inline (x, d); + + vst1q_f32 (out_sin, sc.val[0]); + vst1q_f32 (out_cos, sc.val[1]); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin) +TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos) +TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54) +TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68) +# define V_SINCOSPIF_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n) +V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000) +V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000) +V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000) +V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000) +#endif diff --git a/math/aarch64/v_sinf.c b/math/aarch64/advsimd/sinf.c index 336879844459..0764434039a0 100644 --- a/math/aarch64/v_sinf.c +++ b/math/aarch64/advsimd/sinf.c @@ -1,17 +1,19 @@ /* * Single-precision vector sin function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { float32x4_t poly[4]; - float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* 1.886 ulp error. */ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), @@ -22,13 +24,14 @@ static const struct data .pi_3 = V4 (-0x1.ee59dap-49f), .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), .range_val = V4 (0x1p20f) }; #if WANT_SIMD_EXCEPT -# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ -# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ +/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */ +# define TinyBound v_u32 (0x22000000) +/* RangeVal - TinyBound. */ +# define Thresh v_u32 (0x27800000) #endif #define C(i) d->poly[i] @@ -41,7 +44,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) return v_call_f32 (sinf, x, y, cmp); } -float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x) { const struct data *d = ptr_barrier (&data); float32x4_t n, r, r2, y; @@ -53,23 +56,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) /* If fenv exceptions are to be triggered correctly, set any special lanes to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by special-case handler later. */ - r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); + r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp)); #else r = x; cmp = vcageq_f32 (x, d->range_val); #endif - /* n = rint(|x|/pi) */ - n = vfmaq_f32 (d->shift, d->inv_pi, r); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); + /* n = rint(|x|/pi). */ + n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = vfmsq_f32 (r, d->pi_1, n); r = vfmsq_f32 (r, d->pi_2, n); r = vfmsq_f32 (r, d->pi_3, n); - /* y = sin(r) */ + /* y = sin(r). */ r2 = vmulq_f32 (r, r); y = vfmaq_f32 (C (2), C (3), r2); y = vfmaq_f32 (C (1), y, r2); @@ -80,3 +82,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) return special_case (x, y, odd, cmp); return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); } + +HALF_WIDTH_ALIAS_F1 (sin) + +TEST_SIG (V, F, 1, sin, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (sin), 1.4) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000) diff --git a/math/aarch64/advsimd/sinh.c b/math/aarch64/advsimd/sinh.c new file mode 100644 index 000000000000..f65ccd0c6270 --- /dev/null +++ b/math/aarch64/advsimd/sinh.c @@ -0,0 +1,80 @@ +/* + * Double-precision vector sinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1_inline.h" + +static const struct data +{ + struct v_expm1_data d; + uint64x2_t halff; +#if WANT_SIMD_EXCEPT + uint64x2_t tiny_bound, thresh; +#else + float64x2_t large_bound; +#endif +} data = { + .d = V_EXPM1_DATA, + .halff = V2 (0x3fe0000000000000), +#if WANT_SIMD_EXCEPT + /* 2^-26, below which sinh(x) rounds to x. */ + .tiny_bound = V2 (0x3e50000000000000), + /* asuint(large_bound) - asuint(tiny_bound). */ + .thresh = V2 (0x0230000000000000), +#else + /* 2^9. expm1 helper overflows for large input. */ + .large_bound = V2 (0x1p+9), +#endif +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x) +{ + return v_call_f64 (sinh, x, x, v_u64 (-1)); +} + +/* Approximation for vector double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.52 ULP: + _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2 + want -0x1.ac2f05bb66fc9p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + float64x2_t halfsign = vreinterpretq_f64_u64 ( + vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff)); + +#if WANT_SIMD_EXCEPT + uint64x2_t special = vcgeq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh); +#else + uint64x2_t special = vcageq_f64 (x, d->large_bound); +#endif + + /* Fall back to scalar variant for all lanes if any of them are special. */ + if (unlikely (v_any_u64 (special))) + return special_case (x); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. This allows us to + retain acceptable accuracy for very small inputs. */ + float64x2_t t = expm1_inline (ax, &d->d); + t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0)))); + return vmulq_f64 (t, halfsign); +} + +TEST_SIG (V, D, 1, sinh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (sinh), 2.02) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000) diff --git a/math/aarch64/advsimd/sinhf.c b/math/aarch64/advsimd/sinhf.c new file mode 100644 index 000000000000..12dbe26b425b --- /dev/null +++ b/math/aarch64/advsimd/sinhf.c @@ -0,0 +1,84 @@ +/* + * Single-precision vector sinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1f_inline.h" + +static const struct data +{ + struct v_expm1f_data expm1f_consts; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound, thresh; +#else + float32x4_t oflow_bound; +#endif +} data = { + .expm1f_consts = V_EXPM1F_DATA, +#if WANT_SIMD_EXCEPT + /* 0x1.6a09e8p-32, below which expm1f underflows. */ + .tiny_bound = V4 (0x2fb504f4), + /* asuint(oflow_bound) - asuint(tiny_bound). */ + .thresh = V4 (0x12fbbbb3), +#else + /* 0x1.61814ep+6, above which expm1f helper overflows. 
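+     Its bit pattern is 0x42b0c0a7, the same cut-off used by the test
+     intervals at the end of the file.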
*/
+  .oflow_bound = V4 (0x1.61814ep+6),
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+              uint32x4_t special)
+{
+  return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
+}
+
+/* Approximation for vector single-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The maximum error is 2.26 ULP:
+   _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+                                  want 0x1.e469e4p-4.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
+  float32x4_t ax = vabsq_f32 (x);
+  float32x4_t halfsign = vreinterpretq_f32_u32 (
+      vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
+
+#if WANT_SIMD_EXCEPT
+  uint32x4_t special = vcgeq_u32 (
+      vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
+  ax = v_zerofy_f32 (ax, special);
+#else
+  uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
+#endif
+
+  /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+     using a slight rearrangement of the definition of sinh. This allows us
+     to retain acceptable accuracy for very small inputs.  */
+  float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+  t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+  /* Fall back to the scalar variant for any lanes that should trigger an
+     exception.  */
+  if (unlikely (v_any_u32 (special)))
+    return special_case (x, t, halfsign, special);
+
+  return vmulq_f32 (t, halfsign);
+}
+
+HALF_WIDTH_ALIAS_F1 (sinh)
+
+TEST_SIG (V, F, 1, sinh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (sinh), 1.76)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
diff --git a/math/aarch64/advsimd/sinpi.c b/math/aarch64/advsimd/sinpi.c
new file mode 100644
index 000000000000..f86d167a2ac3
--- /dev/null
+++ b/math/aarch64/advsimd/sinpi.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector sinpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float64x2_t poly[10];
+} data = {
+  /* Polynomial coefficients generated using Remez algorithm,
+     see sinpi.sollya for details.  */
+  .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+            V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+            V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+            V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+            V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64).  */
+/* asuint64(0x1p64) - TinyBound.  */
+# define Thresh v_u64 (0x07f0000000000000)
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+  /* Fall back to scalar code.  */
+  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+  return v_call_f64 (arm_math_sinpi, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector double-precision sinpi(x).
+   Maximum Error 3.05 ULP:
+   _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1
+                                        want 0x1.fb295878301cap-1.
*/
+float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+
+  /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+     to avoid them under/overflowing and throwing exceptions.  */
+  float64x2_t r = v_zerofy_f64 (x, cmp);
+#else
+  float64x2_t r = x;
+#endif
+
+  /* If r rounds to an odd integer, the sign of the result should be
+     inverted.  */
+  uint64x2_t odd
+      = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+  /* r = x - rint(x). Range reduction to -1/2 .. 1/2.  */
+  r = vsubq_f64 (r, vrndaq_f64 (r));
+
+  /* y = sin(r).  */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t r4 = vmulq_f64 (r2, r2);
+  float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+  if (unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, odd, cmp);
+#endif
+
+  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (V_NAME_D1 (sinpi), 2.56)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
+#endif
diff --git a/math/aarch64/advsimd/sinpif.c b/math/aarch64/advsimd/sinpif.c
new file mode 100644
index 000000000000..98ba9d84d2fb
--- /dev/null
+++ b/math/aarch64/advsimd/sinpif.c
@@ -0,0 +1,84 @@
+/*
+ * Single-precision vector sinpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float32x4_t poly[6];
+} data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+            V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f).  */
+# define Thresh v_u32 (0x1f000000)    /* asuint32(0x1p31f) - TinyBound.  */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+  /* Fall back to scalar code.  */
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (arm_math_sinpif, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector single-precision sinpi(x)
+   Maximum Error 3.03 ULP:
+   _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
+                                  want 0x1.f7cd5p-1.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+  uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+  uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+
+  /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+     to avoid them under/overflowing and throwing exceptions.  */
+  float32x4_t r = v_zerofy_f32 (x, cmp);
+#else
+  float32x4_t r = x;
+#endif
+
+  /* If r rounds to an odd integer, the sign of the result should be
+     inverted.  */
+  uint32x4_t odd
+      = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+  /* r = x - rint(x). Range reduction to -1/2 .. 1/2.  */
+  r = vsubq_f32 (r, vrndaq_f32 (r));
+
+  /* Pairwise Horner approximation for y = sin(r * pi).
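+     As a rough sketch of how v_pw_horner_5_f32 groups the six
+     coefficients (up to fma ordering):
+       P(r2) ~ (c0 + c1 r2) + r4 ((c2 + c3 r2) + r4 (c4 + c5 r2)),
+     which is then multiplied by r below.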
*/ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); +#endif + + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +HALF_WIDTH_ALIAS_F1 (sinpi) + +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_F1 (sinpi), 2.54) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000) +#endif diff --git a/math/aarch64/advsimd/tan.c b/math/aarch64/advsimd/tan.c new file mode 100644 index 000000000000..957f9aba3a1e --- /dev/null +++ b/math/aarch64/advsimd/tan.c @@ -0,0 +1,122 @@ +/* + * Double-precision vector tan(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[9]; + double half_pi[2]; + float64x2_t two_over_pi, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t range_val; +#endif +} data = { + /* Coefficients generated using FPMinimax. */ + .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3), + V2 (0x1.ba1ba1bb46414p-5), V2 (0x1.664f47e5b5445p-6), + V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9), + V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11), + V2 (0x1.4e4fd14147622p-12) }, + .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 }, + .two_over_pi = V2 (0x1.45f306dc9c883p-1), + .shift = V2 (0x1.8p52), +#if !WANT_SIMD_EXCEPT + .range_val = V2 (0x1p23), +#endif +}; + +#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */ +#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */ +#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */ + +/* Special cases (fall back to scalar calls). */ +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x) +{ + return v_call_f64 (tan, x, x, v_u64 (-1)); +} + +/* Vector approximation for double-precision tan. + Maximum measured error is 3.48 ULP: + _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + /* Our argument reduction cannot calculate q with sufficient accuracy for + very large inputs. Fall back to scalar routine for all lanes if any are + too large, or Inf/NaN. If fenv exceptions are expected, also fall back for + tiny input to avoid underflow. */ +#if WANT_SIMD_EXCEPT + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + /* iax - tiny_bound > range_val - tiny_bound. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh)); + if (unlikely (v_any_u64 (special))) + return special_case (x); +#endif + + /* q = nearest integer to 2 * x / pi. */ + float64x2_t q + = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift); + int64x2_t qi = vcvtq_s64_f64 (q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. 
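+     pi/2 is split across the two entries of half_pi[], so the reduction
+     below is two fused multiply-subtracts. A rough scalar sketch of the
+     same step (half_pi_hi/half_pi_lo naming those two entries):
+       r = x;
+       r = fma (-q, half_pi_hi, r);
+       r = fma (-q, half_pi_lo, r);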
*/
+  float64x2_t r = x;
+  float64x2_t half_pi = vld1q_f64 (dat->half_pi);
+  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  r = vfmsq_laneq_f64 (r, q, half_pi, 1);
+  /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
+     formula.  */
+  r = vmulq_n_f64 (r, 0.5);
+
+  /* Approximate tan(r) using order 8 polynomial.
+     tan(x) is odd, so polynomial has the form:
+     tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+     Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+     Then compute the approximation by:
+     tan(r) ~= r + r^3 * (C0 + r^2 * P(r)).  */
+  float64x2_t r2 = vmulq_f64 (r, r), r4 = vmulq_f64 (r2, r2),
+              r8 = vmulq_f64 (r4, r4);
+  /* Offset coefficients to evaluate from C1 onwards.  */
+  float64x2_t p = v_estrin_7_f64 (r2, r4, r8, dat->poly + 1);
+  p = vfmaq_f64 (dat->poly[0], p, r2);
+  p = vfmaq_f64 (r, r2, vmulq_f64 (p, r));
+
+  /* Recombination uses double-angle formula:
+     tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
+     and reciprocity around pi/2:
+     tan(x) = 1 / (tan(pi/2 - x))
+     to assemble result using change-of-sign and conditional selection of
+     numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
+  */
+  float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
+  float64x2_t d = vaddq_f64 (p, p);
+
+  uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
+
+#if !WANT_SIMD_EXCEPT
+  uint64x2_t special = vcageq_f64 (x, dat->range_val);
+  if (unlikely (v_any_u64 (special)))
+    return special_case (x);
+#endif
+
+  return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
+                    vbslq_f64 (no_recip, d, n));
+}
+
+TEST_SIG (V, D, 1, tan, -3.1, 3.1)
+TEST_ULP (V_NAME_D1 (tan), 2.99)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
diff --git a/math/aarch64/advsimd/tanf.c b/math/aarch64/advsimd/tanf.c
new file mode 100644
index 000000000000..ed5448649f6c
--- /dev/null
+++ b/math/aarch64/advsimd/tanf.c
@@ -0,0 +1,130 @@
+/*
+ * Single-precision vector tan(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "v_poly_f32.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float32x4_t poly[6];
+  float pi_consts[4];
+  float32x4_t shift;
+#if !WANT_SIMD_EXCEPT
+  float32x4_t range_val;
+#endif
+} data = {
+  /* Coefficients generated using FPMinimax.  */
+  .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
+            V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
+  /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi.  */
+  .pi_consts
+      = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
+  .shift = V4 (0x1.8p+23f),
+#if !WANT_SIMD_EXCEPT
+  .range_val = V4 (0x1p15f),
+#endif
+};
+
+#define RangeVal v_u32 (0x47000000)  /* asuint32(0x1p15f).  */
+#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f).  */
+#define Thresh v_u32 (0x16000000)    /* asuint32(RangeVal) - TinyBound.  */
+
+/* Special cases (fall back to scalar calls).  */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+  return v_call_f32 (tanf, x, y, cmp);
+}
+
+/* Use a full Estrin scheme to evaluate polynomial.
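+   For the degree-5 polynomial in z used here, v_estrin_5_f32 computes,
+   roughly (up to fma ordering):
+     P(z) ~ (c0 + c1 z) + z2 (c2 + c3 z) + z4 (c4 + c5 z),
+   trading a couple of extra multiplies for a shorter dependency chain
+   than plain Horner.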
*/
+static inline float32x4_t
+eval_poly (float32x4_t z, const struct data *d)
+{
+  float32x4_t z2 = vmulq_f32 (z, z);
+#if WANT_SIMD_EXCEPT
+  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
+     If fp exceptions are to be triggered correctly,
+     sidestep this by fixing such lanes to 0.  */
+  uint32x4_t will_uflow
+      = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
+  if (unlikely (v_any_u32 (will_uflow)))
+    z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
+#endif
+  float32x4_t z4 = vmulq_f32 (z2, z2);
+  return v_estrin_5_f32 (z, z2, z4, d->poly);
+}
+
+/* Fast implementation of AdvSIMD tanf.
+   Maximum error is 3.45 ULP:
+   __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+                             want 0x1.ff9850p-1.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t special_arg = x;
+
+  /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
+     range reduction.  */
+#if WANT_SIMD_EXCEPT
+  uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+  /* If fp exceptions are to be triggered correctly, also special-case tiny
+     input, as this would lead to underflow later. Fix any special lanes to 1
+     to prevent any exceptions being triggered.  */
+  uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh);
+  if (unlikely (v_any_u32 (special)))
+    x = vbslq_f32 (special, v_f32 (1.0f), x);
+#else
+  /* Otherwise, special-case large and special values.  */
+  uint32x4_t special = vcageq_f32 (x, d->range_val);
+#endif
+
+  /* n = rint(x/(pi/2)).  */
+  float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
+  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
+  float32x4_t n = vsubq_f32 (q, d->shift);
+  /* Determine if x lives in an interval where |tan(x)| grows to infinity.  */
+  uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
+
+  /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4).  */
+  float32x4_t r;
+  r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
+  r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
+  r = vfmaq_laneq_f32 (r, n, pi_consts, 2);
+
+  /* If x lives in an interval where |tan(x)|
+     - is finite, then use a polynomial approximation of the form
+       tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+     - grows to infinity then use symmetries of tangent and the identity
+       tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use
+       the same polynomial approximation of tan as above.  */
+
+  /* Invert sign of r if odd quadrant.  */
+  float32x4_t z = vmulq_f32 (r, vbslq_f32 (pred_alt, v_f32 (-1), v_f32 (1)));
+
+  /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4].  */
+  float32x4_t z2 = vmulq_f32 (r, r);
+  float32x4_t p = eval_poly (z2, d);
+  float32x4_t y = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+  /* Compute reciprocal and apply if required.
*/ + float32x4_t inv_y = vdivq_f32 (v_f32 (1.0f), y); + + if (unlikely (v_any_u32 (special))) + return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special); + return vbslq_f32 (pred_alt, inv_y, y); +} + +HALF_WIDTH_ALIAS_F1 (tan) + +TEST_SIG (V, F, 1, tan, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (tan), 2.96) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000) diff --git a/math/aarch64/advsimd/tanh.c b/math/aarch64/advsimd/tanh.c new file mode 100644 index 000000000000..3dc6e5527ffc --- /dev/null +++ b/math/aarch64/advsimd/tanh.c @@ -0,0 +1,67 @@ +/* + * Double-precision vector tanh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1_inline.h" + +static const struct data +{ + struct v_expm1_data d; + uint64x2_t thresh, tiny_bound; +} data = { + .d = V_EXPM1_DATA, + .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */ + /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ + .thresh = V2 (0x01f241bf835f9d5f), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t q, float64x2_t qp2, + uint64x2_t special) +{ + return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special); +} + +/* Vector approximation for double-precision tanh(x), using a simplified + version of expm1. The greatest observed error is 2.70 ULP: + _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3 + want -0x1.be5452a6459fbp-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + float64x2_t u = x; + + /* Trigger special-cases for tiny, boring and infinity/NaN. */ + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh); +#if WANT_SIMD_EXCEPT + /* To trigger fp exceptions correctly, set special lanes to a neutral value. + They will be fixed up later by the special-case handler. */ + if (unlikely (v_any_u64 (special))) + u = v_zerofy_f64 (u, special); +#endif + + u = vaddq_f64 (u, u); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float64x2_t q = expm1_inline (u, &d->d); + float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0)); + + if (unlikely (v_any_u64 (special))) + return special_case (x, q, qp2, special); + return vdivq_f64 (q, qp2); +} + +TEST_SIG (V, D, 1, tanh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (tanh), 2.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) diff --git a/math/aarch64/advsimd/tanhf.c b/math/aarch64/advsimd/tanhf.c new file mode 100644 index 000000000000..18fe93c7e7ba --- /dev/null +++ b/math/aarch64/advsimd/tanhf.c @@ -0,0 +1,81 @@ +/* + * Single-precision vector tanh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+  struct v_expm1f_data expm1f_consts;
+  uint32x4_t boring_bound, large_bound;
+} data = {
+  .expm1f_consts = V_EXPM1F_DATA,
+  /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative).  */
+  .boring_bound = V4 (0x41102cb3),
+  .large_bound = V4 (0x7f800000),
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+              float32x4_t q, uint32x4_t special)
+{
+  return v_call_f32 (
+      tanhf, x,
+      vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+      special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified
+   version of expm1f. The maximum error is 2.58 ULP:
+   _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
+                                 want 0x1.f9ba08p-5.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+  uint32x4_t sign = veorq_u32 (ix, iax);
+  uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
+  /* expm1 exponent bias is 1.0f reinterpreted to int.  */
+  float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+      sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
+
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered properly, set all special and boring
+     lanes to 0, which will trigger no exceptions, and fix them up later.  */
+  uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
+                                  vcltq_u32 (iax, v_u32 (0x34000000)));
+  x = v_zerofy_f32 (x, is_boring);
+  if (unlikely (v_any_u32 (special)))
+    x = v_zerofy_f32 (x, special);
+#else
+  uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
+#endif
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+
+  if (unlikely (v_any_u32 (special)))
+    return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+                         special);
+
+  float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+  return vbslq_f32 (is_boring, boring, y);
+}
+
+HALF_WIDTH_ALIAS_F1 (tanh)
+
+TEST_SIG (V, F, 1, tanh, -10.0, 10.0)
+TEST_ULP (V_NAME_F1 (tanh), 2.09)
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
+TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
diff --git a/math/aarch64/advsimd/tanpi.c b/math/aarch64/advsimd/tanpi.c
new file mode 100644
index 000000000000..16de00ad5556
--- /dev/null
+++ b/math/aarch64/advsimd/tanpi.c
@@ -0,0 +1,88 @@
+/*
+ * Double-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpi_data
+{
+  float64x2_t c0, c2, c4, c6, c8, c10, c12;
+  double c1, c3, c5, c7, c9, c11, c13, c14;
+} tanpi_data = {
+  /* Coefficients for tan(pi * x) computed with fpminimax
+     on [ 0x1p-1022 0x1p-2 ]
+     approx rel error: 0x1.7eap-55
+     approx abs error: 0x1.7eap-55.  */
+  .c0 = V2 (0x1.921fb54442d18p1), /* pi.
*/
+  .c1 = 0x1.4abbce625be52p3,    .c2 = V2 (0x1.466bc6775b0f9p5),
+  .c3 = 0x1.45fff9b426f5ep7,    .c4 = V2 (0x1.45f4730dbca5cp9),
+  .c5 = 0x1.45f3265994f85p11,   .c6 = V2 (0x1.45f4234b330cap13),
+  .c7 = 0x1.45dca11be79ebp15,   .c8 = V2 (0x1.47283fc5eea69p17),
+  .c9 = 0x1.3a6d958cdefaep19,   .c10 = V2 (0x1.927896baee627p21),
+  .c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27),
+  .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32,
+};
+
+/* Approximation for double-precision vector tanpi(x)
+   The maximum error is 3.06 ULP:
+   _ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
+                                        want -0x1.fa30112702c95p+3.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x)
+{
+  const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
+
+  float64x2_t n = vrndnq_f64 (x);
+
+  /* inf produces nan that propagates.  */
+  float64x2_t xr = vsubq_f64 (x, n);
+  float64x2_t ar = vabdq_f64 (x, n);
+  uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25));
+  float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar);
+
+  /* Order-14 pairwise Horner.  */
+  float64x2_t r2 = vmulq_f64 (r, r);
+  float64x2_t r4 = vmulq_f64 (r2, r2);
+
+  float64x2_t c_1_3 = vld1q_f64 (&d->c1);
+  float64x2_t c_5_7 = vld1q_f64 (&d->c5);
+  float64x2_t c_9_11 = vld1q_f64 (&d->c9);
+  float64x2_t c_13_14 = vld1q_f64 (&d->c13);
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1);
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1);
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1);
+  float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0);
+
+  float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1);
+  p = vfmaq_f64 (p1011, r4, p);
+  p = vfmaq_f64 (p89, r4, p);
+  p = vfmaq_f64 (p67, r4, p);
+  p = vfmaq_f64 (p45, r4, p);
+  p = vfmaq_f64 (p23, r4, p);
+  p = vfmaq_f64 (p01, r4, p);
+  p = vmulq_f64 (r, p);
+
+  float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p);
+  float64x2_t y = vbslq_f64 (flip, p_recip, p);
+
+  uint64x2_t sign
+      = veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar));
+  return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (V_NAME_D1 (tanpi))
+TEST_ULP (V_NAME_D1 (tanpi), 2.57)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
+TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000)
+#endif
diff --git a/math/aarch64/advsimd/tanpif.c b/math/aarch64/advsimd/tanpif.c
new file mode 100644
index 000000000000..7bd6d206819f
--- /dev/null
+++ b/math/aarch64/advsimd/tanpif.c
@@ -0,0 +1,70 @@
+/*
+ * Single-precision vector tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpif_data
+{
+  float32x4_t c0, c2, c4, c6;
+  float c1, c3, c5, c7;
+} tanpif_data = {
+  /* Coefficients for tan(pi * x).
*/ + .c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f), + .c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f, + .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f, +}; + +/* Approximation for single-precision vector tanpi(x) + The maximum error is 3.34 ULP: + _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2 + want 0x1.f70aa6p+2. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x) +{ + const struct v_tanpif_data *d = ptr_barrier (&tanpif_data); + + float32x4_t n = vrndnq_f32 (x); + + /* inf produces nan that propagates. */ + float32x4_t xr = vsubq_f32 (x, n); + float32x4_t ar = vabdq_f32 (x, n); + uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f)); + float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar); + + /* Order-7 pairwise Horner polynomial evaluation scheme. */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + + float32x4_t odd_coeffs = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3); + float32x4_t p = vfmaq_f32 (p45, r4, p67); + p = vfmaq_f32 (p23, r4, p); + p = vfmaq_f32 (p01, r4, p); + + p = vmulq_f32 (r, p); + float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p); + float32x4_t y = vbslq_f32 (flip, p_recip, p); + + uint32x4_t sign + = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar)); + return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign)); +} + +HALF_WIDTH_ALIAS_F1 (tanpi) + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (V_NAME_F1 (tanpi)) +TEST_ULP (V_NAME_F1 (tanpi), 2.84) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000) +#endif diff --git a/math/aarch64/advsimd/v_expf_inline.h b/math/aarch64/advsimd/v_expf_inline.h new file mode 100644 index 000000000000..797d217820c3 --- /dev/null +++ b/math/aarch64/advsimd/v_expf_inline.h @@ -0,0 +1,58 @@ +/* + * Helper for single-precision routines which calculate exp(ax) and do not + * need special-case handling + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_EXPF_INLINE_H +#define MATH_V_EXPF_INLINE_H + +#include "v_math.h" + +struct v_expf_data +{ + float ln2_hi, ln2_lo, c0, c2; + float32x4_t inv_ln2, c1, c3, c4; + /* asuint(1.0f). */ + uint32x4_t exponent_bias; +}; + +/* maxerr: 1.45358 +0.5 ulp. */ +#define V_EXPF_DATA \ + { \ + .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \ + .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \ + } + +static inline float32x4_t +v_expf_inline (float32x4_t x, const struct v_expf_data *d) +{ + /* Helper routine for calculating exp(ax). + Copied from v_expf.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + ax = ln2*n + r, with r in [-ln2/2, ln2/2]. 
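+
+     As a rough worked example of the decomposition: for ax = 1.0f,
+     n = rint (1.0 * inv_ln2) = 1 and r = 1.0 - ln2 ~= 0.30685, so the
+     result is reconstructed as 2^1 * (1 + poly (0.30685)) ~= 2.71828.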
*/
+  float32x4_t ax = vabsq_f32 (x);
+  float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+  float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+  float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+  r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+  uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+  float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+  /* Custom order-4 Estrin avoids building high order monomial.  */
+  float32x4_t r2 = vmulq_f32 (r, r);
+  float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+  float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
+  q = vfmaq_f32 (q, p, r2);
+  p = vmulq_f32 (d->c4, r);
+  float32x4_t poly = vfmaq_f32 (p, q, r2);
+  return vfmaq_f32 (scale, poly, scale);
+}
+
+#endif // MATH_V_EXPF_INLINE_H
diff --git a/math/aarch64/advsimd/v_expm1_inline.h b/math/aarch64/advsimd/v_expm1_inline.h
new file mode 100644
index 000000000000..82d2e9415d93
--- /dev/null
+++ b/math/aarch64/advsimd/v_expm1_inline.h
@@ -0,0 +1,86 @@
+/*
+ * Helper for double-precision routines which calculate exp(x) - 1 and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef MATH_V_EXPM1_INLINE_H
+#define MATH_V_EXPM1_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1_data
+{
+  float64x2_t c2, c4, c6, c8;
+  float64x2_t invln2;
+  int64x2_t exponent_bias;
+  double c1, c3, c5, c7, c9, c10;
+  double ln2[2];
+};
+
+/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2].  */
+#define V_EXPM1_DATA \
+  { \
+    .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
+    .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
+    .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
+    .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
+    .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
+    .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
+    .invln2 = V2 (0x1.71547652b82fep0), \
+    .exponent_bias = V2 (0x3ff0000000000000), \
+  }
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct v_expm1_data *d)
+{
+  /* Helper routine for calculating exp(x) - 1.  */
+
+  float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
+  float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
+  int64x2_t i = vcvtq_s64_f64 (n);
+  float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+  f = vfmsq_laneq_f64 (f, n, ln2, 1);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+     x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).
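+     The leading coefficient of P is 1/2 (the v_f64 (0.5) literal below),
+     since expm1(f) = f + f^2/2 + f^3/6 + ...; the stored c1 ~= 1/6
+     continues the same series.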
*/ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t lane_consts_13 = vld1q_f64 (&d->c1); + float64x2_t lane_consts_57 = vld1q_f64 (&d->c5); + float64x2_t lane_consts_910 = vld1q_f64 (&d->c9); + float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1); + float64x2_t p03 = vfmaq_f64 (p01, f2, p23); + float64x2_t p47 = vfmaq_f64 (p45, f2, p67); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0); + float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1); + p = vfmaq_f64 (p47, f4, p); + p = vfmaq_f64 (p03, f4, p); + + p = vfmaq_f64 (f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); + float64x2_t t = vreinterpretq_f64_s64 (u); + + /* expm1(x) ~= p * t + (t - 1). */ + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); +} + +#endif // MATH_V_EXPM1_INLINE_H diff --git a/math/aarch64/advsimd/v_expm1f_inline.h b/math/aarch64/advsimd/v_expm1f_inline.h new file mode 100644 index 000000000000..463b07aa7705 --- /dev/null +++ b/math/aarch64/advsimd/v_expm1f_inline.h @@ -0,0 +1,62 @@ +/* + * Helper for single-precision routines which calculate exp(x) - 1 and do not + * need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_EXPM1F_INLINE_H +#define MATH_V_EXPM1F_INLINE_H + +#include "v_math.h" + +struct v_expm1f_data +{ + float32x4_t c0, c2; + int32x4_t exponent_bias; + float c1, c3, inv_ln2, c4; + float ln2_hi, ln2_lo; +}; + +/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2, + log(2)/2]. Exponent bias is asuint(1.0f). */ +#define V_EXPM1F_DATA \ + { \ + .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \ + .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ + .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + } + +static inline float32x4_t +expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) +{ + /* Helper routine for calculating exp(x) - 1. */ + + float32x2_t ln2 = vld1_f32 (&d->ln2_hi); + float32x4_t lane_consts = vld1q_f32 (&d->c1); + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2)); + int32x4_t i = vcvtq_s32_f32 (j); + float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0); + f = vfmsq_lane_f32 (f, j, ln2, 1); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */ + float32x4_t f2 = vmulq_f32 (f, f); + float32x4_t f4 = vmulq_f32 (f2, f2); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1); + float32x4_t p = vfmaq_f32 (p01, f2, p23); + p = vfmaq_laneq_f32 (p, f4, lane_consts, 3); + p = vfmaq_f32 (f, f2, p); + + /* t = 2^i. */ + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); + float32x4_t t = vreinterpretq_f32_s32 (u); + /* expm1(x) ~= p * t + (t - 1). 
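+     This form is deliberate: t - 1 is computed exactly, and for i = 0
+     (t = 1) the expression collapses to p, so no accuracy is lost for
+     arguments that reduce with no scaling.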
*/ + return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); +} + +#endif // MATH_V_EXPM1F_INLINE_H diff --git a/math/aarch64/advsimd/v_log1p_inline.h b/math/aarch64/advsimd/v_log1p_inline.h new file mode 100644 index 000000000000..ef906ae4b603 --- /dev/null +++ b/math/aarch64/advsimd/v_log1p_inline.h @@ -0,0 +1,119 @@ +/* + * Helper for vector double-precision routines which calculate log(1 + x) and + * do not need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef MATH_V_LOG1P_INLINE_H +#define MATH_V_LOG1P_INLINE_H + +#include "v_math.h" + +struct v_log1p_data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16; + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; + int64x2_t one_top; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; + double ln2[2]; +}; + +/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */ +#define V_LOG1P_CONSTANTS_TABLE \ + { \ + .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \ + .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \ + .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \ + .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \ + .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \ + .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \ + .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \ + .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \ + .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \ + .c18 = -0x1.cfa7385bdb37ep-6, \ + .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \ + .hf_rt2_top = V2 (0x3fe6a09e00000000), \ + .one_m_hf_rt2_top = V2 (0x00095f6200000000), \ + .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ + } + +#define BottomMask v_u64 (0xffffffff) + +static inline float64x2_t +eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1718 = vld1q_f64 (&d->c17); + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1); + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0); + float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1); + p = vfmaq_f64 (p1415, m2, p); + p = vfmaq_f64 (p1213, m2, p); + p = vfmaq_f64 (p1011, m2, p); + p = vfmaq_f64 (p89, m2, p); + p = vfmaq_f64 (p67, m2, p); + p = vfmaq_f64 (p45, m2, p); + p = vfmaq_f64 (p23, m2, p); + return vfmaq_f64 (p01, m2, p); +} + +static inline float64x2_t +log1p_inline (float64x2_t x, const struct v_log1p_data *d) +{ + /* Helper for calculating log(x + 1): + - No special-case handling - this should be dealt with by the caller. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using v_sel, for improved accuracy when the argument to log1p is close + to 0. 
This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. */ + float64x2_t m = vaddq_f64 (x, v_f64 (1.0)); + uint64x2_t mi = vreinterpretq_u64_f64 (m); + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); + + int64x2_t ki + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); + float64x2_t k = vcvtq_f64_s64 (ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0)); + + /* Correction term c/m. */ + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m); + +#ifndef WANT_V_LOG1P_K0_SHORTCUT +# error \ + "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +#elif WANT_V_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ + uint64x2_t k0 = vceqzq_f64 (k); + cm = v_zerofy_f64 (cm, k0); + f = vbslq_f64 (k0, x, f); +#endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t p = eval_poly (f, f2, d); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1); + float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0); + return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); +} + +#endif // MATH_V_LOG1P_INLINE_H diff --git a/math/aarch64/advsimd/v_log1pf_inline.h b/math/aarch64/advsimd/v_log1pf_inline.h new file mode 100644 index 000000000000..e81fa24486ae --- /dev/null +++ b/math/aarch64/advsimd/v_log1pf_inline.h @@ -0,0 +1,94 @@ +/* + * Helper for single-precision routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_LOG1PF_INLINE_H +#define MATH_V_LOG1PF_INLINE_H + +#include "v_math.h" +#include "v_poly_f32.h" + +struct v_log1pf_data +{ + uint32x4_t four; + int32x4_t three_quarters; + float c0, c3, c5, c7; + float32x4_t c4, c6, c1, c2, ln2; +}; + +/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients + (1, -0.5) are not stored as they can be generated more efficiently. */ +#define V_LOG1PF_CONSTANTS_TABLE \ + { \ + .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \ + .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \ + .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \ + .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \ + .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ + .three_quarters = V4 (0x3f400000) \ + } + +static inline float32x4_t +eval_poly (float32x4_t m, const struct v_log1pf_data *d) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. 
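+     The leading series terms m - m^2/2 are synthesised rather than read
+     from the table: the 1 is implicit in the fma that adds m back in, and
+     the -0.5 is the literal folded into q below (matching the note above
+     that coefficients (1, -0.5) are not stored).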
*/ + float32x4_t c0357 = vld1q_f32 (&d->c0); + float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0); + float32x4_t m2 = vmulq_f32 (m, m); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1); + float32x4_t p = vfmaq_f32 (p45, m2, p67); + p = vfmaq_f32 (p23, m2, p); + p = vfmaq_f32 (d->c1, m, p); + p = vmulq_f32 (m2, p); + p = vfmaq_f32 (m, m2, p); + return vfmaq_f32 (p, m2, q); +} + +static inline float32x4_t +log1pf_inline (float32x4_t x, const struct v_log1pf_data *d) +{ + /* Helper for calculating log(x + 1). */ + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + int32x4_t k + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), + v_s32 (0xff800000)); + uint32x4_t ku = vreinterpretq_u32_s32 (k); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); + + /* Scale x by exponent manipulation. */ + float32x4_t m_scale + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); + + /* Evaluate polynomial on the reduced interval. */ + float32x4_t p = eval_poly (m_scale, d); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); + + /* Apply the scaling back. */ + return vfmaq_f32 (p, scale_back, d->ln2); +} + +#endif // MATH_V_LOG1PF_INLINE_H diff --git a/math/aarch64/advsimd/v_log_inline.h b/math/aarch64/advsimd/v_log_inline.h new file mode 100644 index 000000000000..770f9e81c195 --- /dev/null +++ b/math/aarch64/advsimd/v_log_inline.h @@ -0,0 +1,104 @@ +/* + * Double-precision vector log(x) function - inline version + * + * Copyright (c) 2019-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "math_config.h"
+
+#ifndef V_LOG_INLINE_POLY_ORDER
+# error Cannot use inline log helper without specifying poly order (options are 4 or 5)
+#endif
+
+#if V_LOG_INLINE_POLY_ORDER == 4
+# define POLY \
+  { \
+    V2 (-0x1.ffffffffcbad3p-2), V2 (0x1.555555578ed68p-2), \
+    V2 (-0x1.0000d3a1e7055p-2), V2 (0x1.999392d02a63ep-3) \
+  }
+#elif V_LOG_INLINE_POLY_ORDER == 5
+# define POLY \
+  { \
+    V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), \
+    V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), \
+    V2 (-0x1.554e550bd501ep-3) \
+  }
+#else
+# error Can only choose order 4 or 5 for log poly
+#endif
+
+struct v_log_inline_data
+{
+  float64x2_t poly[V_LOG_INLINE_POLY_ORDER];
+  float64x2_t ln2;
+  uint64x2_t off, sign_exp_mask;
+};
+
+#define V_LOG_CONSTANTS \
+  { \
+    .poly = POLY, .ln2 = V2 (0x1.62e42fefa39efp-1), \
+    .sign_exp_mask = V2 (0xfff0000000000000), .off = V2 (0x3fe6900900000000) \
+  }
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t logc;
+};
+
+static inline struct entry
+log_lookup (uint64x2_t i)
+{
+  /* Since N is a power of 2, n % N = n & (N - 1).  */
+  struct entry e;
+  uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+  e.invc = vuzp1q_f64 (e0, e1);
+  e.logc = vuzp2q_f64 (e0, e1);
+  return e;
+}
+
+static inline float64x2_t
+v_log_inline (float64x2_t x, const struct v_log_inline_data *d)
+{
+  float64x2_t z, r, r2, p, y, kd, hi;
+  uint64x2_t ix, iz, tmp;
+  int64x2_t k;
+  struct entry e;
+
+  ix = vreinterpretq_u64_f64 (x);
+
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = vsubq_u64 (ix, d->off);
+  k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift.  */
+  iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+  z = vreinterpretq_f64_u64 (iz);
+  e = log_lookup (tmp);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  kd = vcvtq_f64_s64 (k);
+
+  /* hi = r + log(c) + k*Ln2.  */
+  hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  r2 = vmulq_f64 (r, r);
+  y = vfmaq_f64 (A (2), A (3), r);
+  p = vfmaq_f64 (A (0), A (1), r);
+#if V_LOG_INLINE_POLY_ORDER == 5
+  y = vfmaq_f64 (y, A (4), r2);
+#endif
+  y = vfmaq_f64 (p, y, r2);
+
+  return vfmaq_f64 (hi, y, r2);
+}
diff --git a/math/aarch64/advsimd/v_math.h b/math/aarch64/advsimd/v_math.h
new file mode 100644
index 000000000000..75cd71cc87a7
--- /dev/null
+++ b/math/aarch64/advsimd/v_math.h
@@ -0,0 +1,202 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#if !__aarch64__
+# error "Cannot build without AArch64"
+#endif
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f
+#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun
+
+#if USE_GLIBC_ABI
+
+# define HALF_WIDTH_ALIAS_F1(fun) \
+  float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \
+  { \
+    return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \
+  }
+
+# define HALF_WIDTH_ALIAS_F2(fun) \
+  float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \
+  { \
+    return vget_low_f32 ( \
+        _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \
+  }
+
+#else
+# define HALF_WIDTH_ALIAS_F1(fun)
+# define HALF_WIDTH_ALIAS_F2(fun)
+#endif
+
+#include <stdint.h>
+#include "math_config.h"
+#include <arm_neon.h>
+
+/* Shorthand helpers for declaring constants.  */
+#define V2(X) \
+  { \
+    X, X \
+  }
+#define V4(X) \
+  { \
+    X, X, X, X \
+  }
+#define V8(X) \
+  { \
+    X, X, X, X, X, X, X, X \
+  }
+
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+  return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
+}
+
+static inline int
+v_lanes32 (void)
+{
+  return 4;
+}
+
+static inline float32x4_t
+v_f32 (float x)
+{
+  return (float32x4_t) V4 (x);
+}
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+  return (uint32x4_t) V4 (x);
+}
+static inline int32x4_t
+v_s32 (int32_t x)
+{
+  return (int32x4_t) V4 (x);
+}
+
+/* true if any element of a v_cond result is non-zero.  */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+  /* assume elements in x are either 0 or -1u.  */
+  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+static inline int
+v_any_u32h (uint32x2_t x)
+{
+  return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
+}
+static inline float32x4_t
+v_lookup_f32 (const float *tab, uint32x4_t idx)
+{
+  return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline uint32x4_t
+v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
+{
+  return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+}
+static inline float32x4_t
+v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
+{
+  return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+                        p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };
+}
+static inline float32x4_t
+v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
+             float32x4_t y, uint32x4_t p)
+{
+  return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],
+                        p[1] ? f (x1[1], x2[1]) : y[1],
+                        p[2] ? f (x1[2], x2[2]) : y[2],
+                        p[3] ? f (x1[3], x2[3]) : y[3] };
+}
+static inline float32x4_t
+v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
+{
+  return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
+}
+
+static inline int
+v_lanes64 (void)
+{
+  return 2;
+}
+static inline float64x2_t
+v_f64 (double x)
+{
+  return (float64x2_t) V2 (x);
+}
+static inline uint64x2_t
+v_u64 (uint64_t x)
+{
+  return (uint64x2_t) V2 (x);
+}
+static inline int64x2_t
+v_s64 (int64_t x)
+{
+  return (int64x2_t) V2 (x);
+}
+
+/* true if any element of a v_cond result is non-zero.  */
+static inline int
+v_any_u64 (uint64x2_t x)
+{
+  /* assume elements in x are either 0 or -1u.
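+     Under that assumption the pairwise add below cannot wrap to zero:
+     0 + 0 = 0, 0 + (-1) = -1 and (-1) + (-1) = -2 (mod 2^64), so the sum
+     is zero exactly when both lanes are zero.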
*/ + return vpaddd_u64 (x) != 0; +} +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) +{ + return (float64x2_t){ tab[idx[0]], tab[idx[1]] }; +} +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) +{ + return (uint64x2_t){ tab[idx[0]], tab[idx[1]] }; +} +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; +} + +static inline float64x2_t +v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2, + float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1h = x1[1]; + double x2h = x2[1]; + if (likely (p[0])) + y[0] = f (x1[0], x2[0]); + if (likely (p1)) + y[1] = f (x1h, x2h); + return y; +} +static inline float64x2_t +v_zerofy_f64 (float64x2_t x, uint64x2_t mask) +{ + return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask)); +} + +#endif diff --git a/math/aarch64/advsimd/v_poly_f32.h b/math/aarch64/advsimd/v_poly_f32.h new file mode 100644 index 000000000000..9a9c5c1ac15b --- /dev/null +++ b/math/aarch64/advsimd/v_poly_f32.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on single-precision AdvSIMD input, using + * various schemes. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_POLY_ADVSIMD_F32_H +#define MATH_POLY_ADVSIMD_F32_H + +#include <arm_neon.h> + +/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f32. */ +#define VTYPE float32x4_t +#define FMA(x, y, z) vfmaq_f32 (z, x, y) +#define VWRAP(f) v_##f##_f32 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/math/aarch64/advsimd/v_poly_f64.h b/math/aarch64/advsimd/v_poly_f64.h new file mode 100644 index 000000000000..4331bfbd03b0 --- /dev/null +++ b/math/aarch64/advsimd/v_poly_f64.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on double-precision AdvSIMD input, using + * various schemes. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_POLY_ADVSIMD_F64_H +#define MATH_POLY_ADVSIMD_F64_H + +#include <arm_neon.h> + +/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f64. */ +#define VTYPE float64x2_t +#define FMA(x, y, z) vfmaq_f64 (z, x, y) +#define VWRAP(f) v_##f##_f64 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/math/aarch64/advsimd/v_sincos_common.h b/math/aarch64/advsimd/v_sincos_common.h new file mode 100644 index 000000000000..14227d9339a8 --- /dev/null +++ b/math/aarch64/advsimd/v_sincos_common.h @@ -0,0 +1,86 @@ +/* + * Core approximation for double-precision vector sincos + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f64.h" + +static const struct v_sincos_data +{ + float64x2_t sin_poly[7], cos_poly[6], pio2[3]; + float64x2_t inv_pio2, shift, range_val; +} v_sincos_data = { + .inv_pio2 = V2 (0x1.45f306dc9c882p-1), + .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26), + V2 (0x1.1a62633145c07p-54) }, + .shift = V2 (0x1.8p52), + .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. 
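+                These are the odd-order coefficients, i.e.
+                sin(r) ~ r + r^3 (s0 + s1 r^2 + ...), so the first entry
+                is close to -1/6 = -0x1.5555...p-3.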
*/
+                V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+                V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+                V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+                V2 (-0x1.9e9540300a1p-41) },
+  .cos_poly = { /* Computed using Remez in [-pi/4, pi/4].  */
+                V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10),
+                V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22),
+                V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) },
+  .range_val = V2 (0x1p23), };
+
+static inline uint64x2_t
+check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d)
+{
+  return vcagtq_f64 (x, d->range_val);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+   one function call, using shared argument reduction and separate polynomials.
+   Largest observed error is for sin, 3.22 ULP:
+   v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+                                        want -0x1.ffe9537d5dbb4p-3.  */
+static inline float64x2x2_t
+v_sincos_inline (float64x2_t x, const struct v_sincos_data *d)
+{
+  /* q = nearest integer to 2 * x / pi.  */
+  float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift);
+  int64x2_t n = vcvtq_s64_f64 (q);
+
+  /* Use q to reduce x to r in [-pi/4, pi/4], by:
+     r = x - q * pi/2, in extended precision.  */
+  float64x2_t r = x;
+  r = vfmsq_f64 (r, q, d->pio2[0]);
+  r = vfmsq_f64 (r, q, d->pio2[1]);
+  r = vfmsq_f64 (r, q, d->pio2[2]);
+
+  float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2;
+
+  /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2).  */
+  float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly);
+  s = vfmaq_f64 (r, r3, s);
+
+  /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
+  float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly);
+  c = vfmaq_f64 (v_f64 (-0.5), r2, c);
+  c = vfmaq_f64 (v_f64 (1), r2, c);
+
+  /* If odd quadrant, swap cos and sin.  */
+  uint64x2_t swap = vtstq_s64 (n, v_s64 (1));
+  float64x2_t ss = vbslq_f64 (swap, c, s);
+  float64x2_t cc = vbslq_f64 (swap, s, c);
+
+  /* Fix signs according to quadrant.
+     ss = asdouble(asuint64(ss) ^ ((n & 2) << 62))
+     cc = asdouble(asuint64(cc) ^ (((n + 1) & 2) << 62)).  */
+  uint64x2_t sin_sign
+      = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62);
+  uint64x2_t cos_sign = vshlq_n_u64 (
+      vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)),
+      62);
+  ss = vreinterpretq_f64_u64 (
+      veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign));
+  cc = vreinterpretq_f64_u64 (
+      veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign));
+
+  return (float64x2x2_t){ ss, cc };
+}
diff --git a/math/aarch64/advsimd/v_sincosf_common.h b/math/aarch64/advsimd/v_sincosf_common.h
new file mode 100644
index 000000000000..7c29eded14d6
--- /dev/null
+++ b/math/aarch64/advsimd/v_sincosf_common.h
@@ -0,0 +1,84 @@
+/*
+ * Core approximation for single-precision vector sincos
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+const static struct v_sincosf_data
+{
+  float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val;
+} v_sincosf_data = {
+  .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4].  */
+                V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) },
+  .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4].
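+                These feed cos(r) ~ 1 - r^2/2 + r^4 (c0 + c1 r^2 + ...),
+                so the first entry is close to 1/24 = 0x1.555...p-5.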
*/
+		V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) },
+  .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) },
+  .inv_pio2 = V4 (0x1.45f306p-1f),
+  .shift = V4 (0x1.8p23),
+  .range_val = V4 (0x1p20),
+};
+
+static inline uint32x4_t
+check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d)
+{
+  return vcagtq_f32 (x, d->range_val);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+   one function call, using shared argument reduction and separate low-order
+   polynomials.
+   Worst-case error for sin is 1.67 ULP:
+   v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+   Worst-case error for cos is 1.81 ULP:
+   v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6.  */
+static inline float32x4x2_t
+v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d)
+{
+  /* n = rint ( x / (pi/2) ).  */
+  float32x4_t shift = d->shift;
+  float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2);
+  q = vsubq_f32 (q, shift);
+  int32x4_t n = vcvtq_s32_f32 (q);
+
+  /* Reduce x such that r is in [ -pi/4, pi/4 ].  */
+  float32x4_t r = x;
+  r = vfmsq_f32 (r, q, d->pio2[0]);
+  r = vfmsq_f32 (r, q, d->pio2[1]);
+  r = vfmsq_f32 (r, q, d->pio2[2]);
+
+  /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2).  */
+  float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2);
+  float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]);
+  s = vfmaq_f32 (d->poly_sin[0], r2, s);
+  s = vfmaq_f32 (r, r3, s);
+
+  /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
+  float32x4_t r4 = vmulq_f32 (r2, r2);
+  float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]);
+  float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]);
+  c = vfmaq_f32 (c, r4, p);
+  c = vfmaq_f32 (v_f32 (1), c, r2);
+
+  /* If odd quadrant, swap cos and sin.  */
+  uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1));
+  float32x4_t ss = vbslq_f32 (swap, c, s);
+  float32x4_t cc = vbslq_f32 (swap, s, c);
+
+  /* Fix signs according to quadrant.
+     ss = asfloat(asuint(ss) ^ ((n & 2) << 30))
+     cc = asfloat(asuint(cc) ^ (((n + 1) & 2) << 30)).  */
+  uint32x4_t sin_sign
+      = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30);
+  uint32x4_t cos_sign = vshlq_n_u32 (
+      vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)),
+      30);
+  ss = vreinterpretq_f32_u32 (
+      veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign));
+  cc = vreinterpretq_f32_u32 (
+      veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign));
+
+  return (float32x4x2_t){ ss, cc };
+}
diff --git a/math/aarch64/advsimd/v_sincospi_common.h b/math/aarch64/advsimd/v_sincospi_common.h
new file mode 100644
index 000000000000..438b141b9174
--- /dev/null
+++ b/math/aarch64/advsimd/v_sincospi_common.h
@@ -0,0 +1,64 @@
+/*
+ * Helper for double-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "v_poly_f64.h"
+
+static const struct v_sincospi_data
+{
+  float64x2_t poly[10], range_val;
+} v_sincospi_data = {
+  /* Polynomial coefficients generated using Remez algorithm,
+     see sinpi.sollya for details.
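   (v_sincosf_inline above leaves |x| >= range_val lanes to the caller; a
   minimal hypothetical wrapper - my_sincosf is not part of this patch -
   might clear the special lanes and patch them up per lane afterwards:

     float32x4x2_t
     my_sincosf (float32x4_t x)
     {
       const struct v_sincosf_data *d = &v_sincosf_data;
       uint32x4_t special = check_ge_rangeval (x, d);
       // Zero out-of-range lanes so the reduction stays well-defined;
       // a scalar fallback would then fix up those lanes.
       float32x4_t xm = vreinterpretq_f32_u32 (
	   vbicq_u32 (vreinterpretq_u32_f32 (x), special));
       return v_sincosf_inline (xm, d);
     })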
*/
+  .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+	    V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+	    V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+	    V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+	    V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+  .range_val = V2 (0x1p63),
+};
+
+/* Double-precision vector function allowing calculation of both sinpi and
+   cospi in one function call, using separate argument reduction and shared
+   low-order polynomials.
+   Approximation for vector double-precision sincospi(x).
+   Maximum Error 3.09 ULP:
+   _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+						want 0x1.fd54d0b327cf4p-1
+   Maximum Error 3.16 ULP:
+   _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+						want 0x1.fd2da484ff402p-1.  */
+static inline float64x2x2_t
+v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d)
+{
+  /* If r is odd, the sign of the result should be inverted for sinpi
+     and reintroduced for cospi.  */
+  uint64x2_t cmp = vcgeq_f64 (x, d->range_val);
+  uint64x2_t odd = vshlq_n_u64 (
+      vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63);
+
+  /* r = x - rint(x).  */
+  float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x));
+  /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2.  */
+  float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr));
+
+  /* Pairwise Horner approximation for y = sin(r * pi).  */
+  float64x2_t sr2 = vmulq_f64 (sr, sr);
+  float64x2_t sr4 = vmulq_f64 (sr2, sr2);
+  float64x2_t cr2 = vmulq_f64 (cr, cr);
+  float64x2_t cr4 = vmulq_f64 (cr2, cr2);
+
+  float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr);
+  float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr);
+
+  float64x2_t sinpix
+      = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd));
+
+  float64x2_t cospix
+      = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd));
+
+  return (float64x2x2_t){ sinpix, cospix };
+}
diff --git a/math/aarch64/advsimd/v_sincospif_common.h b/math/aarch64/advsimd/v_sincospif_common.h
new file mode 100644
index 000000000000..8d4177dd871e
--- /dev/null
+++ b/math/aarch64/advsimd/v_sincospif_common.h
@@ -0,0 +1,57 @@
+/*
+ * Helper for single-precision vector sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "v_math.h"
+#include "v_poly_f32.h"
+
+const static struct v_sincospif_data
+{
+  float32x4_t poly[6], range_val;
+} v_sincospif_data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+	    V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+  .range_val = V4 (0x1p31f),
+};
+
+/* Single-precision vector function allowing calculation of both sinpi and
+   cospi in one function call, using shared argument reduction and polynomials.
+   Worst-case error for sin is 3.04 ULP:
+   _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+   Worst-case error for cos is 3.18 ULP:
+   _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+ */
+static inline float32x4x2_t
+v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d)
+{
+  /* If r is odd, the sign of the result should be inverted for sinpi and
+     reintroduced for cospi.
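   (A scalar model of the shared sincospi reduction used above, with
   sinpi_poly standing in for the polynomial step - a hypothetical helper,
   for illustration only:

     double sr = x - round (x);              // r in [-1/2, 1/2]
     double cr = 0.5 - fabs (sr);            // cospi(x) = sinpi(cr)
     int odd = (int) (llround (x) & 1);      // quadrant parity
     double s = odd ? -sinpi_poly (sr) : sinpi_poly (sr);
     double c = odd ? -sinpi_poly (cr) : sinpi_poly (cr);

   round and llround round ties away from zero, matching vrndaq_f64 and
   vcvtaq_s64_f64.)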
*/
+  uint32x4_t cmp = vcgeq_f32 (x, d->range_val);
+  uint32x4_t odd = vshlq_n_u32 (
+      vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31);
+
+  /* r = x - rint(x).  */
+  float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x));
+  /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2.  */
+  float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr));
+
+  /* Pairwise Horner approximation for y = sin(r * pi).  */
+  float32x4_t sr2 = vmulq_f32 (sr, sr);
+  float32x4_t sr4 = vmulq_f32 (sr2, sr2);
+  float32x4_t cr2 = vmulq_f32 (cr, cr);
+  float32x4_t cr4 = vmulq_f32 (cr2, cr2);
+
+  float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr);
+  float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr);
+
+  float32x4_t sinpix
+      = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd));
+  float32x4_t cospix
+      = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd));
+
+  return (float32x4x2_t){ sinpix, cospix };
+}
diff --git a/math/aarch64/cospi_3u5.c b/math/aarch64/cospi_3u5.c
new file mode 100644
index 000000000000..4131f6c816a1
--- /dev/null
+++ b/math/aarch64/cospi_3u5.c
@@ -0,0 +1,98 @@
+/*
+ * Double-precision scalar cospi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f64.h"
+
+/* Taylor series coefficients for sin(pi * x).
+   C2 coefficient (originally ~=5.16771278) has been split into two parts:
+   C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278)
+   This change in magnitude reduces floating point rounding errors.
+   C2_hi is then reintroduced after the polynomial approximation.  */
+static const double poly[]
+    = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
+	-0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+	0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21,
+	-0x1.012a9870eeb7dp-25 };
+
+#define Shift 0x1.8p+52
+
+/* Approximation for scalar double-precision cospi(x).
+   Maximum error: 3.13 ULP:
+   cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1
+				want 0x1.fffffffffd16ep-1.  */
+double
+arm_math_cospi (double x)
+{
+  if (isinf (x) || isnan (x))
+    return __math_invalid (x);
+
+  double ax = asdouble (asuint64 (x) & ~0x8000000000000000);
+
+  /* Edge cases for when cospi should be exactly 1. (Integers)
+     0x1p53 is the limit for double precision to store any decimal places.  */
+  if (ax >= 0x1p53)
+    return 1;
+
+  /* If x is an integer, return +- 1, based upon if x is odd.  */
+  uint64_t m = (uint64_t) ax;
+  if (m == ax)
+    return (m & 1) ? -1 : 1;
+
+  /* For very small inputs, squaring r causes underflow.
+     Values below this threshold can be approximated via
+     cospi(x) ~= 1.  */
+  if (ax < 0x1p-63)
+    return 1;
+
+  /* Any non-integer values >= 0x1p51 will be int + 0.5.
+     These values should return exactly 0.  */
+  if (ax >= 0x1p51)
+    return 0;
+
+  /* n = rint(|x|).  */
+  double n = ax + Shift;
+  uint64_t sign = asuint64 (n) << 63;
+  n = n - Shift;
+
+  /* We know that cospi(x) = sinpi(0.5 - x)
+     range reduction and offset into sinpi range -1/2 .. 1/2
+     r = 0.5 - |x - rint(x)|.  */
+  double r = 0.5 - fabs (ax - n);
+
+  /* y = sin(pi * r).  */
+  double r2 = r * r;
+  double y = horner_9_f64 (r2, poly);
+  y = y * r;
+
+  /* Reintroduce C2_hi.  */
+  y = fma (-4 * r2, r, y);
+
+  /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will
+     always be positive, therefore the sign must be introduced based upon
+     whether x rounds to odd or even.  */
+  return asdouble (asuint64 (y) ^ sign);
+}
+
+#if WANT_EXPERIMENTAL_MATH
+double
+cospi (double x)
+{
+  return arm_math_cospi (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_cospi, 2.63)
+TEST_SYM_INTERVAL (arm_math_cospi, 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0.5, 0x1p51f, 10000)
+TEST_SYM_INTERVAL (arm_math_cospi, 0x1p51f, inf, 10000)
+#endif
diff --git a/math/aarch64/cospif_2u6.c b/math/aarch64/cospif_2u6.c
new file mode 100644
index 000000000000..eb5b75402a63
--- /dev/null
+++ b/math/aarch64/cospif_2u6.c
@@ -0,0 +1,93 @@
+/*
+ * Single-precision scalar cospi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* Taylor series coefficients for sin(pi * x).  */
+#define C0 0x1.921fb6p1f
+#define C1 -0x1.4abbcep2f
+#define C2 0x1.466bc6p1f
+#define C3 -0x1.32d2ccp-1f
+#define C4 0x1.50783p-4f
+#define C5 -0x1.e30750p-8f
+
+#define Shift 0x1.0p+23f
+
+/* Approximation for scalar single-precision cospi(x) - cospif.
+   Maximum error: 2.64 ULP:
+   cospif(0x1.37e844p-4) got 0x1.f16b3p-1
+			 want 0x1.f16b2ap-1.  */
+float
+arm_math_cospif (float x)
+{
+  if (isinf (x) || isnan (x))
+    return __math_invalidf (x);
+
+  float ax = asfloat (asuint (x) & ~0x80000000);
+
+  /* Edge cases for when cospif should be exactly +/- 1. (Integers)
+     0x1p23 is the limit for single precision to store any decimal places.  */
+  if (ax >= 0x1p24f)
+    return 1;
+
+  uint32_t m = roundf (ax);
+  if (m == ax)
+    return (m & 1) ? -1 : 1;
+
+  /* Any non-integer values >= 0x1p22f will be int + 0.5.
+     These values should return exactly 0.  */
+  if (ax >= 0x1p22f)
+    return 0;
+
+  /* For very small inputs, squaring r causes underflow.
+     Values below this threshold can be approximated via cospi(x) ~= 1 -
+     (pi*x).  */
+  if (ax < 0x1p-31f)
+    return 1 - (C0 * x);
+
+  /* n = rint(|x|).  */
+  float n = ax + Shift;
+  uint32_t sign = asuint (n) << 31;
+  n = n - Shift;
+
+  /* We know that cospi(x) = sinpi(0.5 - x)
+     range reduction and offset into sinpi range -1/2 .. 1/2
+     r = 0.5 - |x - rint(x)|.  */
+  float r = 0.5f - fabs (ax - n);
+
+  /* y = sin(pi * r).  */
+  float r2 = r * r;
+  float y = fmaf (C5, r2, C4);
+  y = fmaf (y, r2, C3);
+  y = fmaf (y, r2, C2);
+  y = fmaf (y, r2, C1);
+  y = fmaf (y, r2, C0);
+
+  /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will
+     always be positive, therefore the sign must be introduced based upon
+     whether x rounds to odd or even.
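   (The C2 split in the double-precision polynomial above can be checked
   by expanding the final fma: with y = r * P(r2),
     fma (-4 * r2, r, y) = r * (P(r2) - 4 * r2)
   so the effective cubic coefficient is C2_lo - 4 ~= -1.16771278 - 4
   = -5.16771278 ~= -pi^3/6, the true sinpi Taylor coefficient. Keeping
   only the ~1.17 part inside the polynomial reduces rounding error.)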
*/ + return asfloat (asuint (y * r) ^ sign); +} + +#if WANT_EXPERIMENTAL_MATH +float +cospif (float x) +{ + return arm_math_cospif (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_cospif, 2.15) +TEST_SYM_INTERVAL (arm_math_cospif, 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (arm_math_cospif, 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (arm_math_cospif, 0.5, 0x1p22f, 10000) +TEST_SYM_INTERVAL (arm_math_cospif, 0x1p22f, inf, 10000) +#endif diff --git a/math/aarch64/experimental/README.contributors b/math/aarch64/experimental/README.contributors new file mode 100644 index 000000000000..abb749485ba3 --- /dev/null +++ b/math/aarch64/experimental/README.contributors @@ -0,0 +1,16 @@ +Code in this sub-directory should follow the GNU Coding Standard, but it is +not expected to be upstreamed into glibc without modification, so +glibc-specific conventions need not be followed. + +The requirements for portable code apply to non-portable code with the +following differences: + +1. Worst-case ULP error should be encoded in filenames (e.g. sin_u35.c). There + are no specific restrictions on acceptable ULP error, but if functions + provide significantly less accuracy than portable equivalents then a clear + justification for inclusion should be stated in comments at the top of the + source file. Error bounds of the approximation should be clearly documented + in comments. + +2. Functions are assumed to support round-to-nearest mode by default, unless + stated; other rounding modes are not required to be provided. diff --git a/math/aarch64/experimental/acos_2u.c b/math/aarch64/experimental/acos_2u.c new file mode 100644 index 000000000000..062215c92248 --- /dev/null +++ b/math/aarch64/experimental/acos_2u.c @@ -0,0 +1,100 @@ +/* + * Double-precision acos(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "poly_scalar_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 +#define PiOver2 0x1.921fb54442d18p+0 +#define Pi 0x1.921fb54442d18p+1 +#define Small 0x3c90000000000000 /* 2^-53. */ +#define Small16 0x3c90 +#define QNaN 0x7ff8 + +/* Fast implementation of double-precision acos(x) based on polynomial + approximation of double-precision asin(x). + + For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct + rounding. + + For |x| in [Small, 0.5], use the trigonometric identity + + acos(x) = pi/2 - asin(x) + + and use an order 11 polynomial P such that the final approximation of asin + is an odd polynomial: asin(x) ~ x + x^3 * P(x^2). + + The largest observed error in this region is 1.18 ulps, + acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 + want 0x1.0d54d1985c069p+0. + + For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1 + + acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)) + + where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the + approximation of asin near 0. + + The largest observed error in this region is 1.52 ulps, + acos(0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 + want 0x1.edbbedf8a7d6cp-1. + + For x in [-1.0, -0.5], use this other identity to deduce the negative inputs + from their absolute value: acos(x) = pi - acos(-x). 
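   A quick sanity check of the negative-input identity with x = -0.5:
   Q(0.5) evaluates to ~pi/6 (since 2 Q(|x|) = acos(|x|) for x in
   [0.5, 1]), so acos(-0.5) = pi - 2 * (pi/6) = 2pi/3 ~= 2.0944, as
   expected.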
*/ +double +acos (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint64_t ia16 = ia >> 48; + double ax = asdouble (ia); + uint64_t sign = ix & ~AbsMask; + + /* Special values and invalid range. */ + if (unlikely (ia16 == QNaN)) + return x; + if (ia > One) + return __math_invalid (x); + if (ia16 < Small16) + return PiOver2 - x; + + /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5); + double z = ax < 0.5 ? ax : sqrt (z2); + + /* Use a single polynomial approximation P for both intervals. */ + double z4 = z2 * z2; + double z8 = z4 * z4; + double z16 = z8 * z8; + double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = fma (z * z2, p, z); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = pi - 2 Q(|x|), for -1.0 < x <= -0.5 + = 2 Q(|x|) , for -0.5 < x < 0.0. */ + if (ax < 0.5) + return PiOver2 - asdouble (asuint64 (p) | sign); + + return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p; +} + +TEST_SIG (S, D, 1, acos, -1.0, 1.0) +TEST_ULP (acos, 1.02) +TEST_INTERVAL (acos, 0, Small, 5000) +TEST_INTERVAL (acos, Small, 0.5, 50000) +TEST_INTERVAL (acos, 0.5, 1.0, 50000) +TEST_INTERVAL (acos, 1.0, 0x1p11, 50000) +TEST_INTERVAL (acos, 0x1p11, inf, 20000) +TEST_INTERVAL (acos, -0, -inf, 20000) diff --git a/math/aarch64/experimental/acosf_1u4.c b/math/aarch64/experimental/acosf_1u4.c new file mode 100644 index 000000000000..d207f5e89f26 --- /dev/null +++ b/math/aarch64/experimental/acosf_1u4.c @@ -0,0 +1,99 @@ +/* + * Single-precision acos(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "poly_scalar_f32.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define PiOver2f 0x1.921fb6p+0f +#define Pif 0x1.921fb6p+1f +#define Small 0x32800000 /* 2^-26. */ +#define Small12 0x328 +#define QNaN 0x7fc + +/* Fast implementation of single-precision acos(x) based on polynomial + approximation of single-precision asin(x). + + For x < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct + rounding. + + For |x| in [Small, 0.5], use the trigonometric identity + + acos(x) = pi/2 - asin(x) + + and use an order 4 polynomial P such that the final approximation of asin is + an odd polynomial: asin(x) ~ x + x^3 * P(x^2). + + The largest observed error in this region is 1.16 ulps, + acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 want 0x1.0c27f6p+0. + + For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1 + + acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)) + + where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the + approximation of asin near 0. + + The largest observed error in this region is 1.32 ulps, + acosf(0x1.15ba56p-1) got 0x1.feb33p-1 want 0x1.feb32ep-1. + + For x in [-1.0, -0.5], use this other identity to deduce the negative inputs + from their absolute value. + + acos(x) = pi - acos(-x) + + The largest observed error in this region is 1.28 ulps, + acosf(-0x1.002072p-1) got 0x1.0c1e84p+1 want 0x1.0c1e82p+1. */ +float +acosf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + float ax = asfloat (ia); + uint32_t sign = ix & ~AbsMask; + + /* Special values and invalid range. 
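   (In both the double and single-precision routines the final
   "asuint (p) | sign" is a branch-free copysign: p is non-negative at
   that point, so OR-ing the saved sign bit back in matches the
   illustrative scalar form

     float a = asfloat (asuint (p) | sign);
     float b = copysignf (p, x);   // same value whenever p >= 0

   while avoiding a libm call.)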
*/ + if (unlikely (ia12 == QNaN)) + return x; + if (ia > One) + return __math_invalidf (x); + if (ia12 < Small12) + return PiOver2f - x; + + /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f); + float z = ax < 0.5 ? ax : sqrtf (z2); + + /* Use a single polynomial approximation P for both intervals. */ + float p = horner_4_f32 (z2, __asinf_poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = fmaf (z * z2, p, z); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = pi - 2 Q(|x|), for -1.0 < x <= -0.5 + = 2 Q(|x|) , for -0.5 < x < 0.0. */ + if (ax < 0.5) + return PiOver2f - asfloat (asuint (p) | sign); + + return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p; +} + +TEST_SIG (S, F, 1, acos, -1.0, 1.0) +TEST_ULP (acosf, 0.82) +TEST_INTERVAL (acosf, 0, Small, 5000) +TEST_INTERVAL (acosf, Small, 0.5, 50000) +TEST_INTERVAL (acosf, 0.5, 1.0, 50000) +TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000) +TEST_INTERVAL (acosf, 0x1p11, inf, 20000) +TEST_INTERVAL (acosf, -0, -inf, 20000) diff --git a/math/aarch64/experimental/acosh_3u.c b/math/aarch64/experimental/acosh_3u.c new file mode 100644 index 000000000000..19da82f4f3e5 --- /dev/null +++ b/math/aarch64/experimental/acosh_3u.c @@ -0,0 +1,61 @@ +/* + * Double-precision acosh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Ln2 (0x1.62e42fefa39efp-1) +#define MinusZero (0x8000000000000000) +#define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511). */ +#define Two (0x4000000000000000) /* asuint64(2.0). */ + +/* acosh approximation using a variety of approaches on different intervals: + + acosh(x) = ln(x + sqrt(x * x - 1)). + + x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) + is close enough to x that we can calculate the result by ln(2x) == ln(x) + + ln(2). The greatest observed error in this region is 0.98 ULP: + acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9 + want 0x1.28066a11a7c8p+9. + + x > 2: Calculate the result directly using definition of acosh(x). Greatest + observed error in this region is 1.33 ULP: + acosh(0x1.1e45d14bfcfa2p+1) got 0x1.71a06f50c34b5p+0 + want 0x1.71a06f50c34b6p+0. + + 0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is + undefined. For 1 <= x <= 2, the largest observed error is 2.69 ULP: + acosh(0x1.073528248093p+0) got 0x1.e4d9bd20684f3p-3 + want 0x1.e4d9bd20684f6p-3. */ +double +acosh (double x) +{ + uint64_t ix = asuint64 (x); + + if (unlikely (ix >= MinusZero)) + return __math_invalid (x); + + if (unlikely (ix >= SquareLim)) + return log (x) + Ln2; + + if (ix >= Two) + return log (x + sqrt (x * x - 1)); + + double xm1 = x - 1; + return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1)); +} + +TEST_SIG (S, D, 1, acosh, 1.0, 10.0) +TEST_ULP (acosh, 2.19) +TEST_INTERVAL (acosh, 0, 1, 10000) +TEST_INTERVAL (acosh, 1, 2, 100000) +TEST_INTERVAL (acosh, 2, 0x1p511, 100000) +TEST_INTERVAL (acosh, 0x1p511, inf, 100000) +TEST_INTERVAL (acosh, -0, -inf, 10000) diff --git a/math/aarch64/experimental/acoshf_2u8.c b/math/aarch64/experimental/acoshf_2u8.c new file mode 100644 index 000000000000..a46b310ee312 --- /dev/null +++ b/math/aarch64/experimental/acoshf_2u8.c @@ -0,0 +1,55 @@ +/* + * Single-precision acosh(x) function. 
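   (The log1p form used for 1 <= x <= 2 in both precisions follows from
   the definition with t = x - 1:
     x^2 - 1 = (x - 1)(x + 1) = t^2 + 2t
   so acosh(x) = ln(x + sqrt(x^2 - 1)) = log1p (t + sqrt (2*t + t*t)),
   which avoids the catastrophic cancellation in x*x - 1 when x is close
   to 1.)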
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define Ln2 (0x1.62e4p-1f)
+#define MinusZero 0x80000000
+#define SquareLim 0x5f800000 /* asuint(0x1p64). */
+#define Two 0x40000000
+
+/* acoshf approximation using a variety of approaches on different intervals:
+
+   x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is
+   close enough to x that we can calculate the result by ln(2x) == ln(x) +
+   ln(2). The greatest error in this region is 0.94 ULP:
+   acoshf(0x1.15f706p+92) got 0x1.022e14p+6 want 0x1.022e16p+6.
+
+   x > 2: Calculate the result directly using the definition of acosh(x) =
+   ln(x + sqrt(x*x - 1)). Greatest error in this region is 1.30 ULP:
+   acoshf(0x1.249d8p+1) got 0x1.77e1aep+0 want 0x1.77e1bp+0.
+
+   0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is
+   undefined. For 1 <= x <= 2, the greatest error is 2.78 ULP:
+   acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 want 0x1.ef9ea2p-3.  */
+float
+acoshf (float x)
+{
+  uint32_t ix = asuint (x);
+
+  if (unlikely (ix >= MinusZero))
+    return __math_invalidf (x);
+
+  if (unlikely (ix >= SquareLim))
+    return logf (x) + Ln2;
+
+  if (ix > Two)
+    return logf (x + sqrtf (x * x - 1));
+
+  float xm1 = x - 1;
+  return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1));
+}
+
+TEST_SIG (S, F, 1, acosh, 1.0, 10.0)
+TEST_ULP (acoshf, 2.30)
+TEST_INTERVAL (acoshf, 0, 1, 100)
+TEST_INTERVAL (acoshf, 1, 2, 10000)
+TEST_INTERVAL (acoshf, 2, 0x1p64, 100000)
+TEST_INTERVAL (acoshf, 0x1p64, inf, 100000)
+TEST_INTERVAL (acoshf, -0, -inf, 10000)
diff --git a/math/aarch64/experimental/advsimd/erfinv_25u.c b/math/aarch64/experimental/advsimd/erfinv_25u.c
new file mode 100644
index 000000000000..2fa2f0beb8b7
--- /dev/null
+++ b/math/aarch64/experimental/advsimd/erfinv_25u.c
@@ -0,0 +1,166 @@
+/*
+ * Double-precision inverse error function (AdvSIMD variant).
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "v_poly_f64.h"
+#define V_LOG_INLINE_POLY_ORDER 4
+#include "v_log_inline.h"
+
+const static struct data
+{
+  /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
+     coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
+     of the denominator. P is interleaved P_17 and P_37, similar for Q. P17
+     and Q17 are provided as homogenous vectors as well for when the shortcut
+     can be taken.
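   (A scalar picture of the interleaving: coefficient i is stored as the
   pair { P_17[i], P_37[i] }, so per-lane selection is just an index into
   the pair - a hypothetical helper, for illustration only:

     // in_tail is 0 for the normal region, 1 for the tail region
     static inline double
     pick_coeff (const double pair[2], int in_tail)
     {
       return pair[in_tail];
     }

   The vector code below performs the same selection with a byte-table
   lookup instead of a gather.)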
*/
+  double P[8][2], Q[7][2];
+  float64x2_t tailshift;
+  uint8_t idx[16];
+  struct v_log_inline_data log_tbl;
+  float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
+} data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
+		  { -0x1.6b23cc5c6c6d7p+6, 0x1.60b8fe375999ep-2 },
+		  { 0x1.74e5f6ceb3548p+7, -0x1.779bb9bef7c0fp+1 },
+		  { -0x1.5200bb15cc6bbp+7, 0x1.786ea384470a2p+3 },
+		  { 0x1.05d193233a849p+6, -0x1.6a7c1453c85d3p+4 },
+		  { -0x1.148c5474ee5e1p+3, 0x1.31f0fc5613142p+4 },
+		  { 0x1.689181bbafd0cp-3, -0x1.5ea6c007d4dbbp+2 },
+		  { 0, 0x1.e66f265ce9e5p-3 } },
+	   .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 },
+		  { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 },
+		  { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 },
+		  { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 },
+		  { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 },
+		  { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 },
+		  { 0x1p+0, -0x1.4075c56404eecp+3 } },
+	   .P_57 = { V2 (0x1.b874f9516f7f1p-14), V2 (0x1.5921f2916c1c4p-7),
+		     V2 (0x1.145ae7d5b8fa4p-2), V2 (0x1.29d6dcc3b2fb7p+1),
+		     V2 (0x1.cabe2209a7985p+2), V2 (0x1.11859f0745c4p+3),
+		     V2 (0x1.b7ec7bc6a2ce5p+2), V2 (0x1.d0419e0bb42aep+1),
+		     V2 (0x1.c5aa03eef7258p-1) },
+	   .Q_57 = { V2 (0x1.b8747e12691f1p-14), V2 (0x1.59240d8ed1e0ap-7),
+		     V2 (0x1.14aef2b181e2p-2), V2 (0x1.2cd181bcea52p+1),
+		     V2 (0x1.e6e63e0b7aa4cp+2), V2 (0x1.65cf8da94aa3ap+3),
+		     V2 (0x1.7e5c787b10a36p+3), V2 (0x1.0626d68b6cea3p+3),
+		     V2 (0x1.065c5f193abf6p+2), V2 (0x1p+0) },
+	   .P_17 = { V2 (0x1.007ce8f01b2e8p+4), V2 (-0x1.6b23cc5c6c6d7p+6),
+		     V2 (0x1.74e5f6ceb3548p+7), V2 (-0x1.5200bb15cc6bbp+7),
+		     V2 (0x1.05d193233a849p+6), V2 (-0x1.148c5474ee5e1p+3),
+		     V2 (0x1.689181bbafd0cp-3) },
+	   .Q_17 = { V2 (0x1.d8fb0f913bd7bp+3), V2 (-0x1.6d7f25a3f1c24p+6),
+		     V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
+		     V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
+	   .tailshift = V2 (-0.87890625),
+	   .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
+	   .log_tbl = V_LOG_CONSTANTS };
+
+static inline float64x2_t
+special (float64x2_t x, const struct data *d)
+{
+  /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf.
+     By using log here, instead of log1p, we return finite values for both
+     these inputs, and values outside [-1, 1]. This is non-compliant, but is an
+     acceptable optimisation at Ofast. To get correct behaviour for all finite
+     values use the log1p_inline helper on -abs(x) - note that erfinv(inf)
+     will still be finite.  */
+  float64x2_t t = vnegq_f64 (
+      v_log_inline (vsubq_f64 (v_f64 (1), vabsq_f64 (x)), &d->log_tbl));
+  t = vdivq_f64 (v_f64 (1), vsqrtq_f64 (t));
+  float64x2_t ts = vbslq_f64 (v_u64 (0x7fffffffffffffff), t, x);
+  return vdivq_f64 (v_horner_8_f64 (t, d->P_57),
+		    vmulq_f64 (ts, v_horner_9_f64 (t, d->Q_57)));
+}
+
+static inline float64x2_t
+lookup (const double *c, uint8x16_t idx)
+{
+  float64x2_t x = vld1q_f64 (c);
+  return vreinterpretq_f64_u8 (vqtbl1q_u8 (vreinterpretq_u8_f64 (x), idx));
+}
+
+static inline float64x2_t VPCS_ATTR
+notails (float64x2_t x, const struct data *d)
+{
+  /* Shortcut when no input is in a tail region - no need to gather shift or
+     coefficients.  */
+  float64x2_t t = vfmaq_f64 (v_f64 (-0.5625), x, x);
+  float64x2_t p = vmulq_f64 (v_horner_6_f64 (t, d->P_17), x);
+  float64x2_t q = vaddq_f64 (d->Q_17[5], t);
+  for (int i = 4; i >= 0; i--)
+    q = vfmaq_f64 (d->Q_17[i], q, t);
+  return vdivq_f64 (p, q);
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+   error function in double-precision.
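   The lookup helper above works because vqtbl1q_u8 is a byte shuffle:
   idx entries 0-7 copy the first double of a loaded pair into both
   lanes, and the offset 8 (added only on tail lanes) selects the second
   double instead. Illustrative values, assuming a pair { 1.0, 2.0 }:

     lookup (pair, { 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7 })       -> { 1.0, 1.0 }
     lookup (pair, { 0,1,2,3,4,5,6,7, 8,9,10,11,12,13,14,15 }) -> { 1.0, 2.0 }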
+   Largest observed error is 24.75 ULP:
+   _ZGVnN2v_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0
+					 want 0x1.ea0547268660cp+0.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* Calculate inverse error using algorithm described in
+     J. M. Blair, C. A. Edwards, and J. H. Johnson,
+     "Rational Chebyshev approximations for the inverse of the error function",
+     Math. Comp. 30, pp. 827--830 (1976).
+     https://doi.org/10.1090/S0025-5718-1976-0421040-7.
+
+     Algorithm has 3 intervals:
+     - 'Normal' region [-0.75, 0.75]
+     - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
+     - Extreme tail [-1, -0.9375] U [0.9375, 1]
+     Normal and tail are both rational approximations of similar order on
+     shifted input - these are typically performed in parallel using gather
+     loads to obtain correct coefficients depending on interval.  */
+  uint64x2_t is_tail = vcagtq_f64 (x, v_f64 (0.75));
+
+  if (unlikely (!v_any_u64 (is_tail)))
+    /* If input is uniform in [-1, 1] then likelihood of this is
+       0.75^2 ~= 0.56.  */
+    return notails (x, d);
+
+  uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375));
+
+  uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8));
+  uint8x16_t idx = vaddq_u8 (vld1q_u8 (d->idx), off);
+
+  float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625));
+  t = vfmaq_f64 (t, x, x);
+
+  float64x2_t p = lookup (&d->P[7][0], idx);
+  /* Last coeff of q is either 0 or 1 - use mask instead of load.  */
+  float64x2_t q = vreinterpretq_f64_u64 (
+      vandq_u64 (is_tail, vreinterpretq_u64_f64 (v_f64 (1))));
+  for (int i = 6; i >= 0; i--)
+    {
+      p = vfmaq_f64 (lookup (&d->P[i][0], idx), p, t);
+      q = vfmaq_f64 (lookup (&d->Q[i][0], idx), q, t);
+    }
+  p = vmulq_f64 (p, x);
+
+  if (unlikely (v_any_u64 (extreme_tail)))
+    return vbslq_f64 (extreme_tail, special (x, d), vdivq_f64 (p, q));
+
+  return vdivq_f64 (p, q);
+}
+
+#if USE_MPFR
+# warning Not generating tests for _ZGVnN2v_erfinv, as MPFR has no suitable reference
+#else
+TEST_SIG (V, D, 1, erfinv, -0.99, 0.99)
+TEST_ULP (V_NAME_D1 (erfinv), 24.8)
+TEST_DISABLE_FENV (V_NAME_D1 (erfinv))
+TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000)
+/* Test with control lane in each interval.  */
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.95)
+#endif
diff --git a/math/aarch64/experimental/advsimd/erfinvf_5u.c b/math/aarch64/experimental/advsimd/erfinvf_5u.c
new file mode 100644
index 000000000000..254d50feb289
--- /dev/null
+++ b/math/aarch64/experimental/advsimd/erfinvf_5u.c
@@ -0,0 +1,172 @@
+/*
+ * Single-precision inverse error function (AdvSIMD variant).
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "v_poly_f32.h"
+#include "v_logf_inline.h"
+
+const static struct data
+{
+  /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
+     coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
+     of the denominator. Coefficients are stored in various interleaved
+     formats to allow for table-based (vector-to-vector) lookup.
+
+     Plo is first two coefficients of P_10 and P_29 interleaved.
+ PQ is third coeff of P_10 and first of Q_29 interleaved. + Qhi is second and third coeffs of Q_29 interleaved. + P29_3 is a homogenous vector with fourth coeff of P_29. + + P_10 and Q_10 are also stored in homogenous vectors to allow better + memory access when no lanes are in a tail region. */ + float Plo[4], PQ[4], Qhi[4]; + float32x4_t P29_3, tailshift; + float32x4_t P_50[6], Q_50[2]; + float32x4_t P_10[3], Q_10[3]; + uint8_t idxhi[16], idxlo[16]; + struct v_logf_data logf_tbl; +} data = { + .idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 }, + .idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 }, + .P29_3 = V4 (0x1.b13626p-2), + .tailshift = V4 (-0.87890625), + .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 }, + .PQ = { -0x1.293ff6p+3, -0x1.f59ee2p+0, -0x1.8265eep+3, -0x1.69952p-4 }, + .Qhi = { 0x1.ef5eaep+4, 0x1.c7b7d2p-1, -0x1.12665p+4, -0x1.167d7p+1 }, + .P_50 = { V4 (0x1.3d8948p-3), V4 (0x1.61f9eap+0), V4 (0x1.61c6bcp-1), + V4 (-0x1.20c9f2p+0), V4 (0x1.5c704cp-1), V4 (-0x1.50c6bep-3) }, + .Q_50 = { V4 (0x1.3d7dacp-3), V4 (0x1.629e5p+0) }, + .P_10 = { V4 (-0x1.a31268p+3), V4 (0x1.ac9048p+4), V4 (-0x1.293ff6p+3) }, + .Q_10 = { V4 (-0x1.8265eep+3), V4 (0x1.ef5eaep+4), V4 (-0x1.12665p+4) }, + .logf_tbl = V_LOGF_CONSTANTS +}; + +static inline float32x4_t +special (float32x4_t x, const struct data *d) +{ + /* Note erfinvf(inf) should return NaN, and erfinvf(1) should return Inf. + By using log here, instead of log1p, we return finite values for both + these inputs, and values outside [-1, 1]. This is non-compliant, but is an + acceptable optimisation at Ofast. To get correct behaviour for all finite + values use the log1pf_inline helper on -abs(x) - note that erfinvf(inf) + will still be finite. */ + float32x4_t t = vdivq_f32 ( + v_f32 (1), vsqrtq_f32 (vnegq_f32 (v_logf_inline ( + vsubq_f32 (v_f32 (1), vabsq_f32 (x)), &d->logf_tbl)))); + float32x4_t ts = vbslq_f32 (v_u32 (0x7fffffff), t, x); + float32x4_t q = vfmaq_f32 (d->Q_50[0], vaddq_f32 (t, d->Q_50[1]), t); + return vdivq_f32 (v_horner_5_f32 (t, d->P_50), vmulq_f32 (ts, q)); +} + +static inline float32x4_t +notails (float32x4_t x, const struct data *d) +{ + /* Shortcut when no input is in a tail region - no need to gather shift or + coefficients. */ + float32x4_t t = vfmaq_f32 (v_f32 (-0.5625), x, x); + float32x4_t q = vaddq_f32 (t, d->Q_10[2]); + q = vfmaq_f32 (d->Q_10[1], t, q); + q = vfmaq_f32 (d->Q_10[0], t, q); + + return vdivq_f32 (vmulq_f32 (x, v_horner_2_f32 (t, d->P_10)), q); +} + +static inline float32x4_t +lookup (float32x4_t tbl, uint8x16_t idx) +{ + return vreinterpretq_f32_u8 (vqtbl1q_u8 (vreinterpretq_u8_f32 (tbl), idx)); +} + +/* Vector implementation of Blair et al's rational approximation to inverse + error function in single-precision. Worst-case error is 4.98 ULP, in the + tail region: + _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0 + want 0x1.b4793ap+0 . */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erfinv) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* Calculate inverse error using algorithm described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error + function", Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. 
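   In the extreme tail both variants switch variables to
   t = 1 / sqrt (-ln (1 - |x|)) and evaluate a rational function in t. A
   scalar sketch of that path, with poly_p and poly_q standing in for the
   P_50/Q_50 (or P_57/Q_57) evaluations - hypothetical helpers, for
   illustration only:

     float t = 1.0f / sqrtf (-logf (1.0f - fabsf (x)));
     float y = copysignf (poly_p (t) / (t * poly_q (t)), x);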
+ + Algorithm has 3 intervals: + - 'Normal' region [-0.75, 0.75] + - Tail region [0.75, 0.9375] U [-0.9375, -0.75] + - Extreme tail [-1, -0.9375] U [0.9375, 1] + Normal and tail are both rational approximation of similar order on + shifted input - these are typically performed in parallel using gather + loads to obtain correct coefficients depending on interval. */ + uint32x4_t is_tail = vcageq_f32 (x, v_f32 (0.75)); + uint32x4_t extreme_tail = vcageq_f32 (x, v_f32 (0.9375)); + + if (unlikely (!v_any_u32 (is_tail))) + /* Shortcut for if all lanes are in [-0.75, 0.75] - can avoid having to + gather coefficients. If input is uniform in [-1, 1] then likelihood of + this is 0.75^4 ~= 0.31. */ + return notails (x, d); + + /* Select requisite shift depending on interval: polynomial is evaluated on + x * x - shift. + Normal shift = 0.5625 + Tail shift = 0.87890625. */ + float32x4_t t + = vfmaq_f32 (vbslq_f32 (is_tail, d->tailshift, v_f32 (-0.5625)), x, x); + + /* Calculate indexes for tbl: tbl is byte-wise, so: + [0, 1, 2, 3, 4, 5, 6, ....] copies the vector + Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores + two pairs of coeffs, so we need two idx vectors - one for each pair. */ + uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4)); + uint8x16_t idx_lo = vaddq_u8 (vld1q_u8 (d->idxlo), off); + uint8x16_t idx_hi = vaddq_u8 (vld1q_u8 (d->idxhi), off); + + /* Load the tables. */ + float32x4_t plo = vld1q_f32 (d->Plo); + float32x4_t pq = vld1q_f32 (d->PQ); + float32x4_t qhi = vld1q_f32 (d->Qhi); + + /* Do the lookup (and calculate p3 by masking non-tail lanes). */ + float32x4_t p3 = vreinterpretq_f32_u32 ( + vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3))); + float32x4_t p0 = lookup (plo, idx_lo), p1 = lookup (plo, idx_hi), + p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi), + q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi); + + float32x4_t p = vfmaq_f32 (p2, p3, t); + p = vfmaq_f32 (p1, p, t); + p = vfmaq_f32 (p0, p, t); + p = vmulq_f32 (x, p); + + float32x4_t q = vfmaq_f32 (q1, vaddq_f32 (q2, t), t); + q = vfmaq_f32 (q0, q, t); + + if (unlikely (v_any_u32 (extreme_tail))) + /* At least one lane is in the extreme tail - if input is uniform in + [-1, 1] the likelihood of this is ~0.23. */ + return vbslq_f32 (extreme_tail, special (x, d), vdivq_f32 (p, q)); + + return vdivq_f32 (p, q); +} + +HALF_WIDTH_ALIAS_F1 (erfinv) + +#if USE_MPFR +# warning Not generating tests for _ZGVnN4v_erfinvf, as MPFR has no suitable reference +#else +TEST_SIG (V, F, 1, erfinv, -0.99, 0.99) +TEST_DISABLE_FENV (V_NAME_F1 (erfinv)) +TEST_ULP (V_NAME_F1 (erfinv), 4.49) +TEST_SYM_INTERVAL (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000) +/* Test with control lane in each interval. */ +TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.5) +TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.8) +TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.95) +#endif diff --git a/math/aarch64/experimental/advsimd/v_logf_inline.h b/math/aarch64/experimental/advsimd/v_logf_inline.h new file mode 100644 index 000000000000..3f4534173289 --- /dev/null +++ b/math/aarch64/experimental/advsimd/v_logf_inline.h @@ -0,0 +1,59 @@ +/* + * Single-precision vector log function - inline version + * + * Copyright (c) 2019-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +struct v_logf_data +{ + float32x4_t poly[7]; + float32x4_t ln2; + uint32x4_t off, mantissa_mask; +}; + +#define V_LOGF_CONSTANTS \ + { \ + .poly \ + = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), \ + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), \ + V4 (-0x1.ffffc8p-2f) }, \ + .ln2 = V4 (0x1.62e43p-1f), .off = V4 (0x3f2aaaab), \ + .mantissa_mask = V4 (0x007fffff) \ + } + +#define P(i) d->poly[7 - i] + +static inline float32x4_t +v_logf_inline (float32x4_t x, const struct v_logf_data *d) +{ + float32x4_t n, p, q, r, r2, y; + uint32x4_t u; + + u = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = vsubq_u32 (u, d->off); + n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vandq_u32 (u, d->mantissa_mask); + u = vaddq_u32 (u, d->off); + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + p = vfmaq_f32 (P (5), P (6), r); + q = vfmaq_f32 (P (3), P (4), r); + y = vfmaq_f32 (P (1), P (2), r); + p = vfmaq_f32 (p, P (7), r2); + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + return vfmaq_f32 (p, y, r2); +} + +#undef P diff --git a/math/aarch64/experimental/asin_3u.c b/math/aarch64/experimental/asin_3u.c new file mode 100644 index 000000000000..56e63e451ba1 --- /dev/null +++ b/math/aarch64/experimental/asin_3u.c @@ -0,0 +1,106 @@ +/* + * Double-precision asin(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "poly_scalar_f64.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 +#define PiOver2 0x1.921fb54442d18p+0 +#define Small 0x3e50000000000000 /* 2^-26. */ +#define Small16 0x3e50 +#define QNaN 0x7ff8 + +/* Fast implementation of double-precision asin(x) based on polynomial + approximation. + + For x < Small, approximate asin(x) by x. Small = 2^-26 for correct rounding. + + For x in [Small, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 1.01 ulps, + asin(0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2 + want 0x1.ed78525a927eep-2. + + No cheap approximation can be obtained near x = 1, since the function is not + continuously differentiable on 1. + + For x in [0.5, 1.0], we use a method based on a trigonometric identity + + asin(x) = pi/2 - acos(x) + + and a generalized power series expansion of acos(y) near y=1, that reads as + + acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1) + + The Taylor series of asin(z) near z = 0, reads as + + asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...). + + Therefore, (1) can be written in terms of P(y/2) or even asin(y/2) + + acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2) + + Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and + + asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)). + + The largest observed error in this region is 2.69 ulps, + asin(0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. 
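   The order-11 polynomial is evaluated with an Estrin scheme; on the
   smallest example the shape is easy to see (a sketch, not the library
   helper itself):

     // p(z) = c0 + c1 z + c2 z^2 + c3 z^3
     //      = (c0 + c1 z) + z^2 (c2 + c3 z)
     double estrin_3 (double z, double z2, const double *c)
     {
       return fma (z2, fma (z, c[3], c[2]), fma (z, c[1], c[0]));
     }

   estrin_11_f64 applies the same splitting with the precomputed powers
   z2, z4, z8 and z16, shortening the dependency chain relative to
   Horner.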
*/
+double
+asin (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ia = ix & AbsMask;
+  uint64_t ia16 = ia >> 48;
+  double ax = asdouble (ia);
+  uint64_t sign = ix & ~AbsMask;
+
+  /* Special values and invalid range.  */
+  if (unlikely (ia16 == QNaN))
+    return x;
+  if (ia > One)
+    return __math_invalid (x);
+  if (ia16 < Small16)
+    return x;
+
+  /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+     z2 = x ^ 2         and z = |x|     , if |x| < 0.5
+     z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5.  */
+  double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
+  double z = ax < 0.5 ? ax : sqrt (z2);
+
+  /* Use a single polynomial approximation P for both intervals.  */
+  double z4 = z2 * z2;
+  double z8 = z4 * z4;
+  double z16 = z8 * z8;
+  double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
+
+  /* Finalize polynomial: z + z * z2 * P(z2).  */
+  p = fma (z * z2, p, z);
+
+  /* asin(|x|) = Q(|x|)         , for |x| < 0.5
+	       = pi/2 - 2 Q(|x|), for |x| >= 0.5.  */
+  double y = ax < 0.5 ? p : fma (-2.0, p, PiOver2);
+
+  /* Copy sign.  */
+  return asdouble (asuint64 (y) | sign);
+}
+
+TEST_SIG (S, D, 1, asin, -1.0, 1.0)
+TEST_ULP (asin, 2.20)
+TEST_INTERVAL (asin, 0, Small, 5000)
+TEST_INTERVAL (asin, Small, 0.5, 50000)
+TEST_INTERVAL (asin, 0.5, 1.0, 50000)
+TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (asin, 0x1p11, inf, 20000)
+TEST_INTERVAL (asin, -0, -inf, 20000)
diff --git a/math/aarch64/experimental/asin_data.c b/math/aarch64/experimental/asin_data.c
new file mode 100644
index 000000000000..60ab476e7ec9
--- /dev/null
+++ b/math/aarch64/experimental/asin_data.c
@@ -0,0 +1,19 @@
+/*
+ * Coefficients for double-precision asin(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asin(x) directly in [0x1p-106, 0.25]. See tools/asin.sollya
+   for how these coefficients were generated.  */
+const double __asin_poly[] = {
+  /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+     on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.  */
+  0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
+  0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
+  0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
+  0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6,
+};
diff --git a/math/aarch64/experimental/asinf_2u5.c b/math/aarch64/experimental/asinf_2u5.c
new file mode 100644
index 000000000000..1136da01550e
--- /dev/null
+++ b/math/aarch64/experimental/asinf_2u5.c
@@ -0,0 +1,100 @@
+/*
+ * Single-precision asin(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define PiOver2f 0x1.921fb6p+0f
+#define Small 0x39800000 /* 2^-12. */
+#define Small12 0x398
+#define QNaN 0x7fc
+
+/* Fast implementation of single-precision asin(x) based on polynomial
+   approximation.
+
+   For x < Small, approximate asin(x) by x. Small = 2^-12 for correct rounding.
+
+   For x in [Small, 0.5], use order 4 polynomial P such that the final
+   approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+   The largest observed error in this region is 0.83 ulps,
+   asinf(0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
+
+   No cheap approximation can be obtained near x = 1, since the function is not
+   continuously differentiable at x = 1.
+
+   For x in [0.5, 1.0], we use a method based on a trigonometric identity
+
+     asin(x) = pi/2 - acos(x)
+
+   and a generalized power series expansion of acos(y) near y=1, that reads as
+
+     acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1)
+
+   The Taylor series of asin(z) near z = 0, reads as
+
+     asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
+
+   Therefore, (1) can be written in terms of P(y/2) or even asin(y/2)
+
+     acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2)).
+
+   Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and
+
+     asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
+
+   The largest observed error in this region is 2.41 ulps,
+   asinf(0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1.  */
+float
+asinf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 20;
+  float ax = asfloat (ia);
+  uint32_t sign = ix & ~AbsMask;
+
+  /* Special values and invalid range.  */
+  if (unlikely (ia12 == QNaN))
+    return x;
+  if (ia > One)
+    return __math_invalidf (x);
+  if (ia12 < Small12)
+    return x;
+
+  /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+     z2 = x ^ 2         and z = |x|     , if |x| < 0.5
+     z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5.  */
+  float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f);
+  float z = ax < 0.5 ? ax : sqrtf (z2);
+
+  /* Use a single polynomial approximation P for both intervals.  */
+  float p = horner_4_f32 (z2, __asinf_poly);
+  /* Finalize polynomial: z + z * z2 * P(z2).  */
+  p = fmaf (z * z2, p, z);
+
+  /* asin(|x|) = Q(|x|)         , for |x| < 0.5
+	       = pi/2 - 2 Q(|x|), for |x| >= 0.5.  */
+  float y = ax < 0.5 ? p : fmaf (-2.0f, p, PiOver2f);
+
+  /* Copy sign.  */
+  return asfloat (asuint (y) | sign);
+}
+
+TEST_SIG (S, F, 1, asin, -1.0, 1.0)
+TEST_ULP (asinf, 1.91)
+TEST_INTERVAL (asinf, 0, Small, 5000)
+TEST_INTERVAL (asinf, Small, 0.5, 50000)
+TEST_INTERVAL (asinf, 0.5, 1.0, 50000)
+TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (asinf, 0x1p11, inf, 20000)
+TEST_INTERVAL (asinf, -0, -inf, 20000)
diff --git a/math/aarch64/experimental/asinf_data.c b/math/aarch64/experimental/asinf_data.c
new file mode 100644
index 000000000000..15f331dde5a7
--- /dev/null
+++ b/math/aarch64/experimental/asinf_data.c
@@ -0,0 +1,16 @@
+/*
+ * Coefficients for single-precision asin(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asinf(x) directly in [0x1p-24, 0.25]. See tools/asinf.sollya
+   for how these coefficients were generated.  */
+const float __asinf_poly[] = {
+  /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+     [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 .  */
+  0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, 0x1.3af7d8p-5,
+};
diff --git a/math/aarch64/experimental/asinh_2u5.c b/math/aarch64/experimental/asinh_2u5.c
new file mode 100644
index 000000000000..9d2d160a1453
--- /dev/null
+++ b/math/aarch64/experimental/asinh_2u5.c
@@ -0,0 +1,82 @@
+/*
+ * Double-precision asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "mathlib.h" +#include "poly_scalar_f64.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffffffffffff +#define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ +#define One 0x3ff0000000000000 /* asuint64(1.0). */ +#define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */ +#define Ln2 0x1.62e42fefa39efp-1 + +/* Scalar double-precision asinh implementation. This routine uses different + approaches on different intervals: + + |x| < 2^-26: Return x. Function is exact in this region. + + |x| < 1: Use custom order-17 polynomial. This is least accurate close to 1. + The largest observed error in this region is 1.47 ULPs: + asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. + + |x| < 2^511: Upper bound of this region is close to sqrt(DBL_MAX). Calculate + the result directly using the definition asinh(x) = ln(x + sqrt(x*x + 1)). + The largest observed error in this region is 2.03 ULPs: + asinh(-0x1.00094e0f39574p+0) got -0x1.c3508eb6a681ep-1 + want -0x1.c3508eb6a682p-1. + + |x| >= 2^511: We cannot square x without overflow at a low + cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot + even double x without overflow, so calculate this as ln(x) + + ln(2). The largest observed error in this region is 0.98 ULPs at many + values, for instance: + asinh(0x1.5255a4cf10319p+975) got 0x1.52652f4cb26cbp+9 + want 0x1.52652f4cb26ccp+9. */ +double +asinh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + double ax = asdouble (ia); + uint64_t sign = ix & ~AbsMask; + + if (ia < ExpM26) + { + return x; + } + + if (ia < One) + { + double x2 = x * x; + double z2 = x2 * x2; + double z4 = z2 * z2; + double z8 = z4 * z4; + double p = estrin_17_f64 (x2, z2, z4, z8, z8 * z8, __asinh_data.poly); + double y = fma (p, x2 * ax, ax); + return asdouble (asuint64 (y) | sign); + } + + if (unlikely (ia >= Exp511)) + { + return asdouble (asuint64 (log (ax) + Ln2) | sign); + } + + return asdouble (asuint64 (log (ax + sqrt (ax * ax + 1))) | sign); +} + +TEST_SIG (S, D, 1, asinh, -10.0, 10.0) +TEST_ULP (asinh, 1.54) +TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000) +TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000) +TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000) +TEST_INTERVAL (asinh, 1.0, 100.0, 40000) +TEST_INTERVAL (asinh, -1.0, -100.0, 10000) +TEST_INTERVAL (asinh, 100.0, inf, 50000) +TEST_INTERVAL (asinh, -100.0, -inf, 10000) diff --git a/math/aarch64/experimental/asinh_data.c b/math/aarch64/experimental/asinh_data.c new file mode 100644 index 000000000000..7afaf6960130 --- /dev/null +++ b/math/aarch64/experimental/asinh_data.c @@ -0,0 +1,23 @@ +/* + * Double-precision polynomial coefficients for scalar asinh(x) + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* asinh(x) is odd, and the first term of the Taylor expansion is x, so we can + approximate the function by x + x^3 * P(x^2), where P(z) has the form: + C0 + C1 * z + C2 * z^2 + C3 * z^3 + ... + Note P is evaluated on even powers of x only. See tools/asinh.sollya for the + algorithm used to generate these coefficients. 
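   (The huge-|x| branch of asinh above rests on a two-step simplification:
   for x >= 2^511, sqrt (x*x + 1) ~= x to double precision, so
     asinh(x) ~= ln (2x) = ln (x) + ln (2)
   which never forms x*x or 2*x and therefore cannot overflow.)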
*/
+const struct asinh_data __asinh_data
+    = { .poly
+	= { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
+	    0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
+	    -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
+	    0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
+	    -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
+	    0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14,
+	    0x1.93d4ba83d34dap-18 } };
diff --git a/math/aarch64/experimental/asinhf_3u5.c b/math/aarch64/experimental/asinhf_3u5.c
new file mode 100644
index 000000000000..92c6dfd9b43d
--- /dev/null
+++ b/math/aarch64/experimental/asinhf_3u5.c
@@ -0,0 +1,73 @@
+/*
+ * Single-precision asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define AbsMask (0x7fffffff)
+#define SqrtFltMax (0x1.749e96p+10f)
+#define Ln2 (0x1.62e4p-1f)
+#define One (0x3f8)
+#define ExpM12 (0x398)
+
+/* asinhf approximation using a variety of approaches on different intervals:
+
+   |x| < 2^-12: Return x. Function is exactly rounded in this region.
+
+   |x| < 1.0: Use custom order-8 polynomial. The largest observed
+   error in this region is 1.3 ulps:
+   asinhf(0x1.f0f74cp-1) got 0x1.b88de4p-1 want 0x1.b88de2p-1.
+
+   |x| <= SqrtFltMax: Calculate the result directly using the
+   definition of asinh(x) = ln(x + sqrt(x*x + 1)). The largest
+   observed error in this region is 1.99 ulps:
+   asinhf(0x1.00e358p+0) got 0x1.c4849ep-1 want 0x1.c484a2p-1.
+
+   |x| > SqrtFltMax: We cannot square x without overflow at a low
+   cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot
+   even double x without overflow, so calculate this as ln(x) +
+   ln(2). The largest observed error in this region is 3.39 ulps:
+   asinhf(0x1.749e9ep+10) got 0x1.fffff8p+2 want 0x1.fffffep+2.  */
+float
+asinhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ia = ix & AbsMask;
+  uint32_t ia12 = ia >> 20;
+  float ax = asfloat (ia);
+  uint32_t sign = ix & ~AbsMask;
+
+  if (unlikely (ia12 < ExpM12 || ia == 0x7f800000))
+    return x;
+
+  if (unlikely (ia12 >= 0x7f8))
+    return __math_invalidf (x);
+
+  if (ia12 < One)
+    {
+      float x2 = ax * ax;
+      float p = estrin_7_f32 (ax, x2, x2 * x2, __asinhf_data.coeffs);
+      float y = fmaf (x2, p, ax);
+      return asfloat (asuint (y) | sign);
+    }
+
+  if (unlikely (ax > SqrtFltMax))
+    {
+      return asfloat (asuint (logf (ax) + Ln2) | sign);
+    }
+
+  return asfloat (asuint (logf (ax + sqrtf (ax * ax + 1))) | sign);
+}
+
+TEST_SIG (S, F, 1, asinh, -10.0, 10.0)
+TEST_ULP (asinhf, 2.9)
+TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000)
+TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000)
+TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000)
+TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000)
diff --git a/math/aarch64/experimental/asinhf_data.c b/math/aarch64/experimental/asinhf_data.c
new file mode 100644
index 000000000000..5ed261ba835b
--- /dev/null
+++ b/math/aarch64/experimental/asinhf_data.c
@@ -0,0 +1,15 @@
+/*
+ * Coefficients for single-precision asinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asinhf(x) directly in [2^-12, 1]. See tools/asinhf.sollya
+   for how these coefficients were generated.
*/ +const struct asinhf_data __asinhf_data + = { .coeffs = { -0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f, + 0x1.3a81dcp-4f, 0x1.65bbaap-10f, -0x1.057f1p-4f, + 0x1.6c1d46p-5f, -0x1.4cafe8p-7f } }; diff --git a/math/aarch64/experimental/atan2_2u5.c b/math/aarch64/experimental/atan2_2u5.c new file mode 100644 index 000000000000..518e34589e5b --- /dev/null +++ b/math/aarch64/experimental/atan2_2u5.c @@ -0,0 +1,159 @@ +/* + * Double-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <stdbool.h> + +#include "atan_common.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Pi (0x1.921fb54442d18p+1) +#define PiOver2 (0x1.921fb54442d18p+0) +#define PiOver4 (0x1.921fb54442d18p-1) +#define SignMask (0x8000000000000000) +#define ExpMask (0x7ff0000000000000) + +/* We calculate atan2 by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. Evaluating P(x) requires calculating x^8, + which may underflow if n and d have very different magnitude. + POW8_EXP_UFLOW_BOUND is the lower bound of the difference in exponents of n + and d for which P underflows, and is used to special-case such inputs. */ +#define POW8_EXP_UFLOW_BOUND 62 + +static inline int64_t +biased_exponent (double f) +{ + uint64_t fi = asuint64 (f); + return (fi & ExpMask) >> 52; +} + +/* Fast implementation of scalar atan2. Largest errors are when y and x are + close together. The greatest observed error is 2.28 ULP: + atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ +double +atan2 (double y, double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + + uint64_t sign_x = ix & SignMask; + uint64_t sign_y = iy & SignMask; + + uint64_t iax = ix & ~SignMask; + uint64_t iay = iy & ~SignMask; + + bool xisnan = isnan (x); + if (unlikely (isnan (y) && !xisnan)) + return __math_invalid (y); + if (unlikely (xisnan)) + return __math_invalid (x); + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 63) & 1) | ((ix >> 62) & 2); + + int64_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* y = 0. */ + if (iay == 0) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. */ + case 2: + return Pi; /* atan(+0,-anything) = pi. */ + case 3: + return -Pi; /* atan(-0,-anything) =-pi. */ + } + } + /* Special case for (x, y) either on or very close to the y axis. Either x = + 0, or y is much larger than x (difference in exponents >= + POW8_EXP_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND)) + return sign_y ? -PiOver2 : PiOver2; + + /* Special case for either x is INF or (x, y) is very close to x axis and x + is negative. */ + if (unlikely (iax == 0x7ff0000000000000 + || (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2))) + { + if (iay == 0x7ff0000000000000) + { + switch (m) + { + case 0: + return PiOver4; /* atan(+INF,+INF). */ + case 1: + return -PiOver4; /* atan(-INF,+INF). */ + case 2: + return 3.0 * PiOver4; /* atan(+INF,-INF). */ + case 3: + return -3.0 * PiOver4; /* atan(-INF,-INF). */ + } + } + else + { + switch (m) + { + case 0: + return 0.0; /* atan(+...,+INF). */ + case 1: + return -0.0; /* atan(-...,+INF). */ + case 2: + return Pi; /* atan(+...,-INF). */ + case 3: + return -Pi; /* atan(-...,-INF). */ + } + } + } + /* y is INF. */ + if (iay == 0x7ff0000000000000) + return sign_y ? 
-PiOver2 : PiOver2; + + uint64_t sign_xy = sign_x ^ sign_y; + + double ax = asdouble (iax); + double ay = asdouble (iay); + uint64_t pred_aygtax = (ay > ax); + + /* Set up z for call to atan. */ + double n = pred_aygtax ? -ax : ay; + double d = pred_aygtax ? ay : ax; + double z = n / d; + + double ret; + if (unlikely (m < 2 && exp_diff >= POW8_EXP_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + double shift = sign_x ? -2.0 : 0.0; + shift = pred_aygtax ? shift + 1.0 : shift; + shift *= PiOver2; + + ret = eval_poly (z, z, shift); + } + + /* Account for the sign of x and y. */ + return asdouble (asuint64 (ret) ^ sign_xy); +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +TEST_SIG (S, D, 2, atan2) +TEST_ULP (atan2, 1.78) +TEST_INTERVAL (atan2, -10.0, 10.0, 50000) +TEST_INTERVAL (atan2, -1.0, 1.0, 40000) +TEST_INTERVAL (atan2, 0.0, 1.0, 40000) +TEST_INTERVAL (atan2, 1.0, 100.0, 40000) +TEST_INTERVAL (atan2, 1e6, 1e32, 40000) diff --git a/math/aarch64/experimental/atan2f_3u.c b/math/aarch64/experimental/atan2f_3u.c new file mode 100644 index 000000000000..245ba551566c --- /dev/null +++ b/math/aarch64/experimental/atan2f_3u.c @@ -0,0 +1,167 @@ +/* + * Single-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <stdbool.h> + +#include "atanf_common.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Pi (0x1.921fb6p+1f) +#define PiOver2 (0x1.921fb6p+0f) +#define PiOver4 (0x1.921fb6p-1f) +#define SignMask (0x80000000) + +/* We calculate atan2f by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. The polynomial may underflow. + POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and + d for which P underflows, and is used to special-case such inputs. */ +#define POLY_UFLOW_BOUND 24 + +static inline int32_t +biased_exponent (float f) +{ + uint32_t fi = asuint (f); + int32_t ex = (int32_t) ((fi & 0x7f800000) >> 23); + if (unlikely (ex == 0)) + { + /* Subnormal case - we still need to get the exponent right for subnormal + numbers as division may take us back inside the normal range. */ + return ex - __builtin_clz (fi << 9); + } + return ex; +} + +/* Fast implementation of scalar atan2f. Largest observed error is + 2.88ulps in [99.0, 101.0] x [99.0, 101.0]: + atan2f(0x1.9332d8p+6, 0x1.8cb6c4p+6) got 0x1.964646p-1 + want 0x1.964640p-1. */ +float +atan2f (float y, float x) +{ + uint32_t ix = asuint (x); + uint32_t iy = asuint (y); + + uint32_t sign_x = ix & SignMask; + uint32_t sign_y = iy & SignMask; + + uint32_t iax = ix & ~SignMask; + uint32_t iay = iy & ~SignMask; + + /* x or y is NaN. */ + if ((iax > 0x7f800000) || (iay > 0x7f800000)) + return x + y; + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 31) & 1) | ((ix >> 30) & 2); + + /* The following follows glibc ieee754 implementation, except + that we do not use +-tiny shifts (non-nearest rounding mode). */ + + int32_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* Special case for (x, y) either on or very close to the x axis. Either y = + 0, or y is tiny and x is huge (difference in exponents >= + POLY_UFLOW_BOUND). In the second case, we only want to use this special + case when x is negative (i.e. quadrants 2 or 3). 
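The quadrant encoding that drives the switch statements in both atan2 variants can be exercised on its own; a minimal sketch (hypothetical helper name) packing the two sign bits into m = 2 * sign(x) + sign(y):

#include <stdint.h>
#include <string.h>

/* Returns 0..3: 0 = (+x,+y), 1 = (+x,-y), 2 = (-x,+y), 3 = (-x,-y).
   Bit 0 is sign(y), bit 1 is sign(x), matching the switch cases above.  */
uint32_t
quadrant_index (float y, float x)
{
  uint32_t iy, ix;
  memcpy (&iy, &y, sizeof (iy));
  memcpy (&ix, &x, sizeof (ix));
  return ((iy >> 31) & 1) | ((ix >> 30) & 2);
}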
*/ + if (unlikely (iay == 0 || (exp_diff >= POLY_UFLOW_BOUND && m >= 2))) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. */ + case 2: + return Pi; /* atan(+0,-anything) = pi. */ + case 3: + return -Pi; /* atan(-0,-anything) =-pi. */ + } + } + /* Special case for (x, y) either on or very close to the y axis. Either x = + 0, or x is tiny and y is huge (difference in exponents >= + POLY_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POLY_UFLOW_BOUND)) + return sign_y ? -PiOver2 : PiOver2; + + /* x is INF. */ + if (iax == 0x7f800000) + { + if (iay == 0x7f800000) + { + switch (m) + { + case 0: + return PiOver4; /* atan(+INF,+INF). */ + case 1: + return -PiOver4; /* atan(-INF,+INF). */ + case 2: + return 3.0f * PiOver4; /* atan(+INF,-INF). */ + case 3: + return -3.0f * PiOver4; /* atan(-INF,-INF). */ + } + } + else + { + switch (m) + { + case 0: + return 0.0f; /* atan(+...,+INF). */ + case 1: + return -0.0f; /* atan(-...,+INF). */ + case 2: + return Pi; /* atan(+...,-INF). */ + case 3: + return -Pi; /* atan(-...,-INF). */ + } + } + } + /* y is INF. */ + if (iay == 0x7f800000) + return sign_y ? -PiOver2 : PiOver2; + + uint32_t sign_xy = sign_x ^ sign_y; + + float ax = asfloat (iax); + float ay = asfloat (iay); + + bool pred_aygtax = (ay > ax); + + /* Set up z for call to atanf. */ + float n = pred_aygtax ? -ax : ay; + float d = pred_aygtax ? ay : ax; + float z = n / d; + + float ret; + if (unlikely (m < 2 && exp_diff >= POLY_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + float shift = sign_x ? -2.0f : 0.0f; + shift = pred_aygtax ? shift + 1.0f : shift; + shift *= PiOver2; + + ret = eval_poly (z, z, shift); + } + + /* Account for the sign of x and y. */ + return asfloat (asuint (ret) ^ sign_xy); +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +TEST_SIG (S, F, 2, atan2) +TEST_ULP (atan2f, 2.4) +TEST_INTERVAL (atan2f, -10.0, 10.0, 50000) +TEST_INTERVAL (atan2f, -1.0, 1.0, 40000) +TEST_INTERVAL (atan2f, 0.0, 1.0, 40000) +TEST_INTERVAL (atan2f, 1.0, 100.0, 40000) +TEST_INTERVAL (atan2f, 1e6, 1e32, 40000) diff --git a/math/aarch64/experimental/atan_2u5.c b/math/aarch64/experimental/atan_2u5.c new file mode 100644 index 000000000000..9c9c77d98cd3 --- /dev/null +++ b/math/aarch64/experimental/atan_2u5.c @@ -0,0 +1,73 @@ +/* + * Double-precision atan(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "test_sig.h" +#include "test_defs.h" +#include "atan_common.h" + +#define AbsMask 0x7fffffffffffffff +#define PiOver2 0x1.921fb54442d18p+0 +#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ +#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ +#define OneTop 0x3ff + +/* Fast implementation of double-precision atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: + atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +double +atan (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t sign = ix & ~AbsMask; + uint64_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 52; + + if (unlikely (ia12 >= BigBound || ia12 < TinyBound)) + { + if (ia12 < TinyBound) + /* Avoid underflow by returning x. */ + return x; + if (ia > 0x7ff0000000000000) + /* Propagate NaN. 
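The reduction behind the scalar atan routines can be sanity-checked against libm; a rough sketch (hypothetical helper, with atan standing in for the polynomial evaluation) of the identity atan(x) = pi/2 + atan(-1/x) used for x > 1:

#include <math.h>

/* Map x > 1 into the polynomial's domain [0, 1] via
   atan(x) = pi/2 + atan(-1/x); smaller inputs need no reduction.  */
double
atan_via_reduction (double x)
{
  if (x > 1.0)
    return M_PI_2 + atan (-1.0 / x);
  return atan (x);
}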
*/ + return __math_invalid (x); + /* atan(x) rounds to PiOver2 for large x. */ + return asdouble (asuint64 (PiOver2) ^ sign); + } + + double z, az, shift; + if (ia12 >= OneTop) + { + /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ + z = -1.0 / x; + shift = PiOver2; + /* Use absolute value only when needed (odd powers of z). */ + az = -fabs (z); + } + else + { + /* For x < 1, approximate atan(x) directly. */ + z = x; + shift = 0; + az = asdouble (ia); + } + + /* Calculate polynomial, shift + z + z^3 * P(z^2). */ + double y = eval_poly (z, az, shift); + /* Copy sign. */ + return asdouble (asuint64 (y) ^ sign); +} + +TEST_SIG (S, D, 1, atan, -10.0, 10.0) +TEST_ULP (atan, 1.78) +TEST_INTERVAL (atan, 0, 0x1p-30, 10000) +TEST_INTERVAL (atan, -0, -0x1p-30, 1000) +TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000) +TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000) +TEST_INTERVAL (atan, 0x1p53, inf, 10000) +TEST_INTERVAL (atan, -0x1p53, -inf, 1000) diff --git a/math/aarch64/experimental/atan_common.h b/math/aarch64/experimental/atan_common.h new file mode 100644 index 000000000000..1fd83860219b --- /dev/null +++ b/math/aarch64/experimental/atan_common.h @@ -0,0 +1,33 @@ +/* + * Double-precision polynomial evaluation function for scalar + * atan(x) and atan2(y,x). + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "poly_scalar_f64.h" + +/* Polynomial used in fast atan(x) and atan2(y,x) implementations + The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline double +eval_poly (double z, double az, double shift) +{ + /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. */ + double z2 = z * z; + double x2 = z2 * z2; + double x4 = x2 * x2; + double x8 = x4 * x4; + double y = fma (estrin_11_f64 (z2, x2, x4, x8, __atan_poly_data.poly + 8), + x8, estrin_7_f64 (z2, x2, x4, __atan_poly_data.poly)); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + y = fma (y, z2 * az, az); + y = y + shift; + + return y; +} + +#undef P diff --git a/math/aarch64/experimental/atan_data.c b/math/aarch64/experimental/atan_data.c new file mode 100644 index 000000000000..5d24fa912d02 --- /dev/null +++ b/math/aarch64/experimental/atan_data.c @@ -0,0 +1,23 @@ +/* + * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x). + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct atan_poly_data __atan_poly_data + = { .poly = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) + on [2**-1022, 1.0]. See atan.sollya for details of how + these were generated. */ + -0x1.5555555555555p-2, 0x1.99999999996c1p-3, + -0x1.2492492478f88p-3, 0x1.c71c71bc3951cp-4, + -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, + -0x1.aebfe7b418581p-5, 0x1.842dbe9b0d916p-5, + -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, + -0x1.0051381722a59p-6, 0x1.14e9dc19a4a4ep-7, + -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16 } }; diff --git a/math/aarch64/experimental/atanf_2u9.c b/math/aarch64/experimental/atanf_2u9.c new file mode 100644 index 000000000000..518415ded634 --- /dev/null +++ b/math/aarch64/experimental/atanf_2u9.c @@ -0,0 +1,72 @@ +/* + * Single-precision atan(x) function. 
+ * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "atanf_common.h" +#include "test_sig.h" +#include "test_defs.h" + +#define PiOver2 0x1.921fb6p+0f +#define AbsMask 0x7fffffff +#define TinyBound 0x30800000 /* asuint(0x1p-30). */ +#define BigBound 0x4e800000 /* asuint(0x1p30). */ +#define One 0x3f800000 + +/* Approximation of single-precision atan(x) based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. + Maximum error is 2.88 ulps: + atanf(0x1.0565ccp+0) got 0x1.97771p-1 + want 0x1.97770ap-1. */ +float +atanf (float x) +{ + uint32_t ix = asuint (x); + uint32_t sign = ix & ~AbsMask; + uint32_t ia = ix & AbsMask; + + if (unlikely (ia < TinyBound)) + /* Avoid underflow by returning x. */ + return x; + + if (unlikely (ia > BigBound)) + { + if (ia > 0x7f800000) + /* Propagate NaN. */ + return __math_invalidf (x); + /* atan(x) rounds to PiOver2 for large x. */ + return asfloat (asuint (PiOver2) ^ sign); + } + + float z, az, shift; + if (ia > One) + { + /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ + z = -1.0f / x; + shift = PiOver2; + /* Use absolute value only when needed (odd powers of z). */ + az = -fabsf (z); + } + else + { + /* For x < 1, approximate atan(x) directly. */ + z = x; + az = asfloat (ia); + shift = 0; + } + + /* Calculate polynomial, shift + z + z^3 * P(z^2). */ + float y = eval_poly (z, az, shift); + /* Copy sign. */ + return asfloat (asuint (y) ^ sign); +} + +TEST_SIG (S, F, 1, atan, -10.0, 10.0) +TEST_ULP (atanf, 2.38) +TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000) +TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000) +TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000) +TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000) diff --git a/math/aarch64/experimental/atanf_common.h b/math/aarch64/experimental/atanf_common.h new file mode 100644 index 000000000000..3e6542047309 --- /dev/null +++ b/math/aarch64/experimental/atanf_common.h @@ -0,0 +1,38 @@ +/* + * Single-precision polynomial evaluation function for scalar + * atan(x) and atan2(y,x). + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_ATANF_COMMON_H +#define PL_MATH_ATANF_COMMON_H + +#include "math_config.h" +#include "poly_scalar_f32.h" + +/* Polynomial used in fast atanf(x) and atan2f(y,x) implementations + The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline float +eval_poly (float z, float az, float shift) +{ + /* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed the + threshold. */ + float z2 = z * z; + float z4 = z2 * z2; + + /* Then assemble polynomial. */ + float y = fmaf ( + z4, z4 * pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly + 4), + pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly)); + /* Finalize: + y = shift + z * P(z^2). */ + return fmaf (y, z2 * az, az) + shift; +} + +#endif // PL_MATH_ATANF_COMMON_H diff --git a/math/aarch64/experimental/atanf_data.c b/math/aarch64/experimental/atanf_data.c new file mode 100644 index 000000000000..f4d607c2a12d --- /dev/null +++ b/math/aarch64/experimental/atanf_data.c @@ -0,0 +1,17 @@ +/* + * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x).
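A compact sketch (hypothetical names, generic degree-7 case) of the split Estrin evaluation described in this header, where the upper half is multiplied by x^4 first so that x^8 is never materialised and cannot raise a spurious underflow:

#include <math.h>

/* Pairwise degree-3 evaluation: c0 + c1*x + c2*x^2 + c3*x^3.  */
float
pairwise_3 (float x, float x2, const float *c)
{
  return fmaf (x2, fmaf (x, c[3], c[2]), fmaf (x, c[1], c[0]));
}

/* Degree-7 polynomial in two halves: the mul x4 * high keeps x^8 out of
   the final fma, mirroring the mul-plus-fma split used above.  */
float
estrin_split_7 (float x, const float *c)
{
  float x2 = x * x;
  float x4 = x2 * x2;
  return fmaf (x4, x4 * pairwise_3 (x, x2, c + 4), pairwise_3 (x, x2, c));
}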
+ * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. + */ +const struct atanf_poly_data __atanf_poly_data + = { .poly + = { /* See atanf.sollya for details of how these were generated. */ + -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, + 0x1.01fd88p-8f } }; diff --git a/math/aarch64/experimental/atanh_3u.c b/math/aarch64/experimental/atanh_3u.c new file mode 100644 index 000000000000..d01b8bacd46a --- /dev/null +++ b/math/aarch64/experimental/atanh_3u.c @@ -0,0 +1,83 @@ +/* + * Double-precision atanh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "poly_scalar_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 +#define Ln2Hi 0x1.62e42fefa3800p-1 +#define Ln2Lo 0x1.ef35793c76730p-45 +#define OneMHfRt2Top \ + 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ +#define OneTop12 0x3ff +#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ +#define BottomMask 0xffffffff + +static inline double +log1p_inline (double x) +{ + /* Helper for calculating log(1 + x) using order-18 polynomial on a reduced + interval. Copied from log1p_2u.c, with no special-case handling. See that + file for details of the algorithm. */ + double m = x + 1; + uint64_t mi = asuint64 (m); + + /* Decompose x + 1 into (f + 1) * 2^k, with k chosen such that f is in + [sqrt(2)/2, sqrt(2)]. */ + uint32_t u = (mi >> 32) + OneMHfRt2Top; + int32_t k = (int32_t) (u >> 20) - OneTop12; + uint32_t utop = (u & 0x000fffff) + HfRt2Top; + uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); + double f = asdouble (u_red) - 1; + + /* Correction term for round-off in f. */ + double cm = (x - (m - 1)) / m; + + /* Approximate log1p(f) with polynomial. */ + double f2 = f * f; + double f4 = f2 * f2; + double f8 = f4 * f4; + double p = fma ( + f, estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs) * f, f); + + /* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */ + double kd = k; + double y = fma (Ln2Lo, kd, cm); + return y + fma (Ln2Hi, kd, p); +} + +/* Approximation for double-precision inverse tanh(x), using a simplified + version of log1p. Greatest observed error is 3.00 ULP: + atanh(0x1.e58f3c108d714p-4) got 0x1.e7da77672a647p-4 + want 0x1.e7da77672a64ap-4. */ +double +atanh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t sign = ix & ~AbsMask; + uint64_t ia = ix & AbsMask; + + if (unlikely (ia == One)) + return __math_divzero (sign >> 32); + + if (unlikely (ia > One)) + return __math_invalid (x); + + double halfsign = asdouble (Half | sign); + double ax = asdouble (ia); + return halfsign * log1p_inline ((2 * ax) / (1 - ax)); +} + +TEST_SIG (S, D, 1, atanh, -1.0, 1.0) +TEST_ULP (atanh, 3.00) +TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000) +TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000) +TEST_SYM_INTERVAL (atanh, 1, inf, 100) diff --git a/math/aarch64/experimental/atanhf_3u1.c b/math/aarch64/experimental/atanhf_3u1.c new file mode 100644 index 000000000000..c452bab91f97 --- /dev/null +++ b/math/aarch64/experimental/atanhf_3u1.c @@ -0,0 +1,86 @@ +/* + * Single-precision atanh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "mathlib.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Four 0x40800000 +#define Ln2 0x1.62e43p-1f +/* asuint(0x1p-12), below which atanhf(x) rounds to x. */ +#define TinyBound 0x39800000 + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + float p_12 = fmaf (m, C (1), C (0)); + float p_34 = fmaf (m, C (3), C (2)); + float p_56 = fmaf (m, C (5), C (4)); + float p_78 = fmaf (m, C (7), C (6)); + + float m2 = m * m; + float p_02 = fmaf (m2, p_12, m); + float p_36 = fmaf (m2, p_56, p_34); + float p_79 = fmaf (m2, C (8), p_78); + + float m4 = m2 * m2; + float p_06 = fmaf (m4, p_36, p_02); + + return fmaf (m4 * p_79, m4, p_06); +} + +static inline float +log1pf_inline (float x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + float m = x + 1.0f; + int k = (asuint (m) - 0x3f400000) & 0xff800000; + float s = asfloat (Four - k); + float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f); + float p = eval_poly (m_scale); + float scale_back = (float) k * 0x1.0p-23f; + return fmaf (scale_back, Ln2, p); +} + +/* Approximation for single-precision inverse tanh(x), using a simplified + version of log1p. Maximum error is 3.08 ULP: + atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5 + want 0x1.ffb76ep-5. */ +float +atanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax < TinyBound)) + return x; + + if (iax == One) + return __math_divzero (sign); + + if (unlikely (iax > One)) + return __math_invalidf (x); + + float halfsign = asfloat (Half | sign); + float ax = asfloat (iax); + return halfsign * log1pf_inline ((2 * ax) / (1 - ax)); +} + +TEST_SIG (S, F, 1, atanh, -1.0, 1.0) +TEST_ULP (atanhf, 2.59) +TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500) +TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000) +TEST_SYM_INTERVAL (atanhf, 1, inf, 1000) diff --git a/math/aarch64/experimental/cbrt_2u.c b/math/aarch64/experimental/cbrt_2u.c new file mode 100644 index 000000000000..cf31627e43dc --- /dev/null +++ b/math/aarch64/experimental/cbrt_2u.c @@ -0,0 +1,69 @@ +/* + * Double-precision cbrt(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +TEST_SIG (S, D, 1, cbrt, -10.0, 10.0) + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds 0x1.5555555555555p-1 + +#define C(i) __cbrt_data.poly[i] +#define T(i) __cbrt_data.table[i] + +/* Approximation for double-precision cbrt(x), using low-order polynomial and + two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +double +cbrt (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (iax == 0 || iax == 0x7ff0000000000000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. 
+ We can easily decompose x into m and e using frexp. */ + int e; + double m = frexp (asdouble (iax), &e); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point + for Newton iterations. */ + double p_01 = fma (C (1), m, C (0)); + double p_23 = fma (C (3), m, C (2)); + double p = fma (p_23, m * m, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + double m_by_3 = m / 3; + double a = fma (TwoThirds, p, m_by_3 / (p * p)); + a = fma (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexp. */ + return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign); +} + +TEST_ULP (cbrt, 1.30) +TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000) diff --git a/math/aarch64/experimental/cbrt_data.c b/math/aarch64/experimental/cbrt_data.c new file mode 100644 index 000000000000..dabcb6aff2d4 --- /dev/null +++ b/math/aarch64/experimental/cbrt_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for double-precision cbrt(x). + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrt_data __cbrt_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. + See cbrt.sollya for details of generation. */ + 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, 0x1.2c74eaa3ba428p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}}; diff --git a/math/aarch64/experimental/cbrtf_1u5.c b/math/aarch64/experimental/cbrtf_1u5.c new file mode 100644 index 000000000000..5f0288e6d27a --- /dev/null +++ b/math/aarch64/experimental/cbrtf_1u5.c @@ -0,0 +1,66 @@ +/* + * Single-precision cbrt(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "poly_scalar_f32.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffff +#define SignMask 0x80000000 +#define TwoThirds 0x1.555556p-1f + +#define T(i) __cbrtf_data.table[i] + +/* Approximation for single-precision cbrt(x), using low-order polynomial and + one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This + is observed for every value where the mantissa is 0x1.81410e and the + exponent is a multiple of 3, for example: + cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +float +cbrtf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & SignMask; + + if (unlikely (iax == 0 || iax == 0x7f800000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexpf. */ + int e; + float m = frexpf (asfloat (iax), &e); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. 
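The Newton step shared by cbrt and cbrtf is easy to validate in isolation; a small sketch (hypothetical values, double precision) of the update a <- 2a/3 + m/(3a^2), which is Newton's method applied to f(a) = a^3 - m:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double m = 0.7; /* Mantissa in [0.5, 1.0].  */
  double a = 0.9; /* Rough starting approximation for cbrt(m).  */
  for (int i = 0; i < 3; i++)
    a = (2.0 / 3.0) * a + m / (3.0 * a * a); /* One Newton step.  */
  printf ("%a vs %a\n", a, cbrt (m));
  return 0;
}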
*/ + float p = pairwise_poly_3_f32 (m, m * m, __cbrtf_data.poly); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + float m_by_3 = m / 3; + float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexpf. */ + return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign); +} + +TEST_SIG (S, F, 1, cbrt, -10.0, 10.0) +TEST_ULP (cbrtf, 1.03) +TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000) diff --git a/math/aarch64/experimental/cbrtf_data.c b/math/aarch64/experimental/cbrtf_data.c new file mode 100644 index 000000000000..7b5c53f4a606 --- /dev/null +++ b/math/aarch64/experimental/cbrtf_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for single-precision cbrt(x). + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrtf_data __cbrtf_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. + See cbrtf.sollya for details of generation. */ + 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, 0x1.2c74c2p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0}}; diff --git a/math/aarch64/experimental/cosh_2u.c b/math/aarch64/experimental/cosh_2u.c new file mode 100644 index 000000000000..f5bc73b85df8 --- /dev/null +++ b/math/aarch64/experimental/cosh_2u.c @@ -0,0 +1,61 @@ +/* + * Double-precision cosh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" +#include "exp_inline.h" + +#define AbsMask 0x7fffffffffffffff +#define SpecialBound \ + 0x40861da04cbafe44 /* 0x1.61da04cbafe44p+9, above which exp overflows. */ + +static double +specialcase (double x, uint64_t iax) +{ + if (iax == 0x7ff0000000000000) + return INFINITY; + if (iax > 0x7ff0000000000000) + return __math_invalid (x); + /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated + by exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */ + double t = exp_inline (asdouble (iax) / 2, 0); + return (0.5 * t) * t; +} + +/* Approximation for double-precision cosh(x). + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the special region, 1.93 ULP: + cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.03 ULP: + cosh(0x1.502cd8e56ab3bp+0) got 0x1.fe54962842d0ep+0 + want 0x1.fe54962842d0fp+0. */ +double +cosh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + + /* exp overflows a little bit before cosh, so use special-case handler for + the gap, as well as special values. */ + if (unlikely (iax >= SpecialBound)) + return specialcase (x, iax); + + double ax = asdouble (iax); + /* Use double-precision exp helper to calculate exp(x), then: + cosh(x) = exp(|x|) / 2 + 1 / (exp(|x|) * 2).
*/ + double t = exp_inline (ax, 0); + return 0.5 * t + 0.5 / t; +} + +TEST_SIG (S, D, 1, cosh, -10.0, 10.0) +TEST_ULP (cosh, 1.43) +TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) +TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) +TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100) diff --git a/math/aarch64/experimental/coshf_1u9.c b/math/aarch64/experimental/coshf_1u9.c new file mode 100644 index 000000000000..b7e7720a472e --- /dev/null +++ b/math/aarch64/experimental/coshf_1u9.c @@ -0,0 +1,65 @@ +/* + * Single-precision cosh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffff +#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ +/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ +#define SpecialBound 0x42ad496c + +static NOINLINE float +specialcase (float x, uint32_t iax) +{ + if (iax == 0x7f800000) + return INFINITY; + if (iax > 0x7f800000) + return __math_invalidf (x); + if (iax <= TinyBound) + /* For tiny x, avoid underflow by just returning 1. */ + return 1; + /* Otherwise SpecialBound <= |x| < Inf. x is too large to calculate exp(x) + without overflow, so use exp(|x|/2) instead. For large x cosh(x) is + dominated by exp(x), so return: + cosh(x) ~= (exp(|x|/2))^2 / 2. */ + float t = expf (asfloat (iax) / 2); + return (0.5 * t) * t; +} + +/* Approximation for single-precision cosh(x) using exp. + cosh(x) = (exp(x) + exp(-x)) / 2. + The maximum error is 1.89 ULP, observed for |x| > SpecialBound: + coshf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. + The maximum error observed for TinyBound < |x| < SpecialBound is 1.02 ULP: + coshf(0x1.50a3cp+0) got 0x1.ff21dcp+0 want 0x1.ff21dap+0. */ +float +coshf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + float ax = asfloat (iax); + + if (unlikely (iax <= TinyBound || iax >= SpecialBound)) + { + /* x is tiny, large or special. */ + return specialcase (x, iax); + } + + /* Compute cosh using the definition: + coshf(x) = exp(x) / 2 + exp(-x) / 2. */ + float t = expf (ax); + return 0.5f * t + 0.5f / t; +} + +TEST_SIG (S, F, 1, cosh, -10.0, 10.0) +TEST_ULP (coshf, 1.89) +TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100) +TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) +TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) diff --git a/math/aarch64/experimental/erf_2u5.c b/math/aarch64/experimental/erf_2u5.c new file mode 100644 index 000000000000..0bbe3e9548f8 --- /dev/null +++ b/math/aarch64/experimental/erf_2u5.c @@ -0,0 +1,101 @@ +/* + * Double-precision erf(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 +#define Shift 0x1p45 + +/* Polynomial coefficients. */ +#define OneThird 0x1.5555555555555p-2 +#define TwoThird 0x1.5555555555555p-1 + +#define TwoOverFifteen 0x1.1111111111111p-3 +#define TwoOverFive 0x1.999999999999ap-2 +#define Tenth 0x1.999999999999ap-4 + +#define TwoOverNine 0x1.c71c71c71c71cp-3 +#define TwoOverFortyFive 0x1.6c16c16c16c17p-5 +#define Sixth 0x1.555555555555p-3 + +/* Fast erf approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). 
For x near r, + + erf(x) ~ erf(r) + + scale * d * [ + + 1 + - r d + + 1/3 (2 r^2 - 1) d^2 + - 1/6 (r (2 r^2 - 3)) d^3 + + 1/30 (4 r^4 - 12 r^2 + 3) d^4 + - 1/90 (4 r^4 - 20 r^2 + 15) d^5 + ] + + Maximum measured error: 2.29 ULP + erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8 + want -0x1.20dd59132ebafp-8. */ +double +arm_math_erf (double x) +{ + /* Get absolute value and sign. */ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & 0x7fffffffffffffff; + uint64_t sign = ix & ~0x7fffffffffffffff; + + /* |x| < 0x1p-508. Triggers exceptions. */ + if (unlikely (ia < 0x2030000000000000)) + return fma (TwoOverSqrtPiMinusOne, x, x); + + if (ia < 0x4017f80000000000) /* |x| < 6 - 1 / 128 = 5.9921875. */ + { + /* Set r to multiple of 1/128 nearest to |x|. */ + double a = asdouble (ia); + double z = a + Shift; + uint64_t i = asuint64 (z) - asuint64 (Shift); + double r = z - Shift; + /* Lookup erf(r) and scale(r) in table. + Set erf(r) to 0 and scale to 2/sqrt(pi) for |x| <= 0x1.cp-9. */ + double erfr = __v_erf_data.tab[i].erf; + double scale = __v_erf_data.tab[i].scale; + + /* erf(x) ~ erf(r) + scale * d * poly (d, r). */ + double d = a - r; + double r2 = r * r; + double d2 = d * d; + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ + double p1 = -r; + double p2 = fma (TwoThird, r2, -OneThird); + double p3 = -r * fma (OneThird, r2, -0.5); + double p4 = fma (fma (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth); + double p5 + = -r * fma (fma (TwoOverFortyFive, r2, -TwoOverNine), r2, Sixth); + + double p34 = fma (p4, d, p3); + double p12 = fma (p2, d, p1); + double y = fma (p5, d2, p34); + y = fma (y, d2, p12); + + y = fma (fma (y, d2, d), scale, erfr); + return asdouble (asuint64 (y) | sign); + } + + /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */ + if (unlikely (ia >= 0x7ff0000000000000)) + return (1.0 - (double) (sign >> 62)) + 1.0 / x; + + /* Boring domain (|x| >= 6.0). */ + return asdouble (sign | asuint64 (1.0)); +} + +TEST_ULP (arm_math_erf, 1.79) +TEST_SYM_INTERVAL (arm_math_erf, 0, 5.9921875, 40000) +TEST_SYM_INTERVAL (arm_math_erf, 5.9921875, inf, 40000) +TEST_SYM_INTERVAL (arm_math_erf, 0, inf, 40000) diff --git a/math/aarch64/experimental/erfc_1u8.c b/math/aarch64/experimental/erfc_1u8.c new file mode 100644 index 000000000000..5357e9329433 --- /dev/null +++ b/math/aarch64/experimental/erfc_1u8.c @@ -0,0 +1,153 @@ +/* + * Double-precision erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Shift 0x1p45 +#define P20 0x1.5555555555555p-2 /* 1/3. */ +#define P21 0x1.5555555555555p-1 /* 2/3. */ + +#define P40 0x1.999999999999ap-4 /* 1/10. */ +#define P41 0x1.999999999999ap-2 /* 2/5. */ +#define P42 0x1.11111111111111p-3 /* 2/15. */ + +#define P50 0x1.5555555555555p-3 /* 1/6. */ +#define P51 0x1.c71c71c71c71cp-3 /* 2/9. */ +#define P52 0x1.6c16c16c16c17p-5 /* 2/45. */ + +/* Qi = (i+1) / i. */ +#define Q5 0x1.3333333333333p0 +#define Q6 0x1.2aaaaaaaaaaabp0 +#define Q7 0x1.2492492492492p0 +#define Q8 0x1.2p0 +#define Q9 0x1.1c71c71c71c72p0 + +/* Ri = -2 * i / ((i+1)*(i+2)). */ +#define R5 -0x1.e79e79e79e79ep-3 +#define R6 -0x1.b6db6db6db6dbp-3 +#define R7 -0x1.8e38e38e38e39p-3 +#define R8 -0x1.6c16c16c16c17p-3 +#define R9 -0x1.4f2094f2094f2p-3 + +/* Fast erfc approximation based on series expansion near x rounded to + nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5 + + p6(r) d^6 + ... + p10(r) d^10 + + Polynomials p6(r) to p10(r) are computed using recurrence relation + + 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0, + with p0 = 1, and p1(r) = -r. + + Values of erfc(r) and scale(r) are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum measured error: 1.71 ULP + erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608 + want 0x1.e15fcbea3e7adp-608. */ +double +erfc (double x) +{ + /* Get top words and sign. */ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & 0x7fffffffffffffff; + double a = asdouble (ia); + uint64_t sign = ix & ~0x7fffffffffffffff; + + /* erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2. */ + if (unlikely (ia >= 0x7ff0000000000000)) + return asdouble (sign >> 1) + 1.0 / x; /* Special cases. */ + + /* Return early for large enough negative values. */ + if (x < -6.0) + return 2.0; + + /* For |x| < 3487.0/128.0, the following approximation holds. */ + if (likely (ia < 0x403b3e0000000000)) + { + /* |x| < 0x1p-511 => accurate to 0.5 ULP. */ + if (unlikely (ia < asuint64 (0x1p-511))) + return 1.0 - x; + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale + to 2/sqrt(pi), when x reduced to r = 0. */ + double z = a + Shift; + uint64_t i = asuint64 (z) - asuint64 (Shift); + double r = z - Shift; + /* These values are scaled by 2^128. */ + double erfcr = __v_erfc_data.tab[i].erfc; + double scale = __v_erfc_data.tab[i].scale; + + /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */ + double d = a - r; + double d2 = d * d; + double r2 = r * r; + /* Compute p_i as a regular (low-order) polynomial. */ + double p1 = -r; + double p2 = fma (P21, r2, -P20); + double p3 = -r * fma (P20, r2, -0.5); + double p4 = fma (fma (P42, r2, -P41), r2, P40); + double p5 = -r * fma (fma (P52, r2, -P51), r2, P50); + /* Compute p_i using recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ + double p6 = fma (Q5 * r, p5, p4) * R5; + double p7 = fma (Q6 * r, p6, p5) * R6; + double p8 = fma (Q7 * r, p7, p6) * R7; + double p9 = fma (Q8 * r, p8, p7) * R8; + double p10 = fma (Q9 * r, p9, p8) * R9; + /* Compute polynomial in d using pairwise Horner scheme. */ + double p90 = fma (p10, d, p9); + double p78 = fma (p8, d, p7); + double p56 = fma (p6, d, p5); + double p34 = fma (p4, d, p3); + double p12 = fma (p2, d, p1); + double y = fma (p90, d2, p78); + y = fma (y, d2, p56); + y = fma (y, d2, p34); + y = fma (y, d2, p12); + + y = fma (-fma (y, d2, d), scale, erfcr); + + /* Handle sign and scale back in a single fma. */ + double off = asdouble (sign >> 1); + double fac = asdouble (asuint64 (0x1p-128) | sign); + y = fma (y, fac, off); + + if (unlikely (x > 26.0)) + { + /* The underflow exception needs to be signaled explicitly when + result gets into the subnormal range. */ + if (unlikely (y < 0x1p-1022)) + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + /* Set errno to ERANGE if result rounds to 0. */ + return __math_check_uflow (y); + } + + return y; + } + /* Above the threshold (x > 3487.0/128.0) erfc is constant and needs to raise + underflow exception for positive x. 
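The round-to-nearest-1/128 trick shared by the erf and erfc kernels can be demonstrated in a few lines; a standalone sketch, assuming round-to-nearest mode and |a| small enough that a + Shift keeps an ulp of exactly 1/128:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  double a = 2.34567;
  double shift = 0x1p45; /* ulp(2^45) = 2^-7, so adding it rounds to 1/128.  */
  double z = a + shift;
  double r = z - shift; /* a rounded to the nearest multiple of 1/128.  */
  uint64_t zi, si;
  memcpy (&zi, &z, sizeof (zi));
  memcpy (&si, &shift, sizeof (si));
  uint64_t i = zi - si; /* Count of 1/128 steps, i.e. the table index.  */
  printf ("r = %a, i = %llu\n", r, (unsigned long long) i);
  return 0;
}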
*/ + return __math_uflow (0); +} + +TEST_SIG (S, D, 1, erfc, -6.0, 28.0) +TEST_ULP (erfc, 1.21) +TEST_SYM_INTERVAL (erfc, 0, 0x1p-26, 40000) +TEST_INTERVAL (erfc, 0x1p-26, 28.0, 100000) +TEST_INTERVAL (erfc, -0x1p-26, -6.0, 100000) +TEST_INTERVAL (erfc, 28.0, inf, 40000) +TEST_INTERVAL (erfc, -6.0, -inf, 40000) diff --git a/math/aarch64/experimental/erfcf_1u7.c b/math/aarch64/experimental/erfcf_1u7.c new file mode 100644 index 000000000000..e56193c8a103 --- /dev/null +++ b/math/aarch64/experimental/erfcf_1u7.c @@ -0,0 +1,103 @@ +/* + * Single-precision erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Shift 0x1p17f +#define OneThird 0x1.555556p-2f +#define TwoThird 0x1.555556p-1f + +#define TwoOverFifteen 0x1.111112p-3f +#define TwoOverFive 0x1.99999ap-2f +#define Tenth 0x1.99999ap-4f + +#define SignMask 0x7fffffff + +/* Fast erfcf approximation based on series expansion near x rounded to + nearest multiple of 1/64. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). + erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 + want 0x1.f51216p-120. */ +float +erfcf (float x) +{ + /* Get top words and sign. */ + uint32_t ix = asuint (x); + uint32_t ia = ix & SignMask; + uint32_t sign = ix & ~SignMask; + + /* |x| < 0x1.0p-26 => accurate to 0.5 ULP (top12(0x1p-26) = 0x328). */ + if (unlikely (ia < 0x32800000)) + return 1.0f - x; /* Small case. */ + + /* For |x| < 10.0625, the following approximation holds. */ + if (likely (ia < 0x41210000)) + { + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale + to 2/sqrt(pi), when x reduced to r = 0. */ + float a = asfloat (ia); + float z = a + Shift; + uint32_t i = asuint (z) - asuint (Shift); + float r = z - Shift; + + /* These values are scaled by 2^47. */ + float erfcr = __v_erfcf_data.tab[i].erfc; + float scale = __v_erfcf_data.tab[i].scale; + + /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */ + float d = a - r; + float d2 = d * d; + float r2 = r * r; + float p1 = -r; + float p2 = fmaf (TwoThird, r2, -OneThird); + float p3 = -r * fmaf (OneThird, r2, -0.5f); + float p4 = fmaf (fmaf (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth); + float y = fmaf (p4, d, p3); + y = fmaf (y, d, p2); + y = fmaf (y, d, p1); + y = fmaf (-fmaf (y, d2, d), scale, erfcr); + /* Handle sign and scale back in a single fma. */ + float off = asfloat (sign >> 1); + float fac = asfloat (asuint (0x1p-47f) | sign); + y = fmaf (y, fac, off); + /* The underflow exception needs to be signaled explicitly when + result gets into the subnormal range. */ + if (x >= 0x1.2639cp+3f) + force_eval_float (opt_barrier_float (0x1p-123f) * 0x1p-123f); + return y; + } + + /* erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2. */ + if (unlikely (ia >= 0x7f800000)) + return asfloat (sign >> 1) + 1.0f / x; /* Special cases. */ + + /* Above this threshold erfcf is constant and needs to raise underflow + exception for positive x. */ + return sign ?
2.0f : __math_uflowf (0); +} + +TEST_SIG (S, F, 1, erfc, -4.0, 10.0) +TEST_ULP (erfcf, 1.14) +TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000) +TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000) +TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000) +TEST_INTERVAL (erfcf, 10.0625, inf, 40000) +TEST_INTERVAL (erfcf, -4.0, -inf, 40000) diff --git a/math/aarch64/experimental/erff_2u.c b/math/aarch64/experimental/erff_2u.c new file mode 100644 index 000000000000..9487f60dd1e3 --- /dev/null +++ b/math/aarch64/experimental/erff_2u.c @@ -0,0 +1,81 @@ +/* + * Single-precision erf(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f +#define Shift 0x1p16f +#define OneThird 0x1.555556p-2f + +/* Fast erff approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + + scale * d * [ + + 1 + - r d + + 1/3 (2 r^2 - 1) d^2 + - 1/6 (r (2 r^2 - 3) ) d^3 + + 1/30 (4 r^4 - 12 r^2 + 3) d^4 + ] + + This single precision implementation uses only the following terms: + + erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2] + + Values of erf(r) and scale are read from lookup tables. + For |x| > 3.9375, erf(|x|) rounds to 1.0f. + + Maximum error: 1.93 ULP + erff(0x1.c373e6p-9) got 0x1.fd686cp-9 + want 0x1.fd6868p-9. */ +float +arm_math_erff (float x) +{ + /* Get absolute value and sign. */ + uint32_t ix = asuint (x); + uint32_t ia = ix & 0x7fffffff; + uint32_t sign = ix & ~0x7fffffff; + + /* |x| < 0x1p-62. Triggers exceptions. */ + if (unlikely (ia < 0x20800000)) + return fmaf (TwoOverSqrtPiMinusOne, x, x); + + if (ia < 0x407b8000) /* |x| < 4 - 8 / 128 = 3.9375. */ + { + /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale + to 2/sqrt(pi), when x reduced to r = 0. */ + float a = asfloat (ia); + float z = a + Shift; + uint32_t i = asuint (z) - asuint (Shift); + float r = z - Shift; + float erfr = __v_erff_data.tab[i].erf; + float scale = __v_erff_data.tab[i].scale; + + /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */ + float d = a - r; + float d2 = d * d; + float y = -fmaf (OneThird, d, r); + y = fmaf (fmaf (y, d2, d), scale, erfr); + return asfloat (asuint (y) | sign); + } + + /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ + if (unlikely (ia >= 0x7f800000)) + return (1.0f - (float) (sign >> 30)) + 1.0f / x; + + /* Boring domain (|x| >= 4.0). */ + return asfloat (sign | asuint (1.0f)); +} + +TEST_ULP (arm_math_erff, 1.43) +TEST_SYM_INTERVAL (arm_math_erff, 0, 3.9375, 40000) +TEST_SYM_INTERVAL (arm_math_erff, 3.9375, inf, 40000) +TEST_SYM_INTERVAL (arm_math_erff, 0, inf, 40000) diff --git a/math/aarch64/experimental/erfinv_24u5.c b/math/aarch64/experimental/erfinv_24u5.c new file mode 100644 index 000000000000..753f38a79f66 --- /dev/null +++ b/math/aarch64/experimental/erfinv_24u5.c @@ -0,0 +1,85 @@ +/* + * Double-precision inverse error function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "poly_scalar_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +const static struct +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. 
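As a sketch of how these coefficient tables are consumed (helper names hypothetical), each central region evaluates a rational function in t = x^2 - c, with numerator and denominator both evaluated by Horner's rule:

#include <stddef.h>

/* Evaluate p[0] + p[1]*t + ... + p[n-1]*t^(n-1) by Horner's rule.  */
double
horner (double t, const double *p, size_t n)
{
  double y = p[n - 1];
  for (size_t i = n - 1; i-- > 0;)
    y = y * t + p[i];
  return y;
}

/* Central-region shape of erfinv: x * P(t) / Q(t), t = x*x - 0.5625.  */
double
rational_central (double x, const double *p, size_t np, const double *q,
                  size_t nq)
{
  double t = x * x - 0.5625;
  return x * horner (t, p, np) / horner (t, q, nq);
}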
*/ + double P_17[7], Q_17[7], P_37[8], Q_37[8], P_57[9], Q_57[10]; +} data = { + .P_17 = { 0x1.007ce8f01b2e8p+4, -0x1.6b23cc5c6c6d7p+6, 0x1.74e5f6ceb3548p+7, + -0x1.5200bb15cc6bbp+7, 0x1.05d193233a849p+6, -0x1.148c5474ee5e1p+3, + 0x1.689181bbafd0cp-3 }, + .Q_17 = { 0x1.d8fb0f913bd7bp+3, -0x1.6d7f25a3f1c24p+6, 0x1.a450d8e7f4cbbp+7, + -0x1.bc3480485857p+7, 0x1.ae6b0c504ee02p+6, -0x1.499dfec1a7f5fp+4, + 0x1p+0 }, + .P_37 = { -0x1.f3596123109edp-7, 0x1.60b8fe375999ep-2, -0x1.779bb9bef7c0fp+1, + 0x1.786ea384470a2p+3, -0x1.6a7c1453c85d3p+4, 0x1.31f0fc5613142p+4, + -0x1.5ea6c007d4dbbp+2, 0x1.e66f265ce9e5p-3 }, + .Q_37 = { -0x1.636b2dcf4edbep-7, 0x1.0b5411e2acf29p-2, -0x1.3413109467a0bp+1, + 0x1.563e8136c554ap+3, -0x1.7b77aab1dcafbp+4, 0x1.8a3e174e05ddcp+4, + -0x1.4075c56404eecp+3, 0x1p+0 }, + .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2, + 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3, + 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 }, + .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2, + 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3, + 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2, + 0x1p+0 } +}; + +/* Inverse error function approximation, based on rational approximation as + described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7 + Largest observed error is 24.46 ULP, in the extreme tail: + erfinv(0x1.fd9504351b757p-1) got 0x1.ff72c1092917p+0 + want 0x1.ff72c10929158p+0. */ +double +erfinv (double x) +{ + double a = fabs (x); + + if (a <= 0.75) + { + /* Largest observed error in this region is 6.06 ULP: + erfinv(0x1.1884650fd2d41p-2) got 0x1.fb65998cbd3fep-3 + want 0x1.fb65998cbd404p-3. */ + double t = x * x - 0.5625; + return x * horner_6_f64 (t, data.P_17) / horner_6_f64 (t, data.Q_17); + } + + if (a <= 0.9375) + { + /* Largest observed error in this region is 6.95 ULP: + erfinv(0x1.a8d65b94d8c6p-1) got 0x1.f08325591b54p-1 + want 0x1.f08325591b547p-1. */ + double t = x * x - 0.87890625; + return x * horner_7_f64 (t, data.P_37) / horner_7_f64 (t, data.Q_37); + } + + double t = 1.0 / (sqrt (-log (1 - a))); + return horner_8_f64 (t, data.P_57) + / (copysign (t, x) * horner_9_f64 (t, data.Q_57)); +} + +#if USE_MPFR +# warning Not generating tests for erfinv, as MPFR has no suitable reference +#else +TEST_DISABLE_FENV (erfinv) +TEST_SIG (S, D, 1, erfinv, -0.99, 0.99) +TEST_ULP (erfinv, 24.0) +TEST_INTERVAL (erfinv, 0, 1, 40000) +TEST_INTERVAL (erfinv, -0x1p-1022, -1, 40000) +#endif diff --git a/math/aarch64/experimental/erfinvf_4u7.c b/math/aarch64/experimental/erfinvf_4u7.c new file mode 100644 index 000000000000..152994f6336a --- /dev/null +++ b/math/aarch64/experimental/erfinvf_4u7.c @@ -0,0 +1,78 @@ +/* + * Single-precision inverse error function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "poly_scalar_f32.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +const static struct +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. 
*/ + float P_10[3], Q_10[4], P_29[4], Q_29[4], P_50[6], Q_50[3]; +} data = { .P_10 = { -0x1.a31268p+3, 0x1.ac9048p+4, -0x1.293ff6p+3 }, + .Q_10 = { -0x1.8265eep+3, 0x1.ef5eaep+4, -0x1.12665p+4, 0x1p+0 }, + .P_29 + = { -0x1.fc0252p-4, 0x1.119d44p+0, -0x1.f59ee2p+0, 0x1.b13626p-2 }, + .Q_29 = { -0x1.69952p-4, 0x1.c7b7d2p-1, -0x1.167d7p+1, 0x1p+0 }, + .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1, + -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 }, + .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0, 0x1p+0 } }; + +/* Inverse error function approximation, based on rational approximation as + described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7 + Largest error is 4.71 ULP, in the tail region: + erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0 + want 0x1.b83274p+0. */ +float +erfinvf (float x) +{ + if (x == 1.0f) + return __math_oflowf (0); + if (x == -1.0f) + return __math_oflowf (1); + + float a = fabsf (x); + if (a > 1.0f) + return __math_invalidf (x); + + if (a <= 0.75f) + { + /* Greatest error in this region is 4.60 ULP: + erfinvf(0x1.0a98bap-5) got 0x1.d8a93ep-6 + want 0x1.d8a948p-6. */ + float t = x * x - 0.5625f; + return x * horner_2_f32 (t, data.P_10) / horner_3_f32 (t, data.Q_10); + } + if (a < 0.9375f) + { + /* Greatest error in this region is 3.79 ULP: + erfinvf(0x1.ac82d6p-1) got 0x1.f8fc54p-1 + want 0x1.f8fc5cp-1. */ + float t = x * x - 0.87890625f; + return x * horner_3_f32 (t, data.P_29) / horner_3_f32 (t, data.Q_29); + } + + /* Tail region, where error is greatest (and sensitive to sqrt and log1p + implementations). */ + float t = 1.0 / sqrtf (-log1pf (-a)); + return horner_5_f32 (t, data.P_50) + / (copysignf (t, x) * horner_2_f32 (t, data.Q_50)); +} + +#if USE_MPFR +# warning Not generating tests for erfinvf, as MPFR has no suitable reference +#else +TEST_SIG (S, F, 1, erfinv, -0.99, 0.99) +TEST_ULP (erfinvf, 4.09) +TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000) +#endif diff --git a/math/aarch64/experimental/erfinvl.c b/math/aarch64/experimental/erfinvl.c new file mode 100644 index 000000000000..4d91410f1a5c --- /dev/null +++ b/math/aarch64/experimental/erfinvl.c @@ -0,0 +1,114 @@ +/* + * Extended precision inverse error function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define _GNU_SOURCE +#include <math.h> +#include <stdbool.h> +#include <float.h> + +#include "math_config.h" +#include "poly_scalar_f64.h" + +#define SQRT_PIl 0x1.c5bf891b4ef6aa79c3b0520d5db9p0l +#define HF_SQRT_PIl 0x1.c5bf891b4ef6aa79c3b0520d5db9p-1l + +const static struct +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator.
*/ + double P_17[7], Q_17[7], P_37[8], Q_37[8], P_57[9], Q_57[10]; +} data = { + .P_17 = { 0x1.007ce8f01b2e8p+4, -0x1.6b23cc5c6c6d7p+6, 0x1.74e5f6ceb3548p+7, + -0x1.5200bb15cc6bbp+7, 0x1.05d193233a849p+6, -0x1.148c5474ee5e1p+3, + 0x1.689181bbafd0cp-3 }, + .Q_17 = { 0x1.d8fb0f913bd7bp+3, -0x1.6d7f25a3f1c24p+6, 0x1.a450d8e7f4cbbp+7, + -0x1.bc3480485857p+7, 0x1.ae6b0c504ee02p+6, -0x1.499dfec1a7f5fp+4, + 0x1p+0 }, + .P_37 = { -0x1.f3596123109edp-7, 0x1.60b8fe375999ep-2, -0x1.779bb9bef7c0fp+1, + 0x1.786ea384470a2p+3, -0x1.6a7c1453c85d3p+4, 0x1.31f0fc5613142p+4, + -0x1.5ea6c007d4dbbp+2, 0x1.e66f265ce9e5p-3 }, + .Q_37 = { -0x1.636b2dcf4edbep-7, 0x1.0b5411e2acf29p-2, -0x1.3413109467a0bp+1, + 0x1.563e8136c554ap+3, -0x1.7b77aab1dcafbp+4, 0x1.8a3e174e05ddcp+4, + -0x1.4075c56404eecp+3, 0x1p+0 }, + .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2, + 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3, + 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 }, + .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2, + 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3, + 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2, + 0x1p+0 } +}; + +/* Inverse error function approximation, based on rational approximation as + described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. */ +static inline double +__erfinv (double x) +{ + if (x == 1.0) + return __math_oflow (0); + if (x == -1.0) + return __math_oflow (1); + + double a = fabs (x); + if (a > 1) + return __math_invalid (x); + + if (a <= 0.75) + { + double t = x * x - 0.5625; + return x * horner_6_f64 (t, data.P_17) / horner_6_f64 (t, data.Q_17); + } + + if (a <= 0.9375) + { + double t = x * x - 0.87890625; + return x * horner_7_f64 (t, data.P_37) / horner_7_f64 (t, data.Q_37); + } + + double t = 1.0 / (sqrtl (-log1pl (-a))); + return horner_8_f64 (t, data.P_57) + / (copysign (t, x) * horner_9_f64 (t, data.Q_57)); +} + +/* Extended-precision variant, which uses the above (or asymptotic estimate) as + starting point for Newton refinement. This implementation is a port to C of + the version in the SpecialFunctions.jl Julia package, with relaxed stopping + criteria for the Newton refinement. */ +long double +erfinvl (long double x) +{ + if (x == 0) + return 0; + + double yf = __erfinv (x); + long double y; + if (isfinite (yf)) + y = yf; + else + { + /* Double overflowed, use asymptotic estimate instead. */ + y = copysignl (sqrtl (-logl (1.0l - fabsl (x)) * SQRT_PIl), x); + if (!isfinite (y)) + return y; + } + + double eps = fabs (yf - nextafter (yf, 0)); + while (true) + { + long double dy = HF_SQRT_PIl * (erfl (y) - x) * exp (y * y); + y -= dy; + /* Stopping criterion is different to Julia implementation, but is enough + to ensure result is accurate when rounded to double-precision. */ + if (fabsl (dy) < eps) + break; + } + return y; +} diff --git a/math/aarch64/experimental/exp_inline.h b/math/aarch64/experimental/exp_inline.h new file mode 100644 index 000000000000..1a327c1e67d3 --- /dev/null +++ b/math/aarch64/experimental/exp_inline.h @@ -0,0 +1,159 @@ +/* + * Double-precision e^x function. + * + * Copyright (c) 2018-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_EXP_INLINE_H +#define PL_MATH_EXP_INLINE_H + +#include <float.h> +#include <math.h> +#include <stdint.h> +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) +#define InvLn2N __exp_data.invln2N +#define NegLn2hiN __exp_data.negln2hiN +#define NegLn2loN __exp_data.negln2loN +#define Shift __exp_data.shift +#define T __exp_data.tab +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER] +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER] +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER] +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER] +#define C6 __exp_data.poly[9 - EXP_POLY_ORDER] + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +exp_inline_special_case (double_t tmp, uint64_t sbits, uint64_t ki) +{ + double_t scale, y; + + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + y = 0x1p1009 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble (sbits); + y = scale + scale * tmp; + if (y < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t hi, lo; + lo = scale - y + scale * tmp; + hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double (hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + return check_uflow (eval_as_double (y)); +} + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +top12 (double x) +{ + return asuint64 (x) >> 52; +} + +/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + If hastail is 0 then xtail is assumed to be 0 too. */ +static inline double +exp_inline (double x, double xtail) +{ + uint32_t abstop; + uint64_t ki, idx, top, sbits; + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t kd, z, r, r2, scale, tail, tmp; + + abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - top12 (0x1p-54) >= top12 (512.0) - top12 (0x1p-54))) + { + if (abstop - top12 (0x1p-54) >= 0x80000000) + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return WANT_ROUNDING ? 1.0 + x : 1.0; + if (abstop >= top12 (1024.0)) + { + if (asuint64 (x) == asuint64 (-INFINITY)) + return 0.0; + if (abstop >= top12 (INFINITY)) + return 1.0 + x; + if (asuint64 (x) >> 63) + return __math_uflow (0); + else + return __math_oflow (0); + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. 
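A simplified stand-alone sketch of this reduction (constants hypothetical; nearbyint stands in for the shift-based round-to-int used in the helper):

#include <math.h>
#include <stdio.h>

#define TABLE_BITS 7 /* N = 128; the real EXP_TABLE_BITS may differ.  */

int
main (void)
{
  double x = 3.7;
  double n = 1 << TABLE_BITS;
  double k = nearbyint (x * n / M_LN2); /* x = k*ln2/N + r.  */
  double r = x - k * (M_LN2 / n);       /* |r| <= ln2/(2N).  */
  /* exp(x) = 2^(k/N) * exp(r); 2^(k/N) comes from an N-entry table plus
     an exponent adjustment derived from k.  */
  printf ("k = %g, r = %a, check = %a vs %a\n", k, r,
          exp2 (k / n) * exp (r), exp (x));
  return 0;
}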
*/
+  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+  z = InvLn2N * x;
+#if TOINT_INTRINSICS
+  kd = roundtoint (z);
+  ki = converttoint (z);
+#elif EXP_USE_TOINT_NARROW
+  /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes. */
+  kd = eval_as_double (z + Shift);
+  ki = asuint64 (kd) >> 16;
+  kd = (double_t) (int32_t) ki;
+#else
+  /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+  kd = eval_as_double (z + Shift);
+  ki = asuint64 (kd);
+  kd -= Shift;
+#endif
+  r = x + kd * NegLn2hiN + kd * NegLn2loN;
+  /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+  if (!__builtin_constant_p (xtail) || xtail != 0.0)
+    r += xtail;
+  /* 2^(k/N) ~= scale * (1 + tail). */
+  idx = 2 * (ki % N);
+  top = ki << (52 - EXP_TABLE_BITS);
+  tail = asdouble (T[idx]);
+  /* This is only a valid scale when -1023*N < k < 1024*N. */
+  sbits = T[idx + 1] + top;
+  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */
+  /* Evaluation is optimized assuming superscalar pipelined execution. */
+  r2 = r * r;
+  /* Without fma the worst case error is 0.25/N ulp larger. */
+  /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. */
+#if EXP_POLY_ORDER == 4
+  tmp = tail + r + r2 * C2 + r * r2 * (C3 + r * C4);
+#elif EXP_POLY_ORDER == 5
+  tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
+#elif EXP_POLY_ORDER == 6
+  tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6);
+#endif
+  if (unlikely (abstop == 0))
+    return exp_inline_special_case (tmp, sbits, ki);
+  scale = asdouble (sbits);
+  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+     is no spurious underflow here even without fma. */
+  return eval_as_double (scale + scale * tmp);
+}
+
+#endif
diff --git a/math/aarch64/experimental/expf_data.c b/math/aarch64/experimental/expf_data.c
new file mode 100644
index 000000000000..958f705cc676
--- /dev/null
+++ b/math/aarch64/experimental/expf_data.c
@@ -0,0 +1,31 @@
+/*
+ * Coeffs and table entries for single-precision exp. Copied from
+ * math/exp2f_data.c, with EXP2F_TABLE_BITS == 5 (N == 32).
+ *
+ * Copyright (c) 2017-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << EXPF_TABLE_BITS)
+
+const struct expf_data __expf_data = {
+  /* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
+     used for computing 2^(k/N) for an int |k| < 150 N as
+     double(tab[k%N] + (k << 52-BITS)). */
+  .tab = {
+0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
+0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
+0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
+0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
+0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
+0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+  },
+  .invln2_scaled = 0x1.71547652b82fep+0 * N,
+  .poly_scaled = {
+    0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N,
+  },
+};
diff --git a/math/aarch64/experimental/expm1_2u5.c b/math/aarch64/experimental/expm1_2u5.c
new file mode 100644
index 000000000000..a4805e832af3
--- /dev/null
+++ b/math/aarch64/experimental/expm1_2u5.c
@@ -0,0 +1,85 @@
+/*
+ * Double-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f64.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define InvLn2 0x1.71547652b82fep0
+#define Ln2hi 0x1.62e42fefa39efp-1
+#define Ln2lo 0x1.abc9e3b39803fp-56
+#define Shift 0x1.8p52
+/* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
+#define TinyBound 0x3cc0000000000000
+/* Above which expm1(x) overflows. */
+#define BigBound 0x1.63108c75a1937p+9
+/* Below which expm1(x) rounds to -1. */
+#define NegBound -0x1.740bf7c0d927dp+9
+#define AbsMask 0x7fffffffffffffff
+
+/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
+   The maximum observed error is 2.17 ULP:
+   expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2
+                              want 0x1.a9af566038788p-2. */
+double
+expm1 (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t ax = ix & AbsMask;
+
+  /* Tiny, +Infinity. */
+  if (ax <= TinyBound || ix == 0x7ff0000000000000)
+    return x;
+
+  /* +/-NaN. */
+  if (ax > 0x7ff0000000000000)
+    return __math_invalid (x);
+
+  /* Result is too large to be represented as a double. */
+  if (x >= BigBound)
+    return __math_oflow (0);
+
+  /* Result rounds to -1 in double precision. */
+  if (x <= NegBound)
+    return -1;
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer. */
+  double j = fma (InvLn2, x, Shift) - Shift;
+  int64_t i = j;
+  double f = fma (j, -Ln2hi, x);
+  f = fma (j, -Ln2lo, f);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+     x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+  double f2 = f * f;
+  double f4 = f2 * f2;
+  double p = fma (f2, estrin_10_f64 (f, f2, f4, f4 * f4, __expm1_poly), f);
+
+  /* Assemble the result, using a slight rearrangement to achieve acceptable
+     accuracy.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^(i - 1). */
+  double t = ldexp (0.5, i);
+  /* expm1(x) ~= 2 * (p * t + (t - 1/2)). */
+  return 2 * fma (p, t, t - 0.5);
+}
+
+TEST_SIG (S, D, 1, expm1, -9.9, 9.9)
+TEST_ULP (expm1, 1.68)
+TEST_SYM_INTERVAL (expm1, 0, 0x1p-51, 1000)
+TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000)
+TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000)
+TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100)
+TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100)
diff --git a/math/aarch64/experimental/expm1_data.c b/math/aarch64/experimental/expm1_data.c
new file mode 100644
index 000000000000..955895056924
--- /dev/null
+++ b/math/aarch64/experimental/expm1_data.c
@@ -0,0 +1,21 @@
+/*
+ * Coefficients for double-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Generated using fpminimax, see tools/expm1.sollya for details.
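+
+   For reference, expm1(f) = f + f^2/2! + f^3/3! + ... means
+   P(f) ~= 1/2! + f/3! + f^2/4! + ..., and the leading coefficients below
+   are indeed close to 1/2, 1/6, 1/24, ... (the first, 0x1p-1, is exactly
+   0.5); the remaining terms absorb the minimax adjustment.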
*/
+const double __expm1_poly[] = { 0x1p-1,
+                                0x1.5555555555559p-3,
+                                0x1.555555555554bp-5,
+                                0x1.111111110f663p-7,
+                                0x1.6c16c16c1b5f3p-10,
+                                0x1.a01a01affa35dp-13,
+                                0x1.a01a018b4ecbbp-16,
+                                0x1.71ddf82db5bb4p-19,
+                                0x1.27e517fc0d54bp-22,
+                                0x1.af5eedae67435p-26,
+                                0x1.1f143d060a28ap-29 };
diff --git a/math/aarch64/experimental/expm1f_1u6.c b/math/aarch64/experimental/expm1f_1u6.c
new file mode 100644
index 000000000000..03d1e9dc31ef
--- /dev/null
+++ b/math/aarch64/experimental/expm1f_1u6.c
@@ -0,0 +1,79 @@
+/*
+ * Single-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define Shift (0x1.8p23f)
+#define InvLn2 (0x1.715476p+0f)
+#define Ln2hi (0x1.62e4p-1f)
+#define Ln2lo (0x1.7f7d1cp-20f)
+#define AbsMask (0x7fffffff)
+#define InfLimit \
+  (0x1.644716p6) /* Smallest value of x for which expm1(x) overflows. */
+#define NegLimit \
+  (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to -1. */
+
+/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
+   The maximum error is 1.51 ULP:
+   expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2
+                        want 0x1.e2fb94p-2. */
+float
+expm1f (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t ax = ix & AbsMask;
+
+  /* Tiny: |x| < 0x1p-23. expm1(x) is closely approximated by x.
+     Inf: x == +Inf => expm1(x) = x. */
+  if (ax <= 0x34000000 || (ix == 0x7f800000))
+    return x;
+
+  /* +/-NaN. */
+  if (ax > 0x7f800000)
+    return __math_invalidf (x);
+
+  if (x >= InfLimit)
+    return __math_oflowf (0);
+
+  if (x <= NegLimit || ix == 0xff800000)
+    return -1;
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer. */
+  float j = fmaf (InvLn2, x, Shift) - Shift;
+  int32_t i = j;
+  float f = fmaf (j, -Ln2hi, x);
+  f = fmaf (j, -Ln2lo, f);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+     x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+  float p = fmaf (f * f, horner_4_f32 (f, __expm1f_poly), f);
+  /* Assemble the result, using a slight rearrangement to achieve acceptable
+     accuracy.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^(i - 1). */
+  float t = ldexpf (0.5f, i);
+  /* expm1(x) ~= 2 * (p * t + (t - 1/2)). */
+  return 2 * fmaf (p, t, t - 0.5f);
+}
+
+TEST_SIG (S, F, 1, expm1, -9.9, 9.9)
+TEST_ULP (expm1f, 1.02)
+TEST_SYM_INTERVAL (expm1f, 0, 0x1p-23, 1000)
+TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000)
+TEST_INTERVAL (expm1f, 0x1.644716p6, inf, 1000)
+TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000)
+TEST_INTERVAL (expm1f, -0x1.9bbabcp+6, -inf, 1000)
diff --git a/math/aarch64/experimental/expm1f_data.c b/math/aarch64/experimental/expm1f_data.c
new file mode 100644
index 000000000000..92d9189ff503
--- /dev/null
+++ b/math/aarch64/experimental/expm1f_data.c
@@ -0,0 +1,12 @@
+/*
+ * Coefficients for single-precision e^x - 1 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Generated using fpminimax, see tools/expm1f.sollya for details.
*/ +const float __expm1f_poly[] = { 0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5, + 0x1.12287cp-7, 0x1.6b55a2p-10 }; diff --git a/math/aarch64/experimental/log10_2u.c b/math/aarch64/experimental/log10_2u.c new file mode 100644 index 000000000000..84ee1544fe1a --- /dev/null +++ b/math/aarch64/experimental/log10_2u.c @@ -0,0 +1,151 @@ +/* + * Double-precision log10(x) function. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Polynomial coefficients and lookup tables. */ +#define T __log10_data.tab +#define T2 __log10_data.tab2 +#define B __log10_data.poly1 +#define A __log10_data.poly +#define Ln2hi __log10_data.ln2hi +#define Ln2lo __log10_data.ln2lo +#define InvLn10 __log10_data.invln10 +#define N (1 << LOG10_TABLE_BITS) +#define OFF 0x3fe6000000000000 +#define LO asuint64 (1.0 - 0x1p-4) +#define HI asuint64 (1.0 + 0x1.09p-4) + +/* Top 16 bits of a double. */ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} + +/* Fast and low accuracy implementation of log10. + The implementation is similar to that of math/log, except that: + - Polynomials are computed for log10(1+r) with r on same intervals as log. + - Lookup parameters are scaled (at runtime) to switch from base e to + base 10. Many errors above 1.59 ulp are observed across the whole range of + doubles. The greatest observed error is 1.61 ulp, at around 0.965: + log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6 + want -0x1.fee26884905a8p-6. */ +double +log10 (double x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + + if (unlikely (ix - LO < HI - LO)) + { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 + * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); + /* Worst-case error is around 0.507 ULP. */ + w = r * 0x1p27; + double_t rhi = r + w - w; + double_t rlo = r - rhi; + w = rhi * rhi * B[0]; + hi = r + w; + lo = r - hi + w; + lo += B[0] * rlo * (rhi + r); + y += lo; + y += hi; + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = y * InvLn10; + + return eval_as_double (y); + } + if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) + { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log10(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. */ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG10_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift. */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if HAVE_FAST_FMA + /* rounding error: 0x1p-55/N. 
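+
+     With fma, z * invc - 1.0 is computed with a single rounding, which is
+     why this branch carries the smaller error bound than the fallback below.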
*/ + r = fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* w = log(c) + k*Ln2hi. */ + w = kd * Ln2hi + logc; + hi = w + r; + lo = w - hi + r + kd * Ln2lo; + + /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)). */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + + hi; + y = y * InvLn10; + + return eval_as_double (y); +} + +// clang-format off +#if USE_GLIBC_ABI +strong_alias (log10, __log10_finite) +hidden_alias (log10, __ieee754_log10) +#if LDBL_MANT_DIG == 53 +long double +log10l (long double x) +{ + return log10 (x); +} +#endif +#endif +// clang-format on + +TEST_SIG (S, D, 1, log10, 0.01, 11.1) +TEST_ULP (log10, 1.11) +TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000) +TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000) +TEST_INTERVAL (log10, 0, inf, 40000) diff --git a/math/aarch64/experimental/log10_data.c b/math/aarch64/experimental/log10_data.c new file mode 100644 index 000000000000..20b5ef883ed8 --- /dev/null +++ b/math/aarch64/experimental/log10_data.c @@ -0,0 +1,337 @@ +/* + * Data for log10. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << LOG10_TABLE_BITS) + +const struct log10_data __log10_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.invln10 = 0x1.bcb7b1526e50ep-2, +.poly1 = { +#if LOG10_POLY1_ORDER == 12 +// relative error: 0x1.c04d76cp-63 +// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) +-0x1p-1, +0x1.5555555555577p-2, +-0x1.ffffffffffdcbp-3, +0x1.999999995dd0cp-3, +-0x1.55555556745a7p-3, +0x1.24924a344de3p-3, +-0x1.fffffa4423d65p-4, +0x1.c7184282ad6cap-4, +-0x1.999eb43b068ffp-4, +0x1.78182f7afd085p-4, +-0x1.5521375d145cdp-4, +#endif +}, +.poly = { +#if N == 128 && LOG10_POLY_ORDER == 6 +// relative error: 0x1.926199e8p-56 +// abs error: 0x1.882ff33p-65 +// in -0x1.fp-9 0x1.fp-9 +-0x1.0000000000001p-1, +0x1.555555551305bp-2, +-0x1.fffffffeb459p-3, +0x1.999b324f10111p-3, +-0x1.55575e506c89fp-3, +#endif +}, +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p9 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-66 and + 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). + +Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, +2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to +a single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log(x)| < 0x1p-4, this is not enough so that is special cased. 
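+
+For example (illustrative only): x = 10.0 = 2^3 * 1.25 gives k = 3 and
+z = 1.25, which lies in [0x1.6p-1, 0x1.6p0) = [0.6875, 1.375); the
+subinterval containing z supplies invc ~= 1/c and logc ~= log(c) for a c
+near 1.25.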
*/ +.tab = { +#if N == 128 +{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, +{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, +{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, +{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, +{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, +{0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, +{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, +{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, +{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, +{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, +{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, +{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, +{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, +{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, +{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, +{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, +{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, +{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, +{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2}, +{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, +{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, +{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, +{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, +{0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, +{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, +{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, +{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, +{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, +{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, +{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, +{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, +{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, +{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, +{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, +{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, +{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, +{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, +{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, +{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, +{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, +{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, +{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, +{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, +{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, +{0x1.293726014b530p+0, -0x1.31b996b490000p-3}, +{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, +{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, +{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, +{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, +{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, +{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, +{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, +{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, +{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, +{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, +{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, +{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, +{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, +{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, +{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, +{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, +{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, +{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, +{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, +{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, +{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, +{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, +{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, +{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, +{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, +{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, +{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, +{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, +{0x1.06ab58c358f19p+0, 
-0x1.a55f152528000p-6}, +{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, +{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, +{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, +{0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, +{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, +{0x1.008040614b195p+0, -0x1.0040979240000p-9}, +{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, +{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, +{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, +{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, +{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, +{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, +{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, +{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, +{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, +{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, +{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, +{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, +{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, +{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, +{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, +{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, +{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, +{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, +{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, +{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, +{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, +{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, +{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, +{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, +{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, +{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, +{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, +{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, +{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, +{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, +{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, +{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, +{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, +{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, +{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, +{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, +{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, +{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, +{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, +{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, +{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, +{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, +{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, +{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, +{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, +{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, +{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, +{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, +#endif +}, +#if !HAVE_FAST_FMA +.tab2 = { +#if N == 128 +{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, +{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, +{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, +{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, +{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, +{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, +{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, +{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, +{0x1.710000e86978p-1, 0x1.bff6671097952p-56}, +{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, +{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, +{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, +{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, +{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, +{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, +{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, +{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, +{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, +{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, 
+{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, +{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, +{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, +{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, +{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, +{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, +{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, +{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, +{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, +{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, +{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, +{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, +{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, +{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, +{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, +{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, +{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, +{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, +{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, +{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, +{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, +{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, +{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, +{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, +{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, +{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, +{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, +{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, +{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, +{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, +{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, +{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, +{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, +{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, +{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, +{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, +{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, +{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, +{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, +{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, +{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, +{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, +{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, +{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, +{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, +{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, +{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, +{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, +{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, +{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, +{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, +{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, +{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, +{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, +{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, +{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, +{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, +{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, +{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, +{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, +{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, +{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, +{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, +{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, +{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, +{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, +{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, +{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, +{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, +{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, +{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, +{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, +{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, 
+{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, +{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, +{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, +{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, +{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, +{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, +{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, +{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, +{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, +{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, +{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, +{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, +{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, +{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, +{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, +{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, +{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, +{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, +{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, +{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, +{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, +{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, +{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, +{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, +{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, +{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, +{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, +{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, +{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, +{0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, +{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, +{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, +{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, +{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, +{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, +{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, +#endif +}, +#endif /* !HAVE_FAST_FMA. */ +}; diff --git a/math/aarch64/experimental/log1p_2u.c b/math/aarch64/experimental/log1p_2u.c new file mode 100644 index 000000000000..a1ff309ecb5f --- /dev/null +++ b/math/aarch64/experimental/log1p_2u.c @@ -0,0 +1,131 @@ +/* + * Double-precision log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "poly_scalar_f64.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Ln2Hi 0x1.62e42fefa3800p-1 +#define Ln2Lo 0x1.ef35793c76730p-45 +#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ +#define OneMHfRt2Top \ + 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ +#define OneTop12 0x3ff +#define BottomMask 0xffffffff +#define OneMHfRt2 0x3fd2bec333018866 +#define Rt2MOne 0x3fda827999fcef32 +#define AbsMask 0x7fffffffffffffff +#define ExpM63 0x3c00 + +static inline double +eval_poly (double f) +{ + double f2 = f * f; + double f4 = f2 * f2; + double f8 = f4 * f4; + return estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs); +} + +/* log1p approximation using polynomial on reduced interval. Largest + observed errors are near the lower boundary of the region where k + is 0. + Maximum measured error: 1.75ULP. + log1p(-0x1.2e1aea97b3e5cp-2) got -0x1.65fb8659a2f9p-2 + want -0x1.65fb8659a2f92p-2. */ +double +log1p (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint32_t ia16 = ia >> 48; + + /* Handle special cases first. */ + if (unlikely (ia16 >= 0x7ff0 || ix >= 0xbff0000000000000 + || ix == 0x8000000000000000)) + { + if (ix == 0x8000000000000000 || ix == 0x7ff0000000000000) + { + /* x == -0 => log1p(x) = -0. + x == Inf => log1p(x) = Inf. 
*/
+          return x;
+        }
+      if (ix == 0xbff0000000000000)
+        {
+          /* x == -1 => log1p(x) = -Inf. */
+          return __math_divzero (-1);
+        }
+      if (ia16 >= 0x7ff0)
+        {
+          /* x == +/-NaN => log1p(x) = NaN. */
+          return __math_invalid (asdouble (ia));
+        }
+      /* x < -1 => log1p(x) = NaN.
+         x == -Inf => log1p(x) = NaN. */
+      return __math_invalid (x);
+    }
+
+  /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
+     is in [sqrt(2)/2, sqrt(2)]):
+     log1p(x) = k*log(2) + log1p(f).
+
+     f may not be representable exactly, so we need a correction term:
+     let m = round(1 + x), c = (1 + x) - m.
+     c << m: at very small x, log1p(x) ~ x, hence:
+     log(1+x) - log(m) ~ c/m.
+
+     We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
+
+  uint64_t sign = ix & ~AbsMask;
+  if (ia <= OneMHfRt2 || (!sign && ia <= Rt2MOne))
+    {
+      if (unlikely (ia16 <= ExpM63))
+        {
+          /* If exponent of x <= -63 then shortcut the polynomial and avoid
+             underflow by just returning x, which is exactly rounded in this
+             region. */
+          return x;
+        }
+      /* If x is in [sqrt(2)/2 - 1, sqrt(2) - 1] then we can shortcut all the
+         logic below, as k = 0 and f = x and therefore representable exactly.
+         All we need is to return the polynomial. */
+      return fma (x, eval_poly (x) * x, x);
+    }
+
+  /* Obtain correctly scaled k by manipulation in the exponent. */
+  double m = x + 1;
+  uint64_t mi = asuint64 (m);
+  uint32_t u = (mi >> 32) + OneMHfRt2Top;
+  int32_t k = (int32_t) (u >> 20) - OneTop12;
+
+  /* Correction term c/m. */
+  double cm = (x - (m - 1)) / m;
+
+  /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+  uint32_t utop = (u & 0x000fffff) + HfRt2Top;
+  uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask);
+  double f = asdouble (u_red) - 1;
+
+  /* Approximate log1p(x) on the reduced input using a polynomial. Because
+     log1p(0)=0 we choose an approximation of the form:
+     x + C0*x^2 + C1*x^3 + C2x^4 + ...
+     Hence approximation has the form f + f^2 * P(f)
+     where P(x) = C0 + C1*x + C2x^2 + ... */
+  double p = fma (f, eval_poly (f) * f, f);
+
+  double kd = k;
+  double y = fma (Ln2Lo, kd, cm);
+  return y + fma (Ln2Hi, kd, p);
+}
+
+TEST_SIG (S, D, 1, log1p, -0.9, 10.0)
+TEST_ULP (log1p, 1.26)
+TEST_SYM_INTERVAL (log1p, 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (log1p, 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (log1p, 0.001, 1.0, 50000)
+TEST_SYM_INTERVAL (log1p, 1.0, inf, 5000)
diff --git a/math/aarch64/experimental/log1p_data.c b/math/aarch64/experimental/log1p_data.c
new file mode 100644
index 000000000000..91a7196d795f
--- /dev/null
+++ b/math/aarch64/experimental/log1p_data.c
@@ -0,0 +1,20 @@
+/*
+ * Data used in double-precision log(1+x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Polynomial coefficients generated using Remez algorithm, see
+   log1p.sollya for details.
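+
+   For reference, log(1+x) = x - x^2/2 + x^3/3 - x^4/4 + ..., so the leading
+   coefficients below are close to -1/2, 1/3, -1/4, 1/5, ..., with the later
+   terms adjusted by the fit.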
*/ +const struct log1p_data __log1p_data + = { .coeffs + = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6 } }; diff --git a/math/aarch64/experimental/log1pf_2u1.c b/math/aarch64/experimental/log1pf_2u1.c new file mode 100644 index 000000000000..fe4f93865220 --- /dev/null +++ b/math/aarch64/experimental/log1pf_2u1.c @@ -0,0 +1,161 @@ +/* + * Single-precision log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "poly_scalar_f32.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Ln2 (0x1.62e43p-1f) +#define SignMask (0x80000000) + +/* Biased exponent of the largest float m for which m^8 underflows. */ +#define M8UFLOW_BOUND_BEXP 112 +/* Biased exponent of the largest float for which we just return x. */ +#define TINY_BOUND_BEXP 103 + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m, uint32_t e) +{ +#ifdef LOG1PF_2U5 + + /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using + slightly modified Estrin scheme (no x^0 term, and x term is just x). */ + float p_12 = fmaf (m, C (1), C (0)); + float p_34 = fmaf (m, C (3), C (2)); + float p_56 = fmaf (m, C (5), C (4)); + float p_78 = fmaf (m, C (7), C (6)); + + float m2 = m * m; + float p_02 = fmaf (m2, p_12, m); + float p_36 = fmaf (m2, p_56, p_34); + float p_79 = fmaf (m2, C (8), p_78); + + float m4 = m2 * m2; + float p_06 = fmaf (m4, p_36, p_02); + + if (unlikely (e < M8UFLOW_BOUND_BEXP)) + return p_06; + + float m8 = m4 * m4; + return fmaf (m8, p_79, p_06); + +#elif defined(LOG1PF_1U3) + + /* 1.3 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Horner + scheme. Our polynomial approximation for log1p has the form + x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ... + Hence approximation has the form m + m^2 * P(m) + where P(x) = C1 + C2 * x + C3 * x^2 + ... . */ + return fmaf (m, m * horner_8_f32 (m, __log1pf_data.coeffs), m); + +#else +#error No log1pf approximation exists with the requested precision. Options are 13 or 25. +#endif +} + +static inline uint32_t +biased_exponent (uint32_t ix) +{ + return (ix & 0x7f800000) >> 23; +} + +/* log1pf approximation using polynomial on reduced interval. Worst-case error + when using Estrin is roughly 2.02 ULP: + log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ +float +log1pf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & ~SignMask; + uint32_t ia12 = ia >> 20; + uint32_t e = biased_exponent (ix); + + /* Handle special cases first. */ + if (unlikely (ia12 >= 0x7f8 || ix >= 0xbf800000 || ix == 0x80000000 + || e <= TINY_BOUND_BEXP)) + { + if (ix == 0xff800000) + { + /* x == -Inf => log1pf(x) = NaN. */ + return NAN; + } + if ((ix == 0x7f800000 || e <= TINY_BOUND_BEXP) && ia12 <= 0x7f8) + { + /* |x| < TinyBound => log1p(x) = x. + x == Inf => log1pf(x) = Inf. */ + return x; + } + if (ix == 0xbf800000) + { + /* x == -1.0 => log1pf(x) = -Inf. */ + return __math_divzerof (-1); + } + if (ia12 >= 0x7f8) + { + /* x == +/-NaN => log1pf(x) = NaN. */ + return __math_invalidf (asfloat (ia)); + } + /* x < -1.0 => log1pf(x) = NaN. 
*/
+      return __math_invalidf (x);
+    }
+
+  /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+     is in [-0.25, 0.5]):
+     log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+     We approximate log1p(m) with a polynomial, then scale by
+     k*log(2). Instead of doing this directly, we use an intermediate
+     scale factor s = 4*k*log(2) to ensure the scale is representable
+     as a normalised fp32 number. */
+
+  if (ix <= 0x3f000000 || ia <= 0x3e800000)
+    {
+      /* If x is in [-0.25, 0.5] then we can shortcut all the logic
+         below, as k = 0 and m = x. All we need is to return the
+         polynomial. */
+      return eval_poly (x, e);
+    }
+
+  float m = x + 1.0f;
+
+  /* k is used to scale the input. 0x3f400000 is chosen as we are trying to
+     reduce x to the range [-0.25, 0.5]. Inside this range, k is 0.
+     Outside this range, if k is reinterpreted as (NOT CONVERTED TO) float:
+     let k = sign * 2^p      where sign = -1 if x < 0
+                                         1 otherwise
+     and p is a negative integer whose magnitude increases with the
+     magnitude of x. */
+  int k = (asuint (m) - 0x3f400000) & 0xff800000;
+
+  /* By using integer arithmetic, we obtain the necessary scaling by
+     subtracting the unbiased exponent of k from the exponent of x. */
+  float m_scale = asfloat (asuint (x) - k);
+
+  /* Scale up to ensure that the scale factor is representable as normalised
+     fp32 number (s in [2**-126,2**26]), and scale m down accordingly. */
+  float s = asfloat (asuint (4.0f) - k);
+  m_scale = m_scale + fmaf (0.25f, s, -1.0f);
+
+  float p = eval_poly (m_scale, biased_exponent (asuint (m_scale)));
+
+  /* The scale factor to be applied back at the end - by multiplying float(k)
+     by 2^-23 we get the unbiased exponent of k. */
+  float scale_back = (float) k * 0x1.0p-23f;
+
+  /* Apply the scaling back. */
+  return fmaf (scale_back, Ln2, p);
+}
+
+TEST_SIG (S, F, 1, log1p, -0.9, 10.0)
+TEST_ULP (log1pf, 1.52)
+TEST_SYM_INTERVAL (log1pf, 0.0, 0x1p-23, 50000)
+TEST_SYM_INTERVAL (log1pf, 0x1p-23, 0.001, 50000)
+TEST_SYM_INTERVAL (log1pf, 0.001, 1.0, 50000)
+TEST_SYM_INTERVAL (log1pf, 1.0, inf, 5000)
diff --git a/math/aarch64/experimental/log1pf_data.c b/math/aarch64/experimental/log1pf_data.c
new file mode 100644
index 000000000000..e0ac269a1069
--- /dev/null
+++ b/math/aarch64/experimental/log1pf_data.c
@@ -0,0 +1,14 @@
+/*
+ * Data used in single-precision log1p(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+
+/* Polynomial coefficients generated using floating-point minimax
+   algorithm, see tools/log1pf.sollya for details. */
+const struct log1pf_data __log1pf_data
+    = { .coeffs = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+                    -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
+                    0x1.abcb6p-4f, -0x1.6f0d5ep-5f } };
diff --git a/math/aarch64/experimental/sinh_3u.c b/math/aarch64/experimental/sinh_3u.c
new file mode 100644
index 000000000000..39030d2750a9
--- /dev/null
+++ b/math/aarch64/experimental/sinh_3u.c
@@ -0,0 +1,60 @@
+/*
+ * Double-precision sinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "exp_inline.h"
+
+#define AbsMask 0x7fffffffffffffff
+#define Half 0x3fe0000000000000
+/* 0x1.62e42fefa39fp+9, above which using expm1 results in NaN. */
+#define OFlowBound 0x40862e42fefa39f0
+
+/* Approximation for double-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The greatest observed error is 2.57 ULP:
+   sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
+                             want 0x1.ab34e59d678d9p-2. */
+double
+sinh (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint64_t iax = ix & AbsMask;
+  double ax = asdouble (iax);
+  uint64_t sign = ix & ~AbsMask;
+  double halfsign = asdouble (Half | sign);
+
+  if (unlikely (iax >= OFlowBound))
+    {
+      /* Special values and overflow. */
+      if (unlikely (iax > 0x7ff0000000000000))
+        return __math_invalid (x);
+      /* expm1 overflows a little before sinh. We have to fill this
+         gap by using a different algorithm, in this case we use a
+         double-precision exp helper. For large x sinh(x) is dominated
+         by exp(x), however we cannot compute exp without overflow
+         either. We use the identity: exp(a) = (exp(a / 2)) ^ 2
+         to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2   for x > 0
+                            ~= (exp(|x| / 2)) ^ 2 / -2  for x < 0. */
+      double e = exp_inline (ax / 2, 0);
+      return (e * halfsign) * e;
+    }
+
+  /* Use expm1 to retain acceptable precision for small numbers.
+     Let t = e^(|x|) - 1. */
+  double t = expm1 (ax);
+  /* Then sinh(x) = (t + t / (t + 1)) / 2   for x > 0
+                    (t + t / (t + 1)) / -2  for x < 0. */
+  return (t + t / (t + 1)) * halfsign;
+}
+
+TEST_SIG (S, D, 1, sinh, -10.0, 10.0)
+TEST_ULP (sinh, 2.08)
+TEST_SYM_INTERVAL (sinh, 0, 0x1p-51, 100)
+TEST_SYM_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
+TEST_SYM_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000)
diff --git a/math/aarch64/experimental/sinhf_2u3.c b/math/aarch64/experimental/sinhf_2u3.c
new file mode 100644
index 000000000000..860ddc0fc83c
--- /dev/null
+++ b/math/aarch64/experimental/sinhf_2u3.c
@@ -0,0 +1,69 @@
+/*
+ * Single-precision sinh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+/* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f overflows. */
+#define Expm1OFlowLimit 0x42b17218
+/* 0x1.65a9fap+6, minimum positive value for which sinhf should overflow. */
+#define OFlowLimit 0x42b2d4fd
+
+/* Approximation for single-precision sinh(x) using expm1.
+   sinh(x) = (exp(x) - exp(-x)) / 2.
+   The maximum error is 2.26 ULP:
+   sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. */
+float
+sinhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  float ax = asfloat (iax);
+  uint32_t sign = ix & ~AbsMask;
+  float halfsign = asfloat (Half | sign);
+
+  if (unlikely (iax >= Expm1OFlowLimit))
+    {
+      /* Special values and overflow. */
+      if (iax >= 0x7fc00001 || iax == 0x7f800000)
+        return x;
+      if (iax >= 0x7f800000)
+        return __math_invalidf (x);
+      if (iax >= OFlowLimit)
+        return __math_oflowf (sign);
+
+      /* expm1f overflows a little before sinhf (~88.7 vs ~89.4). We have to
+         fill this gap by using a different algorithm, in this case we fall
+         back to the single-precision expf. For large x sinh(x) is dominated
+         by exp(x), however we cannot compute exp without overflow either. We
+         use the identity:
+         exp(a) = (exp(a / 2)) ^ 2.
+         to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2   for x > 0
+                            ~= (exp(|x| / 2)) ^ 2 / -2  for x < 0.
+         Greatest error in this region is 1.89 ULP:
+         sinhf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. */
+      float e = expf (ax / 2);
+      return (e * halfsign) * e;
+    }
+
+  /* Use expm1f to retain acceptable precision for small numbers.
+     Let t = e^(|x|) - 1.
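+
+     Since e^(-|x|) = 1 / (t + 1), we can derive the form used below:
+     sinh(|x|) = (e^|x| - e^-|x|) / 2 = (t + 1 - 1/(t + 1)) / 2
+               = (t + t / (t + 1)) / 2.
+     The same identity justifies the rearrangement in the double-precision
+     routine above.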
*/ + float t = expm1f (ax); + /* Then sinh(x) = (t + t / (t + 1)) / 2 for x > 0 + (t + t / (t + 1)) / -2 for x < 0. */ + return (t + t / (t + 1)) * halfsign; +} + +TEST_SIG (S, F, 1, sinh, -10.0, 10.0) +TEST_ULP (sinhf, 1.76) +TEST_SYM_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000) +TEST_SYM_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100) +TEST_SYM_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100) diff --git a/math/aarch64/experimental/sve/erfinv_25u.c b/math/aarch64/experimental/sve/erfinv_25u.c new file mode 100644 index 000000000000..4de6d08ab80f --- /dev/null +++ b/math/aarch64/experimental/sve/erfinv_25u.c @@ -0,0 +1,156 @@ +/* + * Double-precision inverse error function (SVE variant). + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "sv_math.h" +#include "test_defs.h" +#include "math_config.h" +#include "test_sig.h" +#include "sv_poly_f64.h" +#define SV_LOG_INLINE_POLY_ORDER 4 +#include "sv_log_inline.h" + +const static struct data +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. P is interleaved P_17 and P_37, similar for Q. */ + double P[7][2], Q[7][2]; + double P_57[9], Q_57[9], tailshift, P37_0; + struct sv_log_inline_data log_tbl; +} data = { + .P37_0 = -0x1.f3596123109edp-7, + .tailshift = -0.87890625, + .P = { { 0x1.007ce8f01b2e8p+4, 0x1.60b8fe375999ep-2 }, + { -0x1.6b23cc5c6c6d7p+6, -0x1.779bb9bef7c0fp+1 }, + { 0x1.74e5f6ceb3548p+7, 0x1.786ea384470a2p+3 }, + { -0x1.5200bb15cc6bbp+7, -0x1.6a7c1453c85d3p+4 }, + { 0x1.05d193233a849p+6, 0x1.31f0fc5613142p+4 }, + { -0x1.148c5474ee5e1p+3, -0x1.5ea6c007d4dbbp+2 }, + { 0x1.689181bbafd0cp-3, 0x1.e66f265ce9e5p-3 } }, + .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 }, + { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 }, + { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 }, + { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 }, + { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 }, + { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 }, + { 0x1p+0, -0x1.4075c56404eecp+3 } }, + .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2, + 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3, + 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 }, + .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2, + 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3, + 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2 }, + .log_tbl = SV_LOG_CONSTANTS +}; + +static inline svfloat64_t +special (svbool_t pg, svfloat64_t x, const struct data *d) +{ + /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf. + By using log here, instead of log1p, we return finite values for both + these inputs, and values outside [-1, 1]. This is non-compliant, but is an + acceptable optimisation at Ofast. To get correct behaviour for all finite + values use the log1p_inline helper on -abs(x) - note that erfinv(inf) + will still be finite. 
*/
+  svfloat64_t ax = svabs_x (pg, x);
+  svfloat64_t t
+      = svneg_x (pg, sv_log_inline (pg, svsubr_x (pg, ax, 1), &d->log_tbl));
+  t = svdivr_x (pg, svsqrt_x (pg, t), 1);
+  svuint64_t sign
+      = sveor_x (pg, svreinterpret_u64 (ax), svreinterpret_u64 (x));
+  svfloat64_t ts
+      = svreinterpret_f64 (svorr_x (pg, sign, svreinterpret_u64 (t)));
+
+  svfloat64_t q = svadd_x (pg, t, d->Q_57[8]);
+  for (int i = 7; i >= 0; i--)
+    q = svmad_x (pg, q, t, d->Q_57[i]);
+
+  return svdiv_x (pg, sv_horner_8_f64_x (pg, t, d->P_57), svmul_x (pg, ts, q));
+}
+
+static inline svfloat64_t
+lookup (const double *c, svuint64_t idx)
+{
+  svfloat64_t x = svld1rq_f64 (svptrue_b64 (), c);
+  return svtbl (x, idx);
+}
+
+static inline svfloat64_t
+notails (svbool_t pg, svfloat64_t x, const struct data *d)
+{
+  svfloat64_t t = svmad_x (pg, x, x, -0.5625);
+  svfloat64_t p = svmla_x (pg, sv_f64 (d->P[5][0]), t, d->P[6][0]);
+  svfloat64_t q = svadd_x (pg, t, d->Q[5][0]);
+  for (int i = 4; i >= 0; i--)
+    {
+      p = svmad_x (pg, t, p, d->P[i][0]);
+      q = svmad_x (pg, t, q, d->Q[i][0]);
+    }
+  p = svmul_x (pg, p, x);
+  return svdiv_x (pg, p, q);
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+   error function in double precision. Largest observed error is 24.75 ULP:
+   _ZGVsMxv_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0
+					 want 0x1.ea0547268660cp+0. */
+svfloat64_t SV_NAME_D1 (erfinv) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+  /* Calculate inverse error using algorithm described in
+     J. M. Blair, C. A. Edwards, and J. H. Johnson,
+     "Rational Chebyshev approximations for the inverse of the error function",
+     Math. Comp. 30, pp. 827--830 (1976).
+     https://doi.org/10.1090/S0025-5718-1976-0421040-7.
+
+     Algorithm has 3 intervals:
+     - 'Normal' region [-0.75, 0.75]
+     - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
+     - Extreme tail [-1, -0.9375] U [0.9375, 1]
+     Normal and tail are both rational approximations of similar order on
+     shifted input - these are typically performed in parallel using gather
+     loads to obtain correct coefficients depending on interval. */
+
+  svbool_t no_tail = svacle (pg, x, 0.75);
+  if (unlikely (!svptest_any (pg, svnot_z (pg, no_tail))))
+    return notails (pg, x, d);
+
+  svbool_t is_tail = svnot_z (pg, no_tail);
+  svbool_t extreme_tail = svacgt (pg, x, 0.9375);
+  svuint64_t idx = svdup_n_u64_z (is_tail, 1);
+
+  svfloat64_t t = svsel_f64 (is_tail, sv_f64 (d->tailshift), sv_f64 (-0.5625));
+  t = svmla_x (pg, t, x, x);
+
+  svfloat64_t p = lookup (&d->P[6][0], idx);
+  svfloat64_t q
+      = svmla_x (pg, lookup (&d->Q[6][0], idx), svdup_n_f64_z (is_tail, 1), t);
+  for (int i = 5; i >= 0; i--)
+    {
+      p = svmla_x (pg, lookup (&d->P[i][0], idx), p, t);
+      q = svmla_x (pg, lookup (&d->Q[i][0], idx), q, t);
+    }
+  p = svmad_m (is_tail, p, t, d->P37_0);
+  p = svmul_x (pg, p, x);
+
+  if (likely (svptest_any (pg, extreme_tail)))
+    return svsel (extreme_tail, special (pg, x, d), svdiv_x (pg, p, q));
+  return svdiv_x (pg, p, q);
+}
+
+#if USE_MPFR
+# warning Not generating tests for _ZGVsMxv_erfinv, as MPFR has no suitable reference
+#else
+TEST_SIG (SV, D, 1, erfinv, -0.99, 0.99)
+TEST_ULP (SV_NAME_D1 (erfinv), 24.5)
+TEST_DISABLE_FENV (SV_NAME_D1 (erfinv))
+/* Test with control lane in each interval.
*/
+TEST_SYM_INTERVAL (SV_NAME_D1 (erfinv), 0, 1, 100000)
+TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.95)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/experimental/sve/erfinvf_5u.c b/math/aarch64/experimental/sve/erfinvf_5u.c
new file mode 100644
index 000000000000..2c81c4e0b9a2
--- /dev/null
+++ b/math/aarch64/experimental/sve/erfinvf_5u.c
@@ -0,0 +1,156 @@
+/*
+ * Single-precision inverse error function (SVE variant).
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
+#include "sv_logf_inline.h"
+
+const static struct data
+{
+  /* We use P_N and Q_N to refer to arrays of coefficients, where P_N
+     is the coeffs of the numerator in table N of Blair et al, and
+     Q_N is the coeffs of the denominator. Coefficients stored in
+     interleaved format to support lookup scheme. */
+  float P10_2, P29_3, Q10_2, Q29_2;
+  float P10_0, P29_1, P10_1, P29_2;
+  float Q10_0, Q29_0, Q10_1, Q29_1;
+  float P29_0, P_50[6], Q_50[2], tailshift;
+  struct sv_logf_data logf_tbl;
+} data = { .P10_0 = -0x1.a31268p+3,
+	   .P10_1 = 0x1.ac9048p+4,
+	   .P10_2 = -0x1.293ff6p+3,
+	   .P29_0 = -0x1.fc0252p-4,
+	   .P29_1 = 0x1.119d44p+0,
+	   .P29_2 = -0x1.f59ee2p+0,
+	   .P29_3 = 0x1.b13626p-2,
+	   .Q10_0 = -0x1.8265eep+3,
+	   .Q10_1 = 0x1.ef5eaep+4,
+	   .Q10_2 = -0x1.12665p+4,
+	   .Q29_0 = -0x1.69952p-4,
+	   .Q29_1 = 0x1.c7b7d2p-1,
+	   .Q29_2 = -0x1.167d7p+1,
+	   .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1,
+		     -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 },
+	   .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0 },
+	   .tailshift = -0.87890625,
+	   .logf_tbl = SV_LOGF_CONSTANTS };
+
+static inline svfloat32_t
+special (svbool_t pg, svfloat32_t x, const struct data *d)
+{
+  svfloat32_t ax = svabs_x (pg, x);
+  svfloat32_t t = svdivr_x (
+      pg,
+      svsqrt_x (pg, svneg_x (pg, sv_logf_inline (pg, svsubr_x (pg, ax, 1),
+						 &d->logf_tbl))),
+      1);
+  svuint32_t sign
+      = sveor_x (pg, svreinterpret_u32 (ax), svreinterpret_u32 (x));
+  svfloat32_t ts
+      = svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (t)));
+  svfloat32_t q
+      = svmla_x (pg, sv_f32 (d->Q_50[0]), svadd_x (pg, t, d->Q_50[1]), t);
+  return svdiv_x (pg, sv_horner_5_f32_x (pg, t, d->P_50), svmul_x (pg, ts, q));
+}
+
+static inline svfloat32_t
+notails (svbool_t pg, svfloat32_t x, const struct data *d)
+{
+  /* Shortcut when no input is in a tail region - no need to gather shift or
+     coefficients. */
+  svfloat32_t t = svmad_x (pg, x, x, -0.5625);
+  svfloat32_t q = svadd_x (pg, t, d->Q10_2);
+  q = svmad_x (pg, t, q, d->Q10_1);
+  q = svmad_x (pg, t, q, d->Q10_0);
+
+  svfloat32_t p = svmla_x (pg, sv_f32 (d->P10_1), t, d->P10_2);
+  p = svmad_x (pg, p, t, d->P10_0);
+
+  return svdiv_x (pg, svmul_x (pg, x, p), q);
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+   error function in single-precision. Worst-case error is 4.71 ULP, in the
+   tail region:
+   _ZGVsMxv_erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0
+				   want 0x1.b83274p+0. */
+svfloat32_t SV_NAME_F1 (erfinv) (svfloat32_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Calculate inverse error using algorithm described in
+     J. M. Blair, C. A. Edwards, and J. H. Johnson,
+     "Rational Chebyshev approximations for the inverse of the error function",
+     Math. Comp. 30, pp. 827--830 (1976).
+     https://doi.org/10.1090/S0025-5718-1976-0421040-7.
*/
+
+  /* Algorithm has 3 intervals:
+     - 'Normal' region [-0.75, 0.75]
+     - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
+     - Extreme tail [-1, -0.9375] U [0.9375, 1]
+     Normal and tail are both rational approximations of similar order on
+     shifted input - these are typically performed in parallel using gather
+     loads to obtain correct coefficients depending on interval. */
+  svbool_t is_tail = svacge (pg, x, 0.75);
+  svbool_t extreme_tail = svacge (pg, x, 0.9375);
+
+  if (likely (!svptest_any (pg, is_tail)))
+    return notails (pg, x, d);
+
+  /* Select requisite shift depending on interval: polynomial is evaluated on
+     x * x - shift.
+     Normal shift = 0.5625
+     Tail shift = 0.87890625. */
+  svfloat32_t t = svmla_x (
+      pg, svsel (is_tail, sv_f32 (d->tailshift), sv_f32 (-0.5625)), x, x);
+
+  svuint32_t idx = svdup_u32_z (is_tail, 1);
+  svuint32_t idxhi = svadd_x (pg, idx, 2);
+
+  /* Load coeffs in quadwords and select them according to interval. */
+  svfloat32_t pqhi = svld1rq (svptrue_b32 (), &d->P10_2);
+  svfloat32_t plo = svld1rq (svptrue_b32 (), &d->P10_0);
+  svfloat32_t qlo = svld1rq (svptrue_b32 (), &d->Q10_0);
+
+  svfloat32_t p2 = svtbl (pqhi, idx);
+  svfloat32_t p1 = svtbl (plo, idxhi);
+  svfloat32_t p0 = svtbl (plo, idx);
+  svfloat32_t q0 = svtbl (qlo, idx);
+  svfloat32_t q1 = svtbl (qlo, idxhi);
+  svfloat32_t q2 = svtbl (pqhi, idxhi);
+
+  svfloat32_t p = svmla_x (pg, p1, p2, t);
+  p = svmla_x (pg, p0, p, t);
+  /* Tail polynomial has higher order - merge with normal lanes. */
+  p = svmad_m (is_tail, p, t, d->P29_0);
+  svfloat32_t y = svmul_x (pg, x, p);
+
+  /* Least significant term of both Q polynomials is 1, so no need to generate
+     it. */
+  svfloat32_t q = svadd_x (pg, t, q2);
+  q = svmla_x (pg, q1, q, t);
+  q = svmla_x (pg, q0, q, t);
+
+  if (unlikely (svptest_any (pg, extreme_tail)))
+    return svsel (extreme_tail, special (extreme_tail, x, d),
+		  svdiv_x (pg, y, q));
+  return svdiv_x (pg, y, q);
+}
+
+#if USE_MPFR
+# warning Not generating tests for _ZGVsMxv_erfinvf, as MPFR has no suitable reference
+#else
+TEST_SIG (SV, F, 1, erfinv, -0.99, 0.99)
+TEST_ULP (SV_NAME_F1 (erfinv), 4.09)
+TEST_DISABLE_FENV (SV_NAME_F1 (erfinv))
+TEST_SYM_INTERVAL (SV_NAME_F1 (erfinv), 0, 1, 40000)
+TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.5)
+TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.8)
+TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.95)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/experimental/sve/powi.c b/math/aarch64/experimental/sve/powi.c
new file mode 100644
index 000000000000..62dd1b114970
--- /dev/null
+++ b/math/aarch64/experimental/sve/powi.c
@@ -0,0 +1,49 @@
+/*
+ * Double-precision SVE powi(x, n) function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+
+/* Optimized double-precision vector powi (double base, long integer power).
+   powi is developed for environments in which accuracy is of much less
+   importance than performance, hence we provide no estimate for worst-case
+   error. */
+svfloat64_t
+_ZGVsMxvv_powk (svfloat64_t as, svint64_t ns, svbool_t p)
+{
+  /* Compute powi by successive squaring, right to left. */
+  svfloat64_t acc = sv_f64 (1.0);
+  svbool_t want_recip = svcmplt (p, ns, 0);
+  svuint64_t ns_abs = svreinterpret_u64 (svabs_x (p, ns));
+
+  /* We use a max to avoid needing to check whether any lane != 0 on each
+     iteration.
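+
+     As a worked example, n = 13 (binary 1101) runs the loop below four
+     times: acc is multiplied by c at bits 0, 2 and 3, i.e.
+     x^13 = x * x^4 * x^8, while max_n reaches 0 after the fourth shift.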
*/ + uint64_t max_n = svmaxv (p, ns_abs); + + svfloat64_t c = as; + /* Successively square c, and use merging predication (_m) to determine + whether or not to perform the multiplication or keep the previous + iteration. */ + while (true) + { + svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1ull), 1ull); + acc = svmul_m (px, acc, c); + max_n >>= 1; + if (max_n == 0) + break; + + ns_abs = svlsr_x (p, ns_abs, 1); + c = svmul_x (p, c, c); + } + + /* Negative powers are handled by computing the abs(n) version and then + taking the reciprocal. */ + if (svptest_any (want_recip, want_recip)) + acc = svdivr_m (want_recip, acc, 1.0); + + return acc; +} +CLOSE_SVE_ATTR diff --git a/math/aarch64/experimental/sve/powif.c b/math/aarch64/experimental/sve/powif.c new file mode 100644 index 000000000000..fd74acf12df7 --- /dev/null +++ b/math/aarch64/experimental/sve/powif.c @@ -0,0 +1,49 @@ +/* + * Single-precision SVE powi(x, n) function. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" + +/* Optimized single-precision vector powi (float base, integer power). + powi is developed for environments in which accuracy is of much less + importance than performance, hence we provide no estimate for worst-case + error. */ +svfloat32_t +_ZGVsMxvv_powi (svfloat32_t as, svint32_t ns, svbool_t p) +{ + /* Compute powi by successive squaring, right to left. */ + svfloat32_t acc = sv_f32 (1.f); + svbool_t want_recip = svcmplt (p, ns, 0); + svuint32_t ns_abs = svreinterpret_u32 (svabs_x (p, ns)); + + /* We use a max to avoid needing to check whether any lane != 0 on each + iteration. */ + uint32_t max_n = svmaxv (p, ns_abs); + + svfloat32_t c = as; + /* Successively square c, and use merging predication (_m) to determine + whether or not to perform the multiplication or keep the previous + iteration. */ + while (true) + { + svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1), 1); + acc = svmul_m (px, acc, c); + max_n >>= 1; + if (max_n == 0) + break; + + ns_abs = svlsr_x (p, ns_abs, 1); + c = svmul_x (p, c, c); + } + + /* Negative powers are handled by computing the abs(n) version and then + taking the reciprocal. */ + if (svptest_any (want_recip, want_recip)) + acc = svdivr_m (want_recip, acc, 1.0f); + + return acc; +} +CLOSE_SVE_ATTR diff --git a/math/aarch64/experimental/sve/sv_logf_inline.h b/math/aarch64/experimental/sve/sv_logf_inline.h new file mode 100644 index 000000000000..c317a23f6fc3 --- /dev/null +++ b/math/aarch64/experimental/sve/sv_logf_inline.h @@ -0,0 +1,51 @@ +/* + * Single-precision vector log function - inline version + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" + +struct sv_logf_data +{ + float p1, p3, p5, p6, p0, p2, p4; + float ln2; + uint32_t off, mantissa_mask; +}; + +#define SV_LOGF_CONSTANTS \ + { \ + .p0 = -0x1.ffffc8p-2f, .p1 = 0x1.555d7cp-2f, .p2 = -0x1.00187cp-2f, \ + .p3 = 0x1.961348p-3f, .p4 = -0x1.4f9934p-3f, .p5 = 0x1.5a9aa2p-3f, \ + .p6 = -0x1.3e737cp-3f, .ln2 = 0x1.62e43p-1f, .off = 0x3f2aaaab, \ + .mantissa_mask = 0x007fffff \ + } + +static inline svfloat32_t +sv_logf_inline (svbool_t pg, svfloat32_t x, const struct sv_logf_data *d) +{ + svuint32_t u = svreinterpret_u32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = svsub_x (pg, u, d->off); + svfloat32_t n = svcvt_f32_s32_x ( + pg, svasr_x (pg, svreinterpret_s32_u32 (u), 23)); /* signextend. 
*/
+  u = svand_x (pg, u, d->mantissa_mask);
+  u = svadd_x (pg, u, d->off);
+  svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
+
+  /* y = log(1+r) + n*ln2.  */
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))).  */
+  svfloat32_t p1356 = svld1rq_f32 (svptrue_b32 (), &d->p1);
+  svfloat32_t p = svmla_lane (sv_f32 (d->p4), r, p1356, 2);
+  svfloat32_t q = svmla_lane (sv_f32 (d->p2), r, p1356, 1);
+  svfloat32_t y = svmla_lane (sv_f32 (d->p0), r, p1356, 0);
+  p = svmla_lane (p, r2, p1356, 3);
+  q = svmla_x (pg, q, p, r2);
+  y = svmla_x (pg, y, q, r2);
+  p = svmla_x (pg, r, n, d->ln2);
+
+  return svmla_x (pg, p, y, r2);
+}
diff --git a/math/aarch64/experimental/tanf_3u3.c b/math/aarch64/experimental/tanf_3u3.c
new file mode 100644
index 000000000000..c26e92db588f
--- /dev/null
+++ b/math/aarch64/experimental/tanf_3u3.c
@@ -0,0 +1,185 @@
+/*
+ * Single-precision scalar tan(x) function.
+ *
+ * Copyright (c) 2021-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f32.h"
+
+/* Useful constants.  */
+#define NegPio2_1 (-0x1.921fb6p+0f)
+#define NegPio2_2 (0x1.777a5cp-25f)
+#define NegPio2_3 (0x1.ee59dap-50f)
+/* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps.  */
+#define RangeVal (0x1p17f)
+#define InvPio2 ((0x1.45f306p-1f))
+#define Shift (0x1.8p+23f)
+#define AbsMask (0x7fffffff)
+#define Pio4 (0x1.921fb6p-1)
+/* 2PI * 2^-64.  */
+#define Pio2p63 (0x1.921FB54442D18p-62)
+
+static inline float
+eval_P (float z)
+{
+  return pw_horner_5_f32 (z, z * z, __tanf_poly_data.poly_tan);
+}
+
+static inline float
+eval_Q (float z)
+{
+  return pairwise_poly_3_f32 (z, z * z, __tanf_poly_data.poly_cotan);
+}
+
+/* Reduction of the input argument x using the Cody-Waite approach, such that
+   x = r + n * pi/2, with r in [-pi/4, pi/4] and n a signed integer.  */
+static inline float
+reduce (float x, int32_t *in)
+{
+  /* n = rint(x/(pi/2)).  */
+  float r = x;
+  float q = fmaf (InvPio2, r, Shift);
+  float n = q - Shift;
+  /* There is no rounding here, n is representable by a signed integer.  */
+  *in = (int32_t) n;
+  /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4).  */
+  r = fmaf (NegPio2_1, n, r);
+  r = fmaf (NegPio2_2, n, r);
+  r = fmaf (NegPio2_3, n, r);
+  return r;
+}
+
+/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
+   XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
+   Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
+   Reduction uses a table of 4/PI with 192 bits of precision.  A 32x96->128 bit
+   multiply computes the exact 2.62-bit fixed-point modulo.  Since the result
+   can have at most 29 leading zeros after the binary point, the double
+   precision result is accurate to 33 bits.  */
+static inline double
+reduce_large (uint32_t xi, int *np)
+{
+  const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15];
+  int shift = (xi >> 23) & 7;
+  uint64_t n, res0, res1, res2;
+
+  xi = (xi & 0xffffff) | 0x800000;
+  xi <<= shift;
+
+  res0 = xi * arr[0];
+  res1 = (uint64_t) xi * arr[4];
+  res2 = (uint64_t) xi * arr[8];
+  res0 = (res2 >> 32) | (res0 << 32);
+  res0 += res1;
+
+  n = (res0 + (1ULL << 61)) >> 62;
+  res0 -= n << 62;
+  double x = (int64_t) res0;
+  *np = n;
+  return x * Pio2p63;
+}
+
+/* Top 12 bits of the float representation (sign and exponent bits); all
+   callers pass values with the sign bit already cleared.
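+   For instance, top12 (Pio4) = 0x3f4 and top12 (RangeVal) = 0x480, so the
+   interval dispatch below reduces to integer comparisons on the exponent
+   field.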
*/ +static inline uint32_t +top12 (float x) +{ + return (asuint (x) >> 20); +} + +/* Fast single-precision tan implementation. + Maximum ULP error: 3.293ulps. + tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */ +float +tanf (float x) +{ + /* Get top words. */ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + + /* Dispatch between no reduction (small numbers), fast reduction and + slow large numbers reduction. The reduction step determines r float + (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */ + int32_t n; + float r; + if (ia12 < top12 (Pio4)) + { + /* Optimize small values. */ + if (unlikely (ia12 < top12 (0x1p-12f))) + { + if (unlikely (ia12 < top12 (0x1p-126f))) + /* Force underflow for tiny x. */ + force_eval_float (x * x); + return x; + } + + /* tan (x) ~= x + x^3 * P(x^2). */ + float x2 = x * x; + float y = eval_P (x2); + return fmaf (x2, x * y, x); + } + /* Similar to other trigonometric routines, fast inaccurate reduction is + performed for values of x from pi/4 up to RangeVal. In order to keep + errors below 3.5ulps, we set the value of RangeVal to 2^17. This might + differ for other trigonometric routines. Above this value more advanced + but slower reduction techniques need to be implemented to reach a similar + accuracy. */ + else if (ia12 < top12 (RangeVal)) + { + /* Fast inaccurate reduction. */ + r = reduce (x, &n); + } + else if (ia12 < 0x7f8) + { + /* Slow accurate reduction. */ + uint32_t sign = ix & ~AbsMask; + double dar = reduce_large (ia, &n); + float ar = (float) dar; + r = asfloat (asuint (ar) ^ sign); + } + else + { + /* tan(Inf or NaN) is NaN. */ + return __math_invalidf (x); + } + + /* If x lives in an interval where |tan(x)| + - is finite then use an approximation of tangent in the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use an approximation of cotangent in the form + cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early. + Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r), + we only need to change the sign of r to obtain tan(x) from cotan(r). + This 2-interval approach requires 2 different sets of coefficients P and + Q, where Q is a lower order polynomial than P. */ + + /* Determine if x lives in an interval where |tan(x)| grows to infinity. */ + uint32_t alt = (uint32_t) n & 1; + + /* Perform additional reduction if required. */ + float z = alt ? -r : r; + + /* Prepare backward transformation. */ + float z2 = r * r; + float offset = alt ? 1.0f / z : z; + float scale = alt ? z : z * z2; + + /* Evaluate polynomial approximation of tan or cotan. */ + float p = alt ? eval_Q (z2) : eval_P (z2); + + /* A unified way of assembling the result on both interval types. */ + return fmaf (scale, p, offset); +} + +TEST_SIG (S, F, 1, tan, -3.1, 3.1) +TEST_ULP (tanf, 2.80) +TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) +TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000) +TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000) +TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000) +TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) +TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000) diff --git a/math/aarch64/experimental/tanf_data.c b/math/aarch64/experimental/tanf_data.c new file mode 100644 index 000000000000..f310cd77d4ec --- /dev/null +++ b/math/aarch64/experimental/tanf_data.c @@ -0,0 +1,45 @@ +/* + * Data used in single-precision tan(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct tanf_poly_data __tanf_poly_data = { +.poly_tan = { +/* Coefficients generated using: + poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a*a;b*b]); + optimize relative error + final prec : 23 bits + deg : 5 + a : 0x1p-126 ^ 2 + b : ((pi) / 0x1p2) ^ 2 + dirty rel error: 0x1.f7c2e4p-25 + dirty abs error: 0x1.f7c2ecp-25. */ +0x1.55555p-2, +0x1.11166p-3, +0x1.b88a78p-5, +0x1.7b5756p-6, +0x1.4ef4cep-8, +0x1.0e1e74p-7 +}, +.poly_cotan = { +/* Coefficients generated using: + fpminimax(f(x) = (0x1p0 / tan(sqrt(x)) - 0x1p0 / sqrt(x)) / sqrt(x), deg, [|dtype ...|], [a;b]) + optimize a single polynomial + optimize absolute error + final prec : 23 bits + working prec : 128 bits + deg : 3 + a : 0x1p-126 + b : (pi) / 0x1p2 + dirty rel error : 0x1.81298cp-25 + dirty abs error : 0x1.a8acf4p-25. */ +-0x1.55555p-2, /* -0.33333325. */ +-0x1.6c23e4p-6, /* -2.2225354e-2. */ +-0x1.12dbap-9, /* -2.0969994e-3. */ +-0x1.05a1c2p-12, /* -2.495116e-4. */ +} +}; diff --git a/math/aarch64/experimental/tanh_3u.c b/math/aarch64/experimental/tanh_3u.c new file mode 100644 index 000000000000..838b6c4f12c1 --- /dev/null +++ b/math/aarch64/experimental/tanh_3u.c @@ -0,0 +1,80 @@ +/* + * Double-precision tanh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "poly_scalar_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffffffffffff +#define InvLn2 0x1.71547652b82fep0 +#define Ln2hi 0x1.62e42fefa39efp-1 +#define Ln2lo 0x1.abc9e3b39803fp-56 +#define Shift 0x1.8p52 + +/* asuint64 (0x1.241bf835f9d5fp+4). */ +#define BoringBound 0x403241bf835f9d5f +/* asuint64 (0x1p-27). */ +#define TinyBound 0x3e40000000000000 +#define One 0x3ff0000000000000 + +static inline double +expm1_inline (double x) +{ + /* Helper routine for calculating exp(x) - 1. Copied from expm1_2u5.c, with + several simplifications: + - No special-case handling for tiny or special values. + - Simpler combination of p and t in final stage of the algorithm. + - Use shift-and-add instead of ldexp to calculate t. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + double j = fma (InvLn2, x, Shift) - Shift; + int64_t i = j; + double f = fma (j, -Ln2hi, x); + f = fma (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. */ + double f2 = f * f; + double f4 = f2 * f2; + double p = fma (f2, estrin_10_f64 (f, f2, f4, f4 * f4, __expm1_poly), f); + + /* t = 2 ^ i. */ + double t = asdouble ((uint64_t) (i + 1023) << 52); + /* expm1(x) = p * t + (t - 1). */ + return fma (p, t, t - 1); +} + +/* Approximation for double-precision tanh(x), using a simplified version of + expm1. The greatest observed error is 2.77 ULP: + tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 + want -0x1.bd6a21a163624p-3. */ +double +tanh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (ia > BoringBound)) + { + if (ia > 0x7ff0000000000000) + return __math_invalid (x); + return asdouble (One | sign); + } + + if (unlikely (ia < TinyBound)) + return x; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
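+     Substituting q = expm1(2x) = e^2x - 1 gives tanh(x) = q / (q + 2),
+     which is the form computed below.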
*/
+  double q = expm1_inline (2 * x);
+  return q / (q + 2);
+}
+
+TEST_SIG (S, D, 1, tanh, -10.0, 10.0)
+TEST_ULP (tanh, 2.27)
+TEST_SYM_INTERVAL (tanh, 0, TinyBound, 1000)
+TEST_SYM_INTERVAL (tanh, TinyBound, BoringBound, 100000)
+TEST_SYM_INTERVAL (tanh, BoringBound, inf, 1000)
diff --git a/math/aarch64/experimental/tanhf_2u6.c b/math/aarch64/experimental/tanhf_2u6.c
new file mode 100644
index 000000000000..d9adae5c3a76
--- /dev/null
+++ b/math/aarch64/experimental/tanhf_2u6.c
@@ -0,0 +1,87 @@
+/*
+ * Single-precision tanh(x) function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative).  */
+#define BoringBound 0x41102cb3
+#define AbsMask 0x7fffffff
+#define One 0x3f800000
+
+#define Shift (0x1.8p23f)
+#define InvLn2 (0x1.715476p+0f)
+#define Ln2hi (0x1.62e4p-1f)
+#define Ln2lo (0x1.7f7d1cp-20f)
+
+#define C(i) __expm1f_poly[i]
+
+static inline float
+expm1f_inline (float x)
+{
+  /* Helper routine for calculating exp(x) - 1.
+     Copied from expm1f_1u6.c, with several simplifications:
+     - No special-case handling for tiny or special values, instead return
+       early from the main routine.
+     - No special handling for large values:
+       - No early return for infinity.
+       - Simpler combination of p and t in final stage of algorithm.
+       - |i| < 27, so can calculate t by simpler shift-and-add, instead of
+         ldexpf (same as vector algorithm).  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  float j = fmaf (InvLn2, x, Shift) - Shift;
+  int32_t i = j;
+  float f = fmaf (j, -Ln2hi, x);
+  f = fmaf (j, -Ln2lo, f);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+     Uses Estrin scheme, where the main expm1f routine uses Horner.  */
+  float f2 = f * f;
+  float p_01 = fmaf (f, C (1), C (0));
+  float p_23 = fmaf (f, C (3), C (2));
+  float p = fmaf (f2, p_23, p_01);
+  p = fmaf (f2 * f2, C (4), p);
+  p = fmaf (f2, p, f);
+
+  /* t = 2^i.  */
+  float t = asfloat ((uint32_t) (i + 127) << 23);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return fmaf (p, t, t - 1);
+}
+
+/* Approximation for single-precision tanh(x), using a simplified version of
+   expm1f.  The maximum error is 2.58 ULP:
+   tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
+		       want 0x1.f9ba08p-5.  */
+float
+tanhf (float x)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iax = ix & AbsMask;
+  uint32_t sign = ix & ~AbsMask;
+
+  if (unlikely (iax > BoringBound))
+    {
+      if (iax > 0x7f800000)
+	return __math_invalidf (x);
+      return asfloat (One | sign);
+    }
+
+  if (unlikely (iax < 0x34000000))
+    return x;
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  float q = expm1f_inline (2 * x);
+  return q / (q + 2);
+}
+
+TEST_SIG (S, F, 1, tanh, -10.0, 10.0)
+TEST_ULP (tanhf, 2.09)
+TEST_SYM_INTERVAL (tanhf, 0, 0x1p-23, 1000)
+TEST_SYM_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000)
+TEST_SYM_INTERVAL (tanhf, 0x1.205966p+3, inf, 100)
diff --git a/math/aarch64/sincospi_4u.c b/math/aarch64/sincospi_4u.c
new file mode 100644
index 000000000000..2a944bed23e1
--- /dev/null
+++ b/math/aarch64/sincospi_4u.c
@@ -0,0 +1,158 @@
+/*
+ * Double-precision scalar sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f64.h"
+
+/* Taylor series coefficients for sin(pi * x).
+   C2 coefficient (originally ~=5.16771278) has been split into two parts:
+   C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278)
+   This change in magnitude reduces floating point rounding errors.
+   C2_hi is then reintroduced after the polynomial approximation.  */
+const static struct sincospi_data
+{
+  double poly[10];
+} sincospi_data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
+	    -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+	    0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16,
+	    0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 },
+};
+
+/* Top 12 bits of a double, with the sign bit cleared.  */
+static inline uint64_t
+abstop12 (double x)
+{
+  return (asuint64 (x) >> 52) & 0x7ff;
+}
+
+/* Triages special cases into 4 categories:
+     -1 or +1 if iy represents half an integer
+       -1 if round(y) is odd.
+       +1 if round(y) is even.
+     -2 or +2 if iy represents an integer.
+       -2 if iy is odd.
+       +2 if iy is even.
+   The argument is the bit representation of a positive non-zero
+   finite floating-point value which is either a half or an integer.  */
+static inline int
+checkint (uint64_t iy)
+{
+  int e = iy >> 52;
+  if (e > 0x3ff + 52)
+    return 2;
+  if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
+    {
+      if ((iy - 1) & 2)
+	return -1;
+      else
+	return 1;
+    }
+  if (iy & (1ULL << (0x3ff + 52 - e)))
+    return -2;
+  return 2;
+}
+
+/* Approximation for scalar double-precision sincospi(x).
+   Maximum error for sin: 3.46 ULP:
+   sincospi_sin(0x1.3d8a067cd8961p+14) got 0x1.ffe609a279008p-1
+				       want 0x1.ffe609a27900cp-1.
+   Maximum error for cos: 3.66 ULP:
+   sincospi_cos(0x1.a0ec6997557eep-24) got 0x1.ffffffffffe59p-1
+				       want 0x1.ffffffffffe5dp-1.  */
+void
+arm_math_sincospi (double x, double *out_sin, double *out_cos)
+{
+  const struct sincospi_data *d = ptr_barrier (&sincospi_data);
+  uint64_t sign = asuint64 (x) & 0x8000000000000000;
+
+  if (likely (abstop12 (x) < abstop12 (0x1p51)))
+    {
+      /* ar_s = x - rint(x) (range reduction into -1/2 .. 1/2).  */
+      double ar_s = x - rint (x);
+
+      /* We know that cospi(x) = sinpi(0.5 - x)
+	 range reduction and offset into sinpi range -1/2 .. 1/2
+	 ar_c = 0.5 - |x - rint(x)|.  */
+      double ar_c = 0.5 - fabs (ar_s);
+
+      /* ss = sin(pi * ax).  */
+      double ar2_s = ar_s * ar_s;
+      double ar2_c = ar_c * ar_c;
+      double ar4_s = ar2_s * ar2_s;
+      double ar4_c = ar2_c * ar2_c;
+
+      uint64_t cc_sign = ((uint64_t) llrint (x)) << 63;
+      uint64_t ss_sign = cc_sign;
+      if (ar_s == 0)
+	ss_sign = sign;
+
+      double ss = pw_horner_9_f64 (ar2_s, ar4_s, d->poly);
+      double cc = pw_horner_9_f64 (ar2_c, ar4_c, d->poly);
+
+      /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will
+	 always be positive; the sign must therefore be introduced based upon
+	 whether x rounds to odd or even.  For sin(x) the sign is copied from
+	 x.  */
+      *out_sin
+	  = asdouble (asuint64 (fma (-4 * ar2_s, ar_s, ss * ar_s)) ^ ss_sign);
+      *out_cos
+	  = asdouble (asuint64 (fma (-4 * ar2_c, ar_c, cc * ar_c)) ^ cc_sign);
+    }
+  else
+    {
+      /* When abs(x) > 0x1p51, x will be either
+	 - Half integer (relevant if abs(x) in [0x1p51, 0x1p52])
+	 - Odd integer (relevant if abs(x) in [0x1p52, 0x1p53])
+	 - Even integer (relevant if abs(x) in [0x1p53, inf])
+	 - Inf or NaN.  */
+      if (abstop12 (x) >= 0x7ff)
+	{
+	  double inv_result = __math_invalid (x);
+	  *out_sin = inv_result;
+	  *out_cos = inv_result;
+	  return;
+	}
+      else
+	{
+	  uint64_t ax = asuint64 (x) & 0x7fffffffffffffff;
+	  int m = checkint (ax);
+	  /* The case where ax is a half integer.  */
+	  if (m & 1)
+	    {
+	      *out_sin = sign ? -m : m;
+	      *out_cos = 0;
+	      return;
+	    }
+	  /* The case where ax is an integer.  */
+	  else
+	    {
+	      *out_sin = asdouble (sign);
+	      *out_cos = m >> 1;
+	      return;
+	    }
+	}
+    }
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (arm_math_sincospi_sin)
+TEST_DISABLE_FENV (arm_math_sincospi_cos)
+TEST_ULP (arm_math_sincospi_sin, 2.96)
+TEST_ULP (arm_math_sincospi_cos, 3.16)
+# define SINCOS_INTERVAL(lo, hi, n)                                           \
+  TEST_SYM_INTERVAL (arm_math_sincospi_sin, lo, hi, n)                        \
+  TEST_SYM_INTERVAL (arm_math_sincospi_cos, lo, hi, n)
SINCOS_INTERVAL (0, 0x1p-63, 10000)
SINCOS_INTERVAL (0x1p-63, 0.5, 50000)
SINCOS_INTERVAL (0.5, 0x1p51, 50000)
SINCOS_INTERVAL (0x1p51, inf, 10000)
+#endif
diff --git a/math/aarch64/sincospif_3u2.c b/math/aarch64/sincospif_3u2.c
new file mode 100644
index 000000000000..b79694d2ac65
--- /dev/null
+++ b/math/aarch64/sincospif_3u2.c
@@ -0,0 +1,145 @@
+/*
+ * Single-precision scalar sincospi function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f32.h"
+
+/* Taylor series coefficients for sin(pi * x).  */
+const static struct sincospif_data
+{
+  float poly[6];
+} sincospif_data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+	    0x1.50783p-4f, -0x1.e30750p-8f },
+};
+
+/* Top 12 bits of the float representation with the sign bit cleared.  */
+static inline uint32_t
+abstop12 (float x)
+{
+  return (asuint (x) >> 20) & 0x7ff;
+}
+
+/* Triages special cases into 4 categories:
+     -1 or +1 if iy represents half an integer
+       -1 if round(y) is odd.
+       +1 if round(y) is even.
+     -2 or +2 if iy represents an integer.
+       -2 if iy is odd.
+       +2 if iy is even.
+   The argument is the bit representation of a positive non-zero
+   finite floating-point value which is either a half or an integer.  */
+static inline int
+checkint (uint32_t iy)
+{
+  int e = iy >> 23;
+  if (e > 0x7f + 23)
+    return 2;
+  if (iy & ((1 << (0x7f + 23 - e)) - 1))
+    {
+      if ((iy - 1) & 2)
+	return -1;
+      else
+	return 1;
+    }
+  if (iy & (1 << (0x7f + 23 - e)))
+    return -2;
+  return 2;
+}
+
+/* Approximation for scalar single-precision sincospif(x).
+   Maximum error for sin: 3.04 ULP:
+   sincospif_sin(0x1.c597ccp-2) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.
+   Maximum error for cos: 3.18 ULP:
+   sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1.  */
+void
+arm_math_sincospif (float x, float *out_sin, float *out_cos)
+{
+
+  const struct sincospif_data *d = ptr_barrier (&sincospif_data);
+  uint32_t sign = asuint (x) & 0x80000000;
+
+  /* abs(x) in [0, 0x1p22].  */
+  if (likely (abstop12 (x) < abstop12 (0x1p22)))
+    {
+      /* ar_s = x - n (range reduction into -1/2 .. 1/2).  */
+      float ar_s = x - rintf (x);
+      /* We know that cospi(x) = sinpi(0.5 - x)
+	 range reduction and offset into sinpi range -1/2 .. 1/2
+	 ar_c = 0.5 - |x - n|.  */
+      float ar_c = 0.5f - fabsf (ar_s);
+
+      float ar2_s = ar_s * ar_s;
+      float ar2_c = ar_c * ar_c;
+      float ar4_s = ar2_s * ar2_s;
+      float ar4_c = ar2_c * ar2_c;
+
+      uint32_t cc_sign = lrintf (x) << 31;
+      uint32_t ss_sign = cc_sign;
+      if (ar_s == 0)
+	ss_sign = sign;
+
+      /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will
+	 always be positive; the sign must therefore be introduced based upon
+	 whether x rounds to odd or even.  For sin(x) the sign is copied from
+	 x.
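+	 For example (illustrative): x = 1.25 rounds to the odd integer 1, so
+	 cc_sign = ss_sign = 1 << 31; with ar_s = 0.25 and ar_c = 0.25 both
+	 polynomials evaluate to +sqrt(2)/2 before the sign is applied,
+	 giving sinpi(1.25) = cospi(1.25) = -sqrt(2)/2.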
*/
+      *out_sin = pw_horner_5_f32 (ar2_s, ar4_s, d->poly)
+		 * asfloat (asuint (ar_s) ^ ss_sign);
+      *out_cos = pw_horner_5_f32 (ar2_c, ar4_c, d->poly)
+		 * asfloat (asuint (ar_c) ^ cc_sign);
+      return;
+    }
+  else
+    {
+      /* When abs(x) > 0x1p22, x will be either
+	 - Half integer (relevant if abs(x) in [0x1p22, 0x1p23])
+	 - Odd integer (relevant if abs(x) in [0x1p23, 0x1p24])
+	 - Even integer (relevant if abs(x) in [0x1p24, inf])
+	 - Inf or NaN.  */
+      if (abstop12 (x) >= 0x7f8)
+	{
+	  float inv_result = __math_invalidf (x);
+	  *out_sin = inv_result;
+	  *out_cos = inv_result;
+	  return;
+	}
+      else
+	{
+	  uint32_t ax = asuint (x) & 0x7fffffff;
+	  int m = checkint (ax);
+	  if (m & 1)
+	    {
+	      *out_sin = sign ? -m : m;
+	      *out_cos = 0;
+	      return;
+	    }
+	  else
+	    {
+	      *out_sin = asfloat (sign);
+	      *out_cos = m >> 1;
+	      return;
+	    }
+	}
+    }
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (arm_math_sincospif_sin)
+TEST_DISABLE_FENV (arm_math_sincospif_cos)
+TEST_ULP (arm_math_sincospif_sin, 2.54)
+TEST_ULP (arm_math_sincospif_cos, 2.68)
+# define SINCOSPIF_INTERVAL(lo, hi, n)                                        \
+  TEST_SYM_INTERVAL (arm_math_sincospif_sin, lo, hi, n)                       \
+  TEST_SYM_INTERVAL (arm_math_sincospif_cos, lo, hi, n)
SINCOSPIF_INTERVAL (0, 0x1p-31, 10000)
SINCOSPIF_INTERVAL (0x1p-31, 1, 50000)
SINCOSPIF_INTERVAL (1, 0x1p22f, 50000)
SINCOSPIF_INTERVAL (0x1p22f, inf, 10000)
+#endif
diff --git a/math/aarch64/sinpi_3u5.c b/math/aarch64/sinpi_3u5.c
new file mode 100644
index 000000000000..f96d9a312b53
--- /dev/null
+++ b/math/aarch64/sinpi_3u5.c
@@ -0,0 +1,101 @@
+/*
+ * Double-precision scalar sinpi function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include <math.h>
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f64.h"
+
+/* Taylor series coefficients for sin(pi * x).
+   C2 coefficient (originally ~=5.16771278) has been split into two parts:
+   C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278)
+   This change in magnitude reduces floating point rounding errors.
+   C2_hi is then reintroduced after the polynomial approximation.  */
+static const double poly[]
+    = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
+	-0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+	0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21,
+	-0x1.012a9870eeb7dp-25 };
+
+#define Shift 0x1.8p+52
+/* TODO Store constant in structure for more efficient load.  */
+#define Pi 0x1.921fb54442d18p+1
+
+/* Approximation for scalar double-precision sinpi(x).
+   Maximum error: 3.03 ULP:
+   sinpi(0x1.a90da2818f8b5p+7) got 0x1.fe358f255a4b3p-1
+			      want 0x1.fe358f255a4b6p-1.  */
+double
+arm_math_sinpi (double x)
+{
+  if (isinf (x) || isnan (x))
+    return __math_invalid (x);
+
+  double r = asdouble (asuint64 (x) & ~0x8000000000000000);
+  uint64_t sign = asuint64 (x) & 0x8000000000000000;
+
+  /* Edge cases for when sinpi should be exactly 0.  (Integers)
+     0x1p53 is the limit for double precision to store any decimal places.  */
+  if (r >= 0x1p53)
+    return asdouble (sign);
+
+  /* If x is an integer, return 0.  */
+  uint64_t m = (uint64_t) r;
+  if (r == m)
+    return asdouble (sign);
+
+  /* For very small inputs, squaring r causes underflow.
+     Values below this threshold can be approximated via sinpi(x) ≈ pi*x.  */
+  if (r < 0x1p-63)
+    return Pi * x;
+
+  /* Any non-integer values >= 0x1p51 will be int + 0.5.
+     These values should return exactly 1 or -1.
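+     For example: r = 0x1p51 + 0.5 truncates to the even integer m = 2^51,
+     so iy = asuint64 (1.0) and the result is +1, matching
+     sin (pi * (n + 0.5)) = +1 for even n and -1 for odd n.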
*/ + if (r >= 0x1p51) + { + uint64_t iy = ((m & 1) << 63) ^ asuint64 (1.0); + return asdouble (sign ^ iy); + } + + /* n = rint(|x|). */ + double n = r + Shift; + sign ^= (asuint64 (n) << 63); + n = n - Shift; + + /* r = |x| - n (range reduction into -1/2 .. 1/2). */ + r = r - n; + + /* y = sin(r). */ + double r2 = r * r; + double y = horner_9_f64 (r2, poly); + y = y * r; + + /* Reintroduce C2_hi. */ + y = fma (-4 * r2, r, y); + + /* Copy sign of x to sin(|x|). */ + return asdouble (asuint64 (y) ^ sign); +} + +#if WANT_EXPERIMENTAL_MATH +double +sinpi (double x) +{ + return arm_math_sinpi (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_sinpi, 2.53) +TEST_SYM_INTERVAL (arm_math_sinpi, 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (arm_math_sinpi, 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (arm_math_sinpi, 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (arm_math_sinpi, 0x1p51, inf, 10000) +#endif diff --git a/math/aarch64/sinpif_2u5.c b/math/aarch64/sinpif_2u5.c new file mode 100644 index 000000000000..b5d9cd914577 --- /dev/null +++ b/math/aarch64/sinpif_2u5.c @@ -0,0 +1,92 @@ +/* + * Single-precision scalar sinpi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Taylor series coefficents for sin(pi * x). */ +#define C0 0x1.921fb6p1f +#define C1 -0x1.4abbcep2f +#define C2 0x1.466bc6p1f +#define C3 -0x1.32d2ccp-1f +#define C4 0x1.50783p-4f +#define C5 -0x1.e30750p-8f + +#define Shift 0x1.0p+23f + +/* Approximation for scalar single-precision sinpi(x) - sinpif. + Maximum error: 2.48 ULP: + sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1 + want 0x1.fa8c02p-1. */ +float +arm_math_sinpif (float x) +{ + if (isinf (x) || isnan (x)) + return __math_invalidf (x); + + float r = asfloat (asuint (x) & ~0x80000000); + uint32_t sign = asuint (x) & 0x80000000; + + /* Edge cases for when sinpif should be exactly 0. (Integers) + 0x1p23 is the limit for single precision to store any decimal places. */ + if (r >= 0x1p23f) + return asfloat (sign); + + int32_t m = roundf (r); + if (m == r) + return asfloat (sign); + + /* For very small inputs, squaring r causes underflow. + Values below this threshold can be approximated via sinpi(x) ~= pi*x. */ + if (r < 0x1p-31f) + return C0 * x; + + /* Any non-integer values >= 0x1p22f will be int + 0.5. + These values should return exactly 1 or -1. */ + if (r >= 0x1p22f) + { + uint32_t iy = ((m & 1) << 31) ^ asuint (-1.0f); + return asfloat (sign ^ iy); + } + + /* n = rint(|x|). */ + float n = r + Shift; + sign ^= (asuint (n) << 31); + n = n - Shift; + + /* r = |x| - n (range reduction into -1/2 .. 1/2). */ + r = r - n; + + /* y = sin(pi * r). */ + float r2 = r * r; + float y = fmaf (C5, r2, C4); + y = fmaf (y, r2, C3); + y = fmaf (y, r2, C2); + y = fmaf (y, r2, C1); + y = fmaf (y, r2, C0); + + /* Copy sign of x to sin(|x|). 
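+     (sign also carries the parity of n from the reduction above, since
+     sin (pi * (n + r)) = (-1)^n * sin (pi * r).)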
*/ + return asfloat (asuint (y * r) ^ sign); +} + +#if WANT_EXPERIMENTAL_MATH +float +sinpif (float x) +{ + return arm_math_sinpif (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_sinpif, 1.99) +TEST_SYM_INTERVAL (arm_math_sinpif, 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (arm_math_sinpif, 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (arm_math_sinpif, 0.5, 0x1p22f, 10000) +TEST_SYM_INTERVAL (arm_math_sinpif, 0x1p22f, inf, 10000) +#endif diff --git a/math/aarch64/sve/acos.c b/math/aarch64/sve/acos.c new file mode 100644 index 000000000000..da633392aa3e --- /dev/null +++ b/math/aarch64/sve/acos.c @@ -0,0 +1,93 @@ +/* + * Double-precision SVE acos(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64_t poly[12]; + float64_t pi, pi_over_2; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, + 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, + 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, + .pi = 0x1.921fb54442d18p+1, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; + +/* Double-precision SVE implementation of vector acos(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.18 ulps, + _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0 + want 0x1.0d4d0f55667f7p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.52 ulps, + _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1 + want 0x1.ed82df4243f0bp-1. */ +svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); + + svbool_t a_gt_half = svacgt (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + svfloat64_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); + svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
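+     For example: x = -0.75 takes the |x| > 0.5 path with
+     z2 = (1 - 0.75) / 2 = 0.125; the sign of x makes y = -Q(0.75), and
+     off = pi with mul = 2 assembles the result to pi - 2 Q(0.75), matching
+     the third case above.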
*/ + svfloat64_t y + = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign)); + + svbool_t is_neg = svcmplt (pg, x, 0.0); + svfloat64_t off = svdup_f64_z (is_neg, d->pi); + svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0)); + svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2)); + + return svmla_x (pg, add, mul, y); +} + +TEST_SIG (SV, D, 1, acos, -1.0, 1.0) +TEST_ULP (SV_NAME_D1 (acos), 1.02) +TEST_DISABLE_FENV (SV_NAME_D1 (acos)) +TEST_INTERVAL (SV_NAME_D1 (acos), 0, 0.5, 50000) +TEST_INTERVAL (SV_NAME_D1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (SV_NAME_D1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (SV_NAME_D1 (acos), -0, -inf, 20000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/acosf.c b/math/aarch64/sve/acosf.c new file mode 100644 index 000000000000..86b7822cefc3 --- /dev/null +++ b/math/aarch64/sve/acosf.c @@ -0,0 +1,86 @@ +/* + * Single-precision SVE acos(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32_t poly[5]; + float32_t pi, pi_over_2; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, + 0x1.3af7d8p-5, }, + .pi = 0x1.921fb6p+1f, + .pi_over_2 = 0x1.921fb6p+0f, +}; + +/* Single-precision SVE implementation of vector acos(x). + + For |x| in [0, 0.5], use order 4 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.16 ulps, + _ZGVsMxv_acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 + want 0x1.0c27f6p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.32 ulps, + _ZGVsMxv_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 + want 0x1.feb32ep-1. */ +svfloat32_t SV_NAME_F1 (acos) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000); + svfloat32_t ax = svabs_x (pg, x); + svbool_t a_gt_half = svacgt (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + svfloat32_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat32_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
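+     Equivalently, the (add, mul) pair selected below is (pi/2, -1) for
+     |x| <= 0.5, (0, 2) for x in (0.5, 1] and (pi, 2) for x in [-1, -0.5).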
*/ + svfloat32_t y + = svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (p), sign)); + + svbool_t is_neg = svcmplt (pg, x, 0.0); + svfloat32_t off = svdup_f32_z (is_neg, d->pi); + svfloat32_t mul = svsel (a_gt_half, sv_f32 (2.0), sv_f32 (-1.0)); + svfloat32_t add = svsel (a_gt_half, off, sv_f32 (d->pi_over_2)); + + return svmla_x (pg, add, mul, y); +} + +TEST_SIG (SV, F, 1, acos, -1.0, 1.0) +TEST_ULP (SV_NAME_F1 (acos), 0.82) +TEST_DISABLE_FENV (SV_NAME_F1 (acos)) +TEST_INTERVAL (SV_NAME_F1 (acos), 0, 0.5, 50000) +TEST_INTERVAL (SV_NAME_F1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (SV_NAME_F1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (SV_NAME_F1 (acos), -0, -inf, 20000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/acosh.c b/math/aarch64/sve/acosh.c new file mode 100644 index 000000000000..d54c21922e1b --- /dev/null +++ b/math/aarch64/sve/acosh.c @@ -0,0 +1,51 @@ +/* + * Double-precision SVE acosh(x) function. + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_SV_LOG1P_K0_SHORTCUT 1 +#include "sv_log1p_inline.h" + +#define One (0x3ff0000000000000) +#define Thres (0x1ff0000000000000) /* asuint64 (0x1p511) - One. */ + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (acosh, x, y, special); +} + +/* SVE approximation for double-precision acosh, based on log1p. + The largest observed error is 3.19 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: + SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 + want 0x1.ed23399f51373p-2. */ +svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) +{ + /* (ix - One) >= (BigBound - One). */ + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres); + + svfloat64_t xm1 = svsub_x (pg, x, 1.0); + svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0)); + svfloat64_t y = svadd_x (pg, xm1, svsqrt_x (pg, u)); + + /* Fall back to scalar routine for special lanes. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, sv_log1p_inline (y, pg), special); + return sv_log1p_inline (y, pg); +} + +TEST_SIG (SV, D, 1, acosh, 1.0, 10.0) +TEST_ULP (SV_NAME_D1 (acosh), 2.69) +TEST_DISABLE_FENV (SV_NAME_D1 (acosh)) +TEST_INTERVAL (SV_NAME_D1 (acosh), 1, 0x1p511, 90000) +TEST_INTERVAL (SV_NAME_D1 (acosh), 0x1p511, inf, 10000) +TEST_INTERVAL (SV_NAME_D1 (acosh), 0, 1, 1000) +TEST_INTERVAL (SV_NAME_D1 (acosh), -0, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/acoshf.c b/math/aarch64/sve/acoshf.c new file mode 100644 index 000000000000..f48ef724e8eb --- /dev/null +++ b/math/aarch64/sve/acoshf.c @@ -0,0 +1,51 @@ +/* + * Single-precision SVE acosh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define One 0x3f800000 +#define Thres 0x20000000 /* asuint(0x1p64) - One. */ + +#include "sv_log1pf_inline.h" + +static svfloat32_t NOINLINE +special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special) +{ + svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f); + svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ()); + return sv_call_f32 (acoshf, x, y, special); +} + +/* Single-precision SVE acosh(x) routine. 
Implements the same algorithm as + vector acoshf and log1p. + + Maximum error is 2.47 ULPs: + SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4 + want 0x1.e435a2p-4. */ +svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) +{ + svuint32_t ix = svreinterpret_u32 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres); + + svfloat32_t xm1 = svsub_x (pg, x, 1.0f); + svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f)); + svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u)); + + if (unlikely (svptest_any (pg, special))) + return special_case (xm1, tmp, special); + return sv_log1pf_inline (tmp, pg); +} + +TEST_SIG (SV, F, 1, acosh, 1.0, 10.0) +TEST_ULP (SV_NAME_F1 (acosh), 1.97) +TEST_DISABLE_FENV (SV_NAME_F1 (acosh)) +TEST_INTERVAL (SV_NAME_F1 (acosh), 0, 1, 500) +TEST_INTERVAL (SV_NAME_F1 (acosh), 1, 0x1p64, 100000) +TEST_INTERVAL (SV_NAME_F1 (acosh), 0x1p64, inf, 1000) +TEST_INTERVAL (SV_NAME_F1 (acosh), -0, -inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/asin.c b/math/aarch64/sve/asin.c new file mode 100644 index 000000000000..cac629afae15 --- /dev/null +++ b/math/aarch64/sve/asin.c @@ -0,0 +1,86 @@ +/* + * Double-precision SVE asin(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64_t poly[12]; + float64_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, + 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6, + 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, + 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, + 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6, + -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, + .pi_over_2f = 0x1.921fb54442d18p+0, +}; + +#define P(i) sv_f64 (d->poly[i]) + +/* Double-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.52 ulps, + _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2 + want 0x1.ec13757305f26p-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, + _ZGVsMxv_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. */ +svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); + svbool_t a_ge_half = svacge (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + svfloat64_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); + svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). 
*/ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f); + + /* Copy sign. */ + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); +} + +TEST_SIG (SV, D, 1, asin, -1.0, 1.0) +TEST_ULP (SV_NAME_D1 (asin), 2.20) +TEST_DISABLE_FENV (SV_NAME_D1 (asin)) +TEST_INTERVAL (SV_NAME_D1 (asin), 0, 0.5, 50000) +TEST_INTERVAL (SV_NAME_D1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (SV_NAME_D1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (SV_NAME_D1 (asin), -0, -inf, 20000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/asinf.c b/math/aarch64/sve/asinf.c new file mode 100644 index 000000000000..fe94feba7a42 --- /dev/null +++ b/math/aarch64/sve/asinf.c @@ -0,0 +1,78 @@ +/* + * Single-precision SVE asin(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32_t poly[5]; + float32_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, + 0x1.3af7d8p-5, }, + .pi_over_2f = 0x1.921fb6p+0f, +}; + +/* Single-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, + _ZGVsMxv_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 + want 0x1.fef15cp-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.41 ulps, + _ZGVsMxv_asinf (-0x1.00203ep-1) got -0x1.0c3a64p-1 + want -0x1.0c3a6p-1. */ +svfloat32_t SV_NAME_F1 (asin) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000); + + svfloat32_t ax = svabs_x (pg, x); + svbool_t a_ge_half = svacge (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + svfloat32_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat32_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat32_t y = svmad_m (a_ge_half, p, sv_f32 (-2.0), d->pi_over_2f); + + /* Copy sign. 
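+     asin(|x|) lies in [0, pi/2], so its sign bit is clear; OR-ing in the
+     sign bit of x therefore implements asin(-x) = -asin(x).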
*/ + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); +} + +TEST_SIG (SV, F, 1, asin, -1.0, 1.0) +TEST_ULP (SV_NAME_F1 (asin), 1.91) +TEST_DISABLE_FENV (SV_NAME_F1 (asin)) +TEST_INTERVAL (SV_NAME_F1 (asin), 0, 0.5, 50000) +TEST_INTERVAL (SV_NAME_F1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (SV_NAME_F1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (SV_NAME_F1 (asin), -0, -inf, 20000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/asinh.c b/math/aarch64/sve/asinh.c new file mode 100644 index 000000000000..5574116de1e1 --- /dev/null +++ b/math/aarch64/sve/asinh.c @@ -0,0 +1,197 @@ +/* + * Double-precision SVE asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define SignMask (0x8000000000000000) +#define One (0x3ff0000000000000) +#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */ +#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1) + +static const struct data +{ + double even_coeffs[9]; + double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17; + uint64_t off, mask; + +} data = { + /* Polynomial generated using Remez on [2^-26, 1]. */ + .even_coeffs ={ + -0x1.55555555554a7p-3, + -0x1.6db6db68332e6p-5, + -0x1.6e8b8b654a621p-6, + -0x1.c9871d10885afp-7, + -0x1.3ddca533e9f54p-7, + -0x1.b90c7099dd397p-8, + -0x1.d217026a669ecp-9, + -0x1.e0f37daef9127p-11, + -0x1.021a48685e287p-14, }, + + .c1 = 0x1.3333333326c7p-4, + .c3 = 0x1.f1c71b26fb40dp-6, + .c5 = 0x1.1c4daa9e67871p-6, + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c9 = 0x1.0becef748dafcp-7, + .c11 = 0x1.541f2bb1ffe51p-8, + .c13 = 0x1.0b5c7977aaf7p-9, + .c15 = 0x1.388b5fe542a6p-12, + .c17 = 0x1.93d4ba83d34dap-18, + + .ln2 = 0x1.62e42fefa39efp-1, + .p0 = -0x1.ffffffffffff7p-2, + .p1 = 0x1.55555555170d4p-2, + .p2 = -0x1.0000000399c27p-2, + .p3 = 0x1.999b2e90e94cap-3, + .p4 = -0x1.554e550bd501ep-3, + .off = 0x3fe6900900000000, + .mask = 0xfffULL << 52, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (asinh, x, y, special); +} + +static inline svfloat64_t +__sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) +{ + /* Double-precision SVE log, copied from SVE log implementation with some + cosmetic modification and special-cases removed. See that file for details + of the algorithm used. 
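+     In outline: the top mantissa bits of x select a table entry
+     (invc ~ 1/c, logc = log(c)) such that r = invc * z - 1 is small, and
+     log(x) is reconstructed as kd * ln2 + logc + r + r^2 * P(r), which is
+     what the code below computes.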
*/
+
+  svuint64_t ix = svreinterpret_u64 (x);
+  svuint64_t i_off = svsub_x (pg, ix, d->off);
+  svuint64_t i
+      = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask);
+  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask));
+  svfloat64_t z = svreinterpret_f64 (iz);
+
+  svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+  svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
+
+  svfloat64_t ln2_p3 = svld1rq (svptrue_b64 (), &d->ln2);
+  svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
+
+  svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
+  svfloat64_t kd
+      = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52));
+
+  svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
+  svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
+
+  y = svmla_lane (y, r2, p1_p4, 1);
+  y = svmla_x (pg, p, r2, y);
+  y = svmla_x (pg, hi, r2, y);
+  return y;
+}
+
+/* Double-precision implementation of SVE asinh(x).
+   asinh is very sensitive around 1, so it is impractical to devise a single
+   low-cost algorithm which is sufficiently accurate on a wide range of input.
+   Instead we use two different algorithms:
+   asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1))  if |x| >= 1
+	    = sign(x) * (|x| + |x|^3 * P(x^2))    otherwise
+   where log(x) is an optimized log approximation, and P(x) is a polynomial
+   shared with the scalar routine.  The greatest observed error is 2.51 ULP,
+   in |x| >= 1:
+   _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1
+					want 0x1.e3181c43b0f39p-1.  */
+svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint64_t ix = svreinterpret_u64 (x);
+  svuint64_t iax = svbic_x (pg, ix, SignMask);
+  svuint64_t sign = svand_x (pg, ix, SignMask);
+  svfloat64_t ax = svreinterpret_f64 (iax);
+  svbool_t ge1 = svcmpge (pg, iax, One);
+  svbool_t special = svcmpge (pg, iax, Thres);
+
+  /* Option 1: |x| >= 1.
+     Compute asinh(x) as log(x + sqrt(x^2 + 1)).  */
+  svfloat64_t option_1 = sv_f64 (0);
+  if (likely (svptest_any (pg, ge1)))
+    {
+      svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
+      option_1 = __sv_log_inline (
+	  svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
+    }
+
+  /* Option 2: |x| < 1.
+     Compute asinh(x) using a polynomial.
+     The largest observed error in this region is 1.51 ULPs:
+     _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
+					  want 0x1.c1e649ee2681dp-1.  */
+
+  svfloat64_t option_2 = sv_f64 (0);
+  if (likely (svptest_any (pg, svnot_z (pg, ge1))))
+    {
+      svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
+      svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2);
+      /* Order-17 Pairwise Horner scheme.
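+	 Coefficients are paired as p_{2i} + x2 * p_{2i+1}, and the pairs
+	 are then combined with a Horner recurrence in x4, roughly halving
+	 the length of the dependency chain relative to plain Horner in x2.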
*/ + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + + svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1); + svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1); + svfloat64_t p1213 + = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0); + svfloat64_t p1415 + = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1); + svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17); + + svfloat64_t p = svmla_x (pg, p1415, x4, p1617); + p = svmla_x (pg, p1213, x4, p); + p = svmla_x (pg, p1011, x4, p); + p = svmla_x (pg, p89, x4, p); + + p = svmla_x (pg, p67, x4, p); + p = svmla_x (pg, p45, x4, p); + + p = svmla_x (pg, p23, x4, p); + + p = svmla_x (pg, p01, x4, p); + + option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax)); + } + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, + svreinterpret_f64 (sveor_x ( + pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)), + special); + + /* Choose the right option for each lane. */ + svfloat64_t y = svsel (ge1, option_1, option_2); + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); +} + +TEST_SIG (SV, D, 1, asinh, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (asinh), 2.52) +TEST_DISABLE_FENV (SV_NAME_D1 (asinh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0, 0x1p-26, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0x1p-26, 1, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 1, 0x1p511, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0x1p511, inf, 40000) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. */ +TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 0.5) +TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 2) +TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 0x1p600) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/asinhf.c b/math/aarch64/sve/asinhf.c new file mode 100644 index 000000000000..32aedbfd3a6d --- /dev/null +++ b/math/aarch64/sve/asinhf.c @@ -0,0 +1,57 @@ +/* + * Single-precision SVE asinh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#include "sv_log1pf_inline.h" + +#define BigBound 0x5f800000 /* asuint(0x1p64). */ + +static svfloat32_t NOINLINE +special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special) +{ + svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); + y = svreinterpret_f32 ( + svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y))); + return sv_call_f32 (asinhf, x, y, special); +} + +/* Single-precision SVE asinh(x) routine. Implements the same algorithm as + vector asinhf and log1p. + + Maximum error is 1.92 ULPs: + SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2 + want -0x1.fd0bc8p-2. 
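+   The log1p form used in the routine follows from
+   x + sqrt(x^2 + 1) = 1 + (x + x^2 / (1 + sqrt(x^2 + 1))),
+   since sqrt(x^2 + 1) - 1 = x^2 / (1 + sqrt(x^2 + 1)); this form avoids
+   cancellation for small x.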
*/ +svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) +{ + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + svbool_t special = svcmpge (pg, iax, BigBound); + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ + svfloat32_t ax2 = svmul_x (pg, ax, ax); + svfloat32_t d = svadd_x (pg, svsqrt_x (pg, svadd_x (pg, ax2, 1.0f)), 1.0f); + svfloat32_t y + = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg); + + if (unlikely (svptest_any (pg, special))) + return special_case (iax, sign, y, special); + return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))); +} + +TEST_SIG (SV, F, 1, asinh, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (asinh), 1.43) +TEST_DISABLE_FENV (SV_NAME_F1 (asinh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0, 0x1p-12, 4000) +TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p-12, 1.0, 20000) +TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 1.0, 0x1p64, 20000) +TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p64, inf, 4000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/atan.c b/math/aarch64/sve/atan.c new file mode 100644 index 000000000000..73fc29a94f23 --- /dev/null +++ b/math/aarch64/sve/atan.c @@ -0,0 +1,89 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" + +static const struct data +{ + float64_t poly[20]; + float64_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; + +/* Useful constants. */ +#define SignMask (0x8000000000000000) + +/* Fast implementation of SVE atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed + error is 2.27 ulps: + _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t sign = svand_x (pg, ix, SignMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt (pg, x, 1.0); + /* Avoid dependency in abs(x) in division (and comparison). */ + svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x); + /* Use absolute value only when needed (odd powers of z). */ + svfloat64_t az = svabs_x (pg, z); + az = svneg_m (az, red, az); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. 
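+     With w = z^2, P(w) = E7(w) + w^8 * E11(w): E7 consumes the first 8
+     coefficients and E11 the remaining 12, and both halves use Estrin's
+     scheme to expose instruction-level parallelism.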
*/ + svfloat64_t z2 = svmul_x (pg, z, z); + svfloat64_t x2 = svmul_x (pg, z2, z2); + svfloat64_t x4 = svmul_x (pg, x2, x2); + svfloat64_t x8 = svmul_x (pg, x4, x4); + + svfloat64_t y + = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly), + sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t z3 = svmul_x (pg, z2, az); + y = svmla_x (pg, az, z3, y); + + /* Apply shift as indicated by `red` predicate. */ + y = svadd_m (red, y, d->pi_over_2); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); + + return y; +} + +TEST_SIG (SV, D, 1, atan, -3.1, 3.1) +TEST_ULP (SV_NAME_D1 (atan), 1.78) +TEST_DISABLE_FENV (SV_NAME_D1 (atan)) +TEST_INTERVAL (SV_NAME_D1 (atan), 0.0, 1.0, 40000) +TEST_INTERVAL (SV_NAME_D1 (atan), 1.0, 100.0, 40000) +TEST_INTERVAL (SV_NAME_D1 (atan), 100, inf, 40000) +TEST_INTERVAL (SV_NAME_D1 (atan), -0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/atan2.c b/math/aarch64/sve/atan2.c new file mode 100644 index 000000000000..1e1d00678b1d --- /dev/null +++ b/math/aarch64/sve/atan2.c @@ -0,0 +1,118 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" + +static const struct data +{ + float64_t poly[20]; + float64_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, + .pi_over_2 = 0x1.921fb54442d18p+0, +}; + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +static svfloat64_t NOINLINE +special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, + const svbool_t cmp) +{ + return sv_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation + of 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (svuint64_t i, const svbool_t pg) +{ + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1), + sv_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of SVE atan2. Errors are greatest when y and + x are reasonably close together. The greatest observed error is 2.28 ULP: + _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. 
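
A minimal scalar sketch of the reduction that follows, assuming libm's atan for the core approximation and ignoring the 0/inf/NaN special cases; atan2_ref is a hypothetical name:

#include <math.h>

static double
atan2_ref (double y, double x)
{
  const double pi_over_2 = 0x1.921fb54442d18p+0;
  double ax = fabs (x), ay = fabs (y);
  int aygtax = ay > ax;
  int xneg = signbit (x) != 0;
  // z is in [-1, 1]: atan(y/x) directly, or pi/2 + atan(-x/y) when
  // |y| > |x|.
  double z = aygtax ? -ax / ay : ay / ax;
  // shift counts quarter-turns: 0, 1 (|y| > |x|) or 2 (x < 0), and is
  // negated when x < 0, mirroring the sign_x bit tricks below.
  double shift = aygtax ? 1.0 : (xneg ? 2.0 : 0.0);
  if (xneg)
    shift = -shift;
  double ret = atan (z) + shift * pi_over_2;
  // Re-apply the sign of x^y, as the vector code does with sign_xy.
  return (xneg != (signbit (y) != 0)) ? -ret : ret;
}
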
*/ +svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, + const svbool_t pg) +{ + const struct data *data_ptr = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); + + svfloat64_t ax = svabs_x (pg, x); + svfloat64_t ay = svabs_x (pg, y); + svuint64_t iax = svreinterpret_u64 (ax); + svuint64_t iay = svreinterpret_u64 (ay); + + svuint64_t sign_x = sveor_x (pg, ix, iax); + svuint64_t sign_y = sveor_x (pg, iy, iay); + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y); + + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); + + /* Set up z for call to atan. */ + svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat64_t d = svsel (pred_aygtax, ay, ax); + svfloat64_t z = svdiv_x (pg, n, d); + + /* Work out the correct shift. */ + svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); + shift = svsel (pred_aygtax, sv_f64 (1.0), shift); + shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); + shift = svmul_x (pg, shift, data_ptr->pi_over_2); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ + svfloat64_t z2 = svmul_x (pg, z, z); + svfloat64_t x2 = svmul_x (pg, z2, z2); + svfloat64_t x4 = svmul_x (pg, x2, x2); + svfloat64_t x8 = svmul_x (pg, x4, x4); + + svfloat64_t ret = svmla_x ( + pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly), + sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t z3 = svmul_x (pg, z2, z); + ret = svmla_x (pg, z, z3, ret); + + ret = svadd_m (pg, ret, shift); + + /* Account for the sign of x and y. */ + if (unlikely (svptest_any (pg, cmp_xy))) + return special_case ( + y, x, + svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)), + cmp_xy); + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)); +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +TEST_SIG (SV, D, 2, atan2) +TEST_ULP (SV_NAME_D2 (atan2), 1.78) +TEST_DISABLE_FENV (SV_NAME_D2 (atan2)) +TEST_INTERVAL (SV_NAME_D2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (SV_NAME_D2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (SV_NAME_D2 (atan2), 100, inf, 40000) +TEST_INTERVAL (SV_NAME_D2 (atan2), -0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/atan2f.c b/math/aarch64/sve/atan2f.c new file mode 100644 index 000000000000..563b708cfcbb --- /dev/null +++ b/math/aarch64/sve/atan2f.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector atan2f(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" + +static const struct data +{ + float32_t poly[8]; + float32_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ + .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, + .pi_over_2 = 0x1.921fb6p+0f, +}; + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). 
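
The zeroinfnan helper in both atan2 variants classifies three cases with one unsigned compare. A scalar sketch of the single-precision bit trick; zeroinfnan_ref is a hypothetical name:

#include <stdint.h>

static int
zeroinfnan_ref (uint32_t i)
{
  // 2*i shifts out the sign bit; subtracting 1 makes +/-0 wrap around
  // to 0xffffffff.  +/-inf lands exactly on the threshold
  // 2*asuint(inf) - 1 and every NaN payload lands above it, so a single
  // compare catches 0, inf and NaN at once.
  return 2 * i - 1 >= 2 * 0x7f800000u - 1;
}
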
*/ +static svfloat32_t NOINLINE +special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret, + const svbool_t cmp) +{ + return sv_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation + of 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (svuint32_t i, const svbool_t pg) +{ + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1), + sv_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * + P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum + observed error is 2.95 ULP: + _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, + const svbool_t pg) +{ + const struct data *data_ptr = ptr_barrier (&data); + + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t iy = svreinterpret_u32 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); + + svfloat32_t ax = svabs_x (pg, x); + svfloat32_t ay = svabs_x (pg, y); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t iay = svreinterpret_u32 (ay); + + svuint32_t sign_x = sveor_x (pg, ix, iax); + svuint32_t sign_y = sveor_x (pg, iy, iay); + svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y); + + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); + + /* Set up z for call to atan. */ + svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat32_t d = svsel (pred_aygtax, ay, ax); + svfloat32_t z = svdiv_x (pg, n, d); + + /* Work out the correct shift. */ + svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1)); + shift = svsel (pred_aygtax, sv_f32 (1.0), shift); + shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift))); + shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); + + /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); + + svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly); + + /* ret = shift + z + z^3 * P(z^2). */ + svfloat32_t z3 = svmul_x (pg, z2, z); + ret = svmla_x (pg, z, z3, ret); + + ret = svadd_m (pg, ret, shift); + + /* Account for the sign of x and y. */ + + if (unlikely (svptest_any (pg, cmp_xy))) + return special_case ( + y, x, + svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)), + cmp_xy); + + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)); +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +TEST_SIG (SV, F, 2, atan2) +TEST_ULP (SV_NAME_F2 (atan2), 2.45) +TEST_DISABLE_FENV (SV_NAME_F2 (atan2)) +TEST_INTERVAL (SV_NAME_F2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (SV_NAME_F2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (SV_NAME_F2 (atan2), 100, inf, 40000) +TEST_INTERVAL (SV_NAME_F2 (atan2), -0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/atanf.c b/math/aarch64/sve/atanf.c new file mode 100644 index 000000000000..a2cd37b12744 --- /dev/null +++ b/math/aarch64/sve/atanf.c @@ -0,0 +1,78 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" + +static const struct data +{ + float32_t poly[8]; + float32_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ + .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, + .pi_over_2 = 0x1.921fb6p+0f, +}; + +#define SignMask (0x80000000) + +/* Fast implementation of SVE atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=-1/x and shift = pi/2. + Largest observed error is 2.9 ULP, close to +/-1.0: + _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 + want -0x1.967fp-1. */ +svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t sign = svand_x (pg, ix, SignMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt (pg, x, 1.0f); + /* Avoid dependency in abs(x) in division (and comparison). */ + svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x); + /* Use absolute value only when needed (odd powers of z). */ + svfloat32_t az = svabs_x (pg, z); + az = svneg_m (az, red, az); + + /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); + + svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat32_t z3 = svmul_x (pg, z2, az); + y = svmla_x (pg, az, z3, y); + + /* Apply shift as indicated by 'red' predicate. */ + y = svadd_m (red, y, sv_f32 (d->pi_over_2)); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); +} + +TEST_SIG (SV, F, 1, atan, -3.1, 3.1) +TEST_ULP (SV_NAME_F1 (atan), 2.9) +TEST_DISABLE_FENV (SV_NAME_F1 (atan)) +TEST_INTERVAL (SV_NAME_F1 (atan), 0.0, 1.0, 40000) +TEST_INTERVAL (SV_NAME_F1 (atan), 1.0, 100.0, 40000) +TEST_INTERVAL (SV_NAME_F1 (atan), 100, inf, 40000) +TEST_INTERVAL (SV_NAME_F1 (atan), -0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/atanh.c b/math/aarch64/sve/atanh.c new file mode 100644 index 000000000000..b404df56fd75 --- /dev/null +++ b/math/aarch64/sve/atanh.c @@ -0,0 +1,62 @@ +/* + * Double-precision SVE atanh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_SV_LOG1P_K0_SHORTCUT 0 +#include "sv_log1p_inline.h" + +#define One (0x3ff0000000000000) +#define Half (0x3fe0000000000000) + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (atanh, x, y, special); +} + +/* SVE approximation for double-precision atanh, based on log1p. + The greatest observed error is 2.81 ULP: + _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. 
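
A minimal scalar sketch of the identity used below, assuming libm's log1p; atanh_ref is a hypothetical name:

#include <math.h>

static double
atanh_ref (double x)
{
  double ax = fabs (x);
  // atanh(x) = 0.5 * ln((1+x)/(1-x)) = 0.5 * log1p(2x / (1-x)).
  // The vector code folds the 0.5 and the sign of x into a single
  // multiplier ("halfsign") by OR-ing the sign bit into asuint(0.5).
  double y = 0.5 * log1p (2.0 * ax / (1.0 - ax));
  return copysign (y, x);
}
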
*/
+svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
+{
+  svfloat64_t ax = svabs_x (pg, x);
+  svuint64_t iax = svreinterpret_u64 (ax);
+  svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+  svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
+
+  /* The input is special if |x| >= 1.  */
+  svbool_t special = svacge (pg, x, 1.0);
+
+  /* Computation is based on the following sequence of equalities:
+     (1+x)/(1-x) = 1 + 2x/(1-x).  */
+  svfloat64_t y;
+  y = svadd_x (pg, ax, ax);
+  y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax));
+  /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y).  */
+  y = sv_log1p_inline (y, pg);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (x, svmul_x (pg, halfsign, y), special);
+  return svmul_x (pg, halfsign, y);
+}
+
+TEST_SIG (SV, D, 1, atanh, -1.0, 1.0)
+TEST_ULP (SV_NAME_D1 (atanh), 3.32)
+TEST_DISABLE_FENV (SV_NAME_D1 (atanh))
+TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 0, 0x1p-23, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 0x1p-23, 1, 90000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 1, inf, 100)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+   -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+   control lane is irrelevant if fp exceptions are disabled).  */
+TEST_CONTROL_VALUE (SV_NAME_D1 (atanh), 0)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/atanhf.c b/math/aarch64/sve/atanhf.c
new file mode 100644
index 000000000000..2e10a8cd22f7
--- /dev/null
+++ b/math/aarch64/sve/atanhf.c
@@ -0,0 +1,61 @@
+/*
+ * Single-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#include "sv_log1pf_inline.h"
+
+#define One (0x3f800000)
+#define Half (0x3f000000)
+
+static svfloat32_t NOINLINE
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign,
+	      svfloat32_t y, svbool_t special)
+{
+  svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+  y = svmul_x (svptrue_b32 (), halfsign, y);
+  return sv_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+   The maximum error is 1.99 ULP:
+   _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5
+				 want 0x1.f1f4f6p-5.  */
+svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
+{
+  svfloat32_t ax = svabs_x (pg, x);
+  svuint32_t iax = svreinterpret_u32 (ax);
+  svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+  svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, Half));
+  svbool_t special = svcmpge (pg, iax, One);
+
+  /* Computation is based on the following sequence of equalities:
+     (1+x)/(1-x) = 1 + 2x/(1-x).  */
+  svfloat32_t y = svadd_x (pg, ax, ax);
+  y = svdiv_x (pg, y, svsub_x (pg, sv_f32 (1), ax));
+  /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y).
*/ + y = sv_log1pf_inline (y, pg); + + if (unlikely (svptest_any (pg, special))) + return special_case (iax, sign, halfsign, y, special); + + return svmul_x (pg, halfsign, y); +} + +TEST_SIG (SV, F, 1, atanh, -1.0, 1.0) +TEST_ULP (SV_NAME_F1 (atanh), 1.50) +TEST_DISABLE_FENV (SV_NAME_F1 (atanh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 0, 0x1p-12, 1000) +TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 0x1p-12, 1, 20000) +TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 1, inf, 1000) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +TEST_CONTROL_VALUE (SV_NAME_F1 (atanh), 0) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/cbrt.c b/math/aarch64/sve/cbrt.c new file mode 100644 index 000000000000..3e6a972463f0 --- /dev/null +++ b/math/aarch64/sve/cbrt.c @@ -0,0 +1,135 @@ +/* + * Double-precision SVE cbrt(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" + +const static struct data +{ + float64_t poly[4]; + float64_t table[5]; + float64_t one_third, two_thirds, shift; + int64_t exp_bias; + uint64_t tiny_bound, thresh; +} data = { + /* Generated with FPMinimax in [0.5, 1]. */ + .poly = { 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, + 0x1.2c74eaa3ba428p-3, }, + /* table[i] = 2^((i - 2) / 3). */ + .table = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0, }, + .one_third = 0x1.5555555555555p-2, + .two_thirds = 0x1.5555555555555p-1, + .shift = 0x1.8p52, + .exp_bias = 1022, + .tiny_bound = 0x0010000000000000, /* Smallest normal. */ + .thresh = 0x7fe0000000000000, /* asuint64 (infinity) - tiny_bound. */ +}; + +#define MantissaMask 0x000fffffffffffff +#define HalfExp 0x3fe0000000000000 + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (cbrt, x, y, special); +} + +static inline svfloat64_t +shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i) +{ + return svld1_gather_index (pg, table, svadd_x (pg, i, 2)); +} + +/* Approximation for double-precision vector cbrt(x), using low-order + polynomial and two Newton iterations. + + The vector version of frexp does not handle subnormals + correctly. As a result these need to be handled by the scalar + fallback, where accuracy may be worse than that of the vector code + path. + + Greatest observed error in the normal range is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value m + * 2^e will be observed for any input m * 2^(e + 3*i), where i is an integer. + _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342 + want 0x1.965f53b0e5d95p-342. */ +svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svuint64_t iax = svreinterpret_u64 (ax); + svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax); + + /* Subnormal, +/-0 and special values. */ + svbool_t special = svcmpge (pg, svsub_x (pg, iax, d->tiny_bound), d->thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. 
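
Each svdiv/svmla pair below is one Newton step for a^3 = m. In scalar form (cbrt_newton_step_ref is a hypothetical name):

static double
cbrt_newton_step_ref (double a, double m)
{
  // Newton's method on f(a) = a^3 - m:
  // a' = a - f(a)/f'(a) = (2/3) a + (m/3) / a^2,
  // which is exactly the m_by_3 / (a*a) + (2/3)*a pairing used below;
  // the double-precision routine applies two such steps.
  return (2.0 / 3.0) * a + (m / 3.0) / (a * a);
}
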
*/
+  svfloat64_t m = svreinterpret_f64 (svorr_x (
+      pg, svand_x (pg, svreinterpret_u64 (x), MantissaMask), HalfExp));
+  svint64_t e
+      = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, iax, 52)), d->exp_bias);
+
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+     for Newton iterations.  */
+  svfloat64_t p
+      = sv_pairwise_poly_3_f64_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
+  svfloat64_t m_by_3 = svmul_x (pg, m, d->one_third);
+  svfloat64_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+			   d->two_thirds);
+  a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, a, a)), a, d->two_thirds);
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+     is an integer in [-2, 2], and can be looked up in the table T. Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+  svfloat64_t eb3f = svmul_x (pg, svcvt_f64_x (pg, e), d->one_third);
+  svint64_t ey = svcvt_s64_x (pg, eb3f);
+  svint64_t em3 = svmls_x (pg, e, ey, 3);
+
+  svfloat64_t my = shifted_lookup (pg, d->table, em3);
+  my = svmul_x (pg, my, a);
+
+  /* Vector version of ldexp.  */
+  svfloat64_t y = svscale_x (pg, my, ey);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (
+	x, svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)),
+	special);
+
+  /* Copy sign.  */
+  return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
+
+/* Worst-case ULP error assumes that the scalar fallback is GLIBC 2.40 cbrt,
+   which has a ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest
+   observed error in the vector path is 1.79 ULP.
+   [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical
+   Functions in Single, Double, Double Extended, and Quadruple Precision.  */
+TEST_SIG (SV, D, 1, cbrt, -10.0, 10.0)
+TEST_ULP (SV_NAME_D1 (cbrt), 3.17)
+TEST_DISABLE_FENV (SV_NAME_D1 (cbrt))
+TEST_SYM_INTERVAL (SV_NAME_D1 (cbrt), 0, inf, 1000000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/cbrtf.c b/math/aarch64/sve/cbrtf.c
new file mode 100644
index 000000000000..afdace7865f1
--- /dev/null
+++ b/math/aarch64/sve/cbrtf.c
@@ -0,0 +1,118 @@
+/*
+ * Single-precision SVE cbrt(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
+
+const static struct data
+{
+  float32_t poly[4];
+  float32_t table[5];
+  float32_t one_third, two_thirds;
+} data = {
+  /* Very rough approximation of cbrt(x) in [0.5, 1], generated with
+     FPMinimax.  */
+  .poly = { 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1,
+	    0x1.2c74c2p-3, },
+  /* table[i] = 2^((i - 2) / 3).  */
+  .table = { 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+  .one_third = 0x1.555556p-2f,
+  .two_thirds = 0x1.555556p-1f,
+};
+
+#define SmallestNormal 0x00800000
+#define Thresh 0x7f000000 /* asuint(INFINITY) - SmallestNormal.
*/ +#define MantissaMask 0x007fffff +#define HalfExp 0x3f000000 + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +{ + return sv_call_f32 (cbrtf, x, y, special); +} + +static inline svfloat32_t +shifted_lookup (const svbool_t pg, const float32_t *table, svint32_t i) +{ + return svld1_gather_index (pg, table, svadd_x (pg, i, 2)); +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVsMxv_cbrtf (0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. */ +svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + + /* Subnormal, +/-0 and special values. */ + svbool_t special = svcmpge (pg, svsub_x (pg, iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + svfloat32_t m = svreinterpret_f32 (svorr_x ( + pg, svand_x (pg, svreinterpret_u32 (x), MantissaMask), HalfExp)); + svint32_t e = svsub_x (pg, svreinterpret_s32 (svlsr_x (pg, iax, 23)), 126); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + svfloat32_t p + = sv_pairwise_poly_3_f32_x (pg, m, svmul_x (pg, m, m), d->poly); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + svfloat32_t m_by_3 = svmul_x (pg, m, d->one_third); + svfloat32_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p, + d->two_thirds); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + svfloat32_t ef = svmul_x (pg, svcvt_f32_x (pg, e), d->one_third); + svint32_t ey = svcvt_s32_x (pg, ef); + svint32_t em3 = svmls_x (pg, e, ey, 3); + + svfloat32_t my = shifted_lookup (pg, d->table, em3); + my = svmul_x (pg, my, a); + + /* Vector version of ldexpf. */ + svfloat32_t y = svscale_x (pg, my, ey); + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)), + special); + + /* Copy sign. */ + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); +} + +TEST_SIG (SV, F, 1, cbrt, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (cbrt), 1.15) +TEST_DISABLE_FENV (SV_NAME_F1 (cbrt)) +TEST_SYM_INTERVAL (SV_NAME_F1 (cbrt), 0, inf, 1000000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/cexpi.c b/math/aarch64/sve/cexpi.c new file mode 100644 index 000000000000..0ccd110484c8 --- /dev/null +++ b/math/aarch64/sve/cexpi.c @@ -0,0 +1,48 @@ +/* + * Double-precision vector cexpi function. + * + * Copyright (c) 2023-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_sincos_common.h" +#include "test_defs.h" + +static svfloat64x2_t NOINLINE +special_case (svfloat64_t x, svbool_t special, svfloat64x2_t y) +{ + return svcreate2 (sv_call_f64 (sin, x, svget2 (y, 0), special), + sv_call_f64 (cos, x, svget2 (y, 1), special)); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + sv_cexpi_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +svfloat64x2_t +_ZGVsMxv_cexpi (svfloat64_t x, svbool_t pg) +{ + const struct sv_sincos_data *d = ptr_barrier (&sv_sincos_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat64x2_t sc = sv_sincos_inline (pg, x, d); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, special, sc); + return sc; +} + +TEST_DISABLE_FENV (_ZGVsMxv_cexpi_sin) +TEST_DISABLE_FENV (_ZGVsMxv_cexpi_cos) +TEST_ULP (_ZGVsMxv_cexpi_sin, 2.73) +TEST_ULP (_ZGVsMxv_cexpi_cos, 2.73) +#define SV_CEXPI_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVsMxv_cexpi_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVsMxv_cexpi_cos, lo, hi, n) +SV_CEXPI_INTERVAL (0, 0x1p23, 500000) +SV_CEXPI_INTERVAL (-0, -0x1p23, 500000) +SV_CEXPI_INTERVAL (0x1p23, inf, 10000) +SV_CEXPI_INTERVAL (-0x1p23, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/cexpif.c b/math/aarch64/sve/cexpif.c new file mode 100644 index 000000000000..fd07ce553cd8 --- /dev/null +++ b/math/aarch64/sve/cexpif.c @@ -0,0 +1,50 @@ +/* + * Single-precision vector cexpi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_sincosf_common.h" +#include "test_defs.h" + +static svfloat32x2_t NOINLINE +special_case (svfloat32_t x, svbool_t special, svfloat32x2_t y) +{ + return svcreate2 (sv_call_f32 (sinf, x, svget2 (y, 0), special), + sv_call_f32 (cosf, x, svget2 (y, 1), special)); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. 
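
The shared-reduction idea can be pictured with the scalar sketch below. The in-tree sv_sincosf_common.h helper differs in detail (multi-word pi, two fixed polynomials), so treat this as an assumption-laden illustration, in double precision for simplicity, with libm sin/cos standing in for the polynomials; cexpi_ref is a hypothetical name:

#include <math.h>

static void
cexpi_ref (double x, double *s, double *c)
{
  const double pi = 0x1.921fb54442d18p+1;
  // One shared reduction x = q*pi + r with r in [-pi/2, pi/2]; then
  // sin(x) = (-1)^q sin(r) and cos(x) = (-1)^q cos(r), so a single
  // rounding and one reduced argument feed both results.
  double q = rint (x / pi);
  double r = x - q * pi;
  double parity = (fmod (q, 2.0) != 0.0) ? -1.0 : 1.0;
  *s = parity * sin (r);
  *c = parity * cos (r);
}
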
*/ +svfloat32x2_t +_ZGVsMxv_cexpif (svfloat32_t x, svbool_t pg) +{ + const struct sv_sincosf_data *d = ptr_barrier (&sv_sincosf_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat32x2_t sc = sv_sincosf_inline (pg, x, d); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, special, sc); + return sc; +} + +TEST_DISABLE_FENV (_ZGVsMxv_cexpif_sin) +TEST_DISABLE_FENV (_ZGVsMxv_cexpif_cos) +TEST_ULP (_ZGVsMxv_cexpif_sin, 1.17) +TEST_ULP (_ZGVsMxv_cexpif_cos, 1.31) +#define SV_CEXPIF_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVsMxv_cexpif_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVsMxv_cexpif_cos, lo, hi, n) +SV_CEXPIF_INTERVAL (0, 0x1p20, 500000) +SV_CEXPIF_INTERVAL (-0, -0x1p20, 500000) +SV_CEXPIF_INTERVAL (0x1p20, inf, 10000) +SV_CEXPIF_INTERVAL (-0x1p20, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/cos.c b/math/aarch64/sve/cos.c new file mode 100644 index 000000000000..93e93674a98a --- /dev/null +++ b/math/aarch64/sve/cos.c @@ -0,0 +1,88 @@ +/* + * Double-precision SVE cos(x) function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + double inv_pio2, pio2_1, pio2_2, pio2_3, shift; +} data = { + /* Polynomial coefficients are hardwired in FTMAD instructions. */ + .inv_pio2 = 0x1.45f306dc9c882p-1, + .pio2_1 = 0x1.921fb50000000p+0, + .pio2_2 = 0x1.110b460000000p-26, + .pio2_3 = 0x1.1a62633145c07p-54, + /* Original shift used in AdvSIMD cos, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ + .shift = 0x1.8000000000001p52 +}; + +#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */ + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t oob) +{ + return sv_call_f64 (cos, x, y, oob); +} + +/* A fast SVE implementation of cos based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 2.108 ULPs. + SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 + want -0x1.fddd4c65c7f05p-3. */ +svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t r = svabs_x (pg, x); + svbool_t oob = svcmpge (pg, svreinterpret_u64 (r), RangeVal); + + /* Load some constants in quad-word chunks to minimise memory access. */ + svbool_t ptrue = svptrue_b64 (); + svfloat64_t invpio2_and_pio2_1 = svld1rq (ptrue, &d->inv_pio2); + svfloat64_t pio2_23 = svld1rq (ptrue, &d->pio2_2); + + /* n = rint(|x|/(pi/2)). */ + svfloat64_t q = svmla_lane (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0); + svfloat64_t n = svsub_x (pg, q, d->shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ + r = svmls_lane (r, n, invpio2_and_pio2_1, 1); + r = svmls_lane (r, n, pio2_23, 0); + r = svmls_lane (r, n, pio2_23, 1); + + /* cos(r) poly approx. */ + svfloat64_t r2 = svtsmul (r, svreinterpret_u64 (q)); + svfloat64_t y = sv_f64 (0.0); + y = svtmad (y, r2, 7); + y = svtmad (y, r2, 6); + y = svtmad (y, r2, 5); + y = svtmad (y, r2, 4); + y = svtmad (y, r2, 3); + y = svtmad (y, r2, 2); + y = svtmad (y, r2, 1); + y = svtmad (y, r2, 0); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + svfloat64_t f = svtssel (r, svreinterpret_u64 (q)); + + if (unlikely (svptest_any (pg, oob))) + return special_case (x, svmul_x (svnot_z (pg, oob), y, f), oob); + + /* Apply factor. 
*/ + return svmul_x (pg, f, y); +} + +TEST_SIG (SV, D, 1, cos, -3.1, 3.1) +TEST_ULP (SV_NAME_D1 (cos), 1.61) +TEST_DISABLE_FENV (SV_NAME_D1 (cos)) +TEST_INTERVAL (SV_NAME_D1 (cos), 0, 0xffff0000, 10000) +TEST_INTERVAL (SV_NAME_D1 (cos), 0x1p-4, 0x1p4, 500000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/cosf.c b/math/aarch64/sve/cosf.c new file mode 100644 index 000000000000..7d18f8c2ad21 --- /dev/null +++ b/math/aarch64/sve/cosf.c @@ -0,0 +1,82 @@ +/* + * Single-precision SVE cos(x) function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift; +} data = { + /* Polynomial coefficients are hard-wired in FTMAD instructions. */ + .neg_pio2_1 = -0x1.921fb6p+0f, + .neg_pio2_2 = 0x1.777a5cp-25f, + .neg_pio2_3 = 0x1.ee59dap-50f, + .inv_pio2 = 0x1.45f306p-1f, + /* Original shift used in AdvSIMD cosf, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ + .shift = 0x1.800002p+23f +}; + +#define RangeVal 0x49800000 /* asuint32(0x1p20f). */ + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t oob) +{ + return sv_call_f32 (cosf, x, y, oob); +} + +/* A fast SVE implementation of cosf based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 2.06 ULPs. + SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6 + want 0x1.fffe76p-6. */ +svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t r = svabs_x (pg, x); + svbool_t oob = svcmpge (pg, svreinterpret_u32 (r), RangeVal); + + /* Load some constants in quad-word chunks to minimise memory access. */ + svfloat32_t negpio2_and_invpio2 = svld1rq (svptrue_b32 (), &d->neg_pio2_1); + + /* n = rint(|x|/(pi/2)). */ + svfloat32_t q = svmla_lane (sv_f32 (d->shift), r, negpio2_and_invpio2, 3); + svfloat32_t n = svsub_x (pg, q, d->shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ + r = svmla_lane (r, n, negpio2_and_invpio2, 0); + r = svmla_lane (r, n, negpio2_and_invpio2, 1); + r = svmla_lane (r, n, negpio2_and_invpio2, 2); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + svfloat32_t f = svtssel (r, svreinterpret_u32 (q)); + + /* cos(r) poly approx. */ + svfloat32_t r2 = svtsmul (r, svreinterpret_u32 (q)); + svfloat32_t y = sv_f32 (0.0f); + y = svtmad (y, r2, 4); + y = svtmad (y, r2, 3); + y = svtmad (y, r2, 2); + y = svtmad (y, r2, 1); + y = svtmad (y, r2, 0); + + if (unlikely (svptest_any (pg, oob))) + return special_case (x, svmul_x (svnot_z (pg, oob), f, y), oob); + /* Apply factor. */ + return svmul_x (pg, f, y); +} + +TEST_SIG (SV, F, 1, cos, -3.1, 3.1) +TEST_ULP (SV_NAME_F1 (cos), 1.57) +TEST_DISABLE_FENV (SV_NAME_F1 (cos)) +TEST_INTERVAL (SV_NAME_F1 (cos), 0, 0xffff0000, 10000) +TEST_INTERVAL (SV_NAME_F1 (cos), 0x1p-4, 0x1p4, 500000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/cosh.c b/math/aarch64/sve/cosh.c new file mode 100644 index 000000000000..775854cfbe5a --- /dev/null +++ b/math/aarch64/sve/cosh.c @@ -0,0 +1,104 @@ +/* + * Double-precision SVE cosh(x) function. + * + * Copyright (c) 2023-2025, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64_t poly[3]; + float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; + uint64_t special_bound; +} data = { + .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, + 0x1.5555576a59599p-5, }, + + .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */ + /* -ln2/N. */ + .ln2_hi = -0x1.62e42fefa39efp-9, + .ln2_lo = -0x1.abc9e3b39803f3p-64, + .shift = 0x1.8p+52, + .thres = 704.0, + + /* 0x1.6p9, above which exp overflows. */ + .special_bound = 0x4086000000000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special) +{ + svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + svfloat64_t y = svadd_x (pg, half_t, half_over_t); + return sv_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from sv_exp_tail, with no + special-case handling or tail. */ +static inline svfloat64_t +exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +{ + /* Calculate exp(x). */ + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svfloat64_t n = svsub_x (pg, z, d->shift); + + svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi); + r = svmla_x (pg, r, n, d->ln2_lo); + + svuint64_t u = svreinterpret_u64 (z); + svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); + svuint64_t i = svand_x (svptrue_b64 (), u, 0xff); + + svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); + y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); + y = svmla_x (pg, sv_f64 (1.0), r, y); + y = svmul_x (svptrue_b64 (), r, y); + + /* s = 2^(n/N). */ + u = svld1_gather_index (pg, __v_exp_tail_data, i); + svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e)); + + return svmla_x (pg, s, s, y); +} + +/* Approximation for SVE double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021 + want 0x1.fd774e958236fp+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8 + want 0x1.f5e2bb8d5c991p+8. */ +svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound); + + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + svfloat64_t t = exp_inline (ax, pg, d); + + /* Fall back to scalar for any special cases. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, pg, t, special); + + svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + return svadd_x (pg, half_t, half_over_t); +} + +TEST_SIG (SV, D, 1, cosh, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (cosh), 1.43) +TEST_DISABLE_FENV (SV_NAME_D1 (cosh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0x1.6p9, inf, 1000) +CLOSE_SVE_ATTR
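
Both cosh routines above finish the same way. A scalar sketch, with libm exp standing in for the inlined exp helpers; cosh_ref is a hypothetical name:

#include <math.h>

static double
cosh_ref (double x)
{
  double t = exp (fabs (x));
  // cosh(x) = exp(|x|)/2 + 1/(2*exp(|x|)).  Note that
  // svdivr_x (pg, t, 0.5) is a reversed divide, i.e. 0.5 / t, which
  // supplies the second term without forming 1/t separately.
  return 0.5 * t + 0.5 / t;
}
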
\ No newline at end of file diff --git a/math/aarch64/sve/coshf.c b/math/aarch64/sve/coshf.c new file mode 100644 index 000000000000..b79fed2374b5 --- /dev/null +++ b/math/aarch64/sve/coshf.c @@ -0,0 +1,62 @@ +/* + * Single-precision SVE cosh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_expf_inline.h" + +static const struct data +{ + struct sv_expf_data expf_consts; + float special_bound; +} data = { + .expf_consts = SV_EXPF_DATA, + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .special_bound = 0x1.5a92d8p+6, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e, + svbool_t pg) +{ + return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e), + pg); +} + +/* Single-precision vector cosh, using vector expf. + Maximum error is 2.77 ULP: + _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2 + want 0x1.e4594cp+2. */ +svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svbool_t special = svacge (pg, x, d->special_bound); + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. + Note that x is passed to exp here, rather than |x|. This is to avoid using + destructive unary ABS for better register usage. However it means the + routine is not exactly symmetrical, as the exp helper is slightly less + accurate in the negative range. */ + svfloat32_t e = expf_inline (x, pg, &d->expf_consts); + svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5); + svfloat32_t half_over_e = svdivr_x (pg, e, 0.5); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, half_e, half_over_e, special); + + return svadd_x (svptrue_b32 (), half_e, half_over_e); +} + +TEST_SIG (SV, F, 1, cosh, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (cosh), 2.28) +TEST_DISABLE_FENV (SV_NAME_F1 (cosh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1p-63, 100) +TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000) +TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/cospi.c b/math/aarch64/sve/cospi.c new file mode 100644 index 000000000000..9859dbe7a44c --- /dev/null +++ b/math/aarch64/sve/cospi.c @@ -0,0 +1,66 @@ +/* + * Double-precision SVE cospi(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "mathlib.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" + +static const struct data +{ + double poly[10]; + double range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, + 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 }, + .range_val = 0x1p53, +}; + +/* A fast SVE implementation of cospi. + Maximum error 3.20 ULP: + _ZGVsMxv_cospi(0x1.f18ba32c63159p-6) got 0x1.fdabf595f9763p-1 + want 0x1.fdabf595f9766p-1. */ +svfloat64_t SV_NAME_D1 (cospi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Using cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. 
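
A scalar sketch of this reduction, with libm sin standing in for the sinpi polynomial; cospi_ref is a hypothetical name:

#include <math.h>

static double
cospi_ref (double x)
{
  const double pi = 0x1.921fb54442d18p+1;
  // Write x = n + f with n = rint(x) and |f| <= 1/2; then
  // cospi(x) = (-1)^n * cos(pi*f) = (-1)^n * sinpi(0.5 - |f|).
  double n = rint (x);
  double r = 0.5 - fabs (x - n);
  double y = sin (pi * r); // stand-in for the odd sinpi polynomial
  return (fmod (n, 2.0) != 0.0) ? -y : y;
}
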
*/
+  svfloat64_t n = svrinta_x (pg, x);
+  svfloat64_t r = svsub_x (pg, x, n);
+  r = svsub_x (pg, sv_f64 (0.5), svabs_x (pg, r));
+
+  /* The result should be negated if n is odd.
+     If ax >= 2^53, the result will always be positive.  */
+  svbool_t cmp = svaclt (pg, x, d->range_val);
+  svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n));
+  svuint64_t sign = svlsl_z (cmp, intn, 63);
+
+  /* y = sin(r).  */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t r4 = svmul_x (pg, r2, r2);
+  svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly);
+  y = svmul_x (pg, y, r);
+
+  return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (SV_NAME_D1 (cospi), 2.71)
+TEST_DISABLE_FENV (SV_NAME_D1 (cospi))
+TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p51, inf, 100000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/cospif.c b/math/aarch64/sve/cospif.c
new file mode 100644
index 000000000000..d65a2b619023
--- /dev/null
+++ b/math/aarch64/sve/cospif.c
@@ -0,0 +1,62 @@
+/*
+ * Single-precision SVE cospi(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "mathlib.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_poly_f32.h"
+
+static const struct data
+{
+  float poly[6];
+  float range_val;
+} data = {
+  /* Taylor series coefficients for sin(pi * x).  */
+  .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+	    0x1.50783p-4f, -0x1.e30750p-8f },
+  .range_val = 0x1p31f,
+};
+
+/* A fast SVE implementation of cospif.
+   Maximum error: 2.60 ULP:
+   _ZGVsMxv_cospif(+/-0x1.cae664p-4) got 0x1.e09c9ep-1
+				     want 0x1.e09c98p-1.  */
+svfloat32_t SV_NAME_F1 (cospi) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Using cospi(x) = sinpi(0.5 - x)
+     range reduction and offset into sinpi range -1/2 .. 1/2
+     r = 0.5 - |x - rint(x)|.  */
+  svfloat32_t n = svrinta_x (pg, x);
+  svfloat32_t r = svsub_x (pg, x, n);
+  r = svsub_x (pg, sv_f32 (0.5f), svabs_x (pg, r));
+
+  /* The result should be negated if n is odd.
+     If ax >= 2^31, the result will always be positive.  */
+  svbool_t cmp = svaclt (pg, x, d->range_val);
+  svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n));
+  svuint32_t sign = svlsl_z (cmp, intn, 31);
+
+  /* y = sin(r).  */
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly);
+  y = svmul_x (pg, y, r);
+
+  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (SV_NAME_F1 (cospi), 2.08)
+TEST_DISABLE_FENV (SV_NAME_F1 (cospi))
+TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0.5, 0x1p31f, 10000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p31f, inf, 10000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/erf.c b/math/aarch64/sve/erf.c
new file mode 100644
index 000000000000..ccade93e1033
--- /dev/null
+++ b/math/aarch64/sve/erf.c
@@ -0,0 +1,115 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  double third;
+  double tenth, two_over_five, two_over_fifteen;
+  double two_over_nine, two_over_fortyfive;
+  double max, shift;
+} data = {
+  .third = 0x1.5555555555556p-2, /* used to compute 2/3 and 1/6 too.  */
+  .two_over_fifteen = 0x1.1111111111111p-3,
+  .tenth = -0x1.999999999999ap-4,
+  .two_over_five = -0x1.999999999999ap-2,
+  .two_over_nine = -0x1.c71c71c71c71cp-3,
+  .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
+  .max = 5.9921875, /* 6 - 1/128.  */
+  .shift = 0x1p45,
+};
+
+#define SignMask (0x8000000000000000)
+
+/* Double-precision implementation of vector erf(x).
+   Approximation based on series expansion near x rounded to
+   nearest multiple of 1/128.
+   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+   erf(x) ~ erf(r) + scale * d * [
+       + 1
+       - r d
+       + 1/3 (2 r^2 - 1) d^2
+       - 1/6 (r (2 r^2 - 3)) d^3
+       + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+       - 1/90 (4 r^4 - 20 r^2 + 15) d^5
+     ]
+
+   Maximum measured error: 2.29 ULP
+   _ZGVsMxv_erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+				       want -0x1.20dd59132ebafp-8.  */
+svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *dat = ptr_barrier (&data);
+
+  /* |x| >= 6.0 - 1/128. The two conditions are opposites, except that
+     neither of them catches NaNs, so they can safely be used in lookups
+     and BSLs to yield the expected results.  */
+  svbool_t a_ge_max = svacge (pg, x, dat->max);
+  svbool_t a_lt_max = svaclt (pg, x, dat->max);
+
+  /* Set r to multiple of 1/128 nearest to |x|.  */
+  svfloat64_t a = svabs_x (pg, x);
+  svfloat64_t shift = sv_f64 (dat->shift);
+  svfloat64_t z = svadd_x (pg, a, shift);
+  svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff);
+  i = svadd_x (pg, i, i);
+
+  /* Lookup without shortcut for small values but with predicate to avoid
+     segfault for large values and NaNs.  */
+  svfloat64_t r = svsub_x (pg, z, shift);
+  svfloat64_t erfr
+      = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i);
+  svfloat64_t scale
+      = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i);
+
+  /* erf(x) ~ erf(r) + scale * d * poly (r, d).  */
+  svfloat64_t d = svsub_x (pg, a, r);
+  svfloat64_t d2 = svmul_x (pg, d, d);
+  svfloat64_t r2 = svmul_x (pg, r, r);
+
+  /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5.  */
+  svfloat64_t p1 = r;
+  svfloat64_t third = sv_f64 (dat->third);
+  svfloat64_t twothird = svmul_x (pg, third, 2.0);
+  svfloat64_t sixth = svmul_x (pg, third, 0.5);
+  svfloat64_t p2 = svmls_x (pg, third, r2, twothird);
+  svfloat64_t p3 = svmad_x (pg, r2, third, -0.5);
+  p3 = svmul_x (pg, r, p3);
+  svfloat64_t p4
+      = svmla_x (pg, sv_f64 (dat->two_over_five), r2, dat->two_over_fifteen);
+  p4 = svmls_x (pg, sv_f64 (dat->tenth), r2, p4);
+  svfloat64_t p5
+      = svmla_x (pg, sv_f64 (dat->two_over_nine), r2, dat->two_over_fortyfive);
+  p5 = svmla_x (pg, sixth, r2, p5);
+  p5 = svmul_x (pg, r, p5);
+
+  svfloat64_t p34 = svmla_x (pg, p3, d, p4);
+  svfloat64_t p12 = svmla_x (pg, p1, d, p2);
+  svfloat64_t y = svmla_x (pg, p34, d2, p5);
+  y = svmla_x (pg, p12, d2, y);
+
+  y = svmla_x (pg, erfr, scale, svmls_x (pg, d, d2, y));
+
+  /* Solves the |x| = inf and NaN cases.  */
+  y = svsel (a_ge_max, sv_f64 (1.0), y);
+
+  /* Copy sign.
*/ + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); + svuint64_t sign = svand_x (pg, ix, SignMask); + return svreinterpret_f64 (svorr_x (pg, sign, iy)); +} + +TEST_SIG (SV, D, 1, erf, -6.0, 6.0) +TEST_ULP (SV_NAME_D1 (erf), 1.79) +TEST_DISABLE_FENV (SV_NAME_D1 (erf)) +TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, 5.9921875, 40000) +TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 5.9921875, inf, 40000) +TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, inf, 4000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/erfc.c b/math/aarch64/sve/erfc.c new file mode 100644 index 000000000000..a85cacb1ae62 --- /dev/null +++ b/math/aarch64/sve/erfc.c @@ -0,0 +1,166 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64_t off_idx, off_arr; + double max, shift; + double p20, p40, p41, p42; + double p51, p52; + double q5, r5; + double q6, r6; + double q7, r7; + double q8, r8; + double q9, r9; + uint64_t table_scale; +} data = { + /* Set an offset so the range of the index used for lookup is 3487, and it + can be clamped using a saturated add on an offset index. + Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */ + .off_idx = 0xbd3ffffffffff260, + .off_arr = 0xfffffffffffff260, /* 0xffffffffffffffff - 3487. */ + .max = 0x1.b3ep+4, /* 3487/128. */ + .shift = 0x1p45, + .table_scale = 0x37f0000000000000, /* asuint64(0x1p-128). */ + .p20 = 0x1.5555555555555p-2, /* 1/3, used to compute 2/3 and 1/6. */ + .p40 = -0x1.999999999999ap-4, /* 1/10. */ + .p41 = -0x1.999999999999ap-2, /* 2/5. */ + .p42 = 0x1.1111111111111p-3, /* 2/15. */ + .p51 = -0x1.c71c71c71c71cp-3, /* 2/9. */ + .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */ + /* Qi = (i+1) / i, for i = 5, ..., 9. */ + .q5 = 0x1.3333333333333p0, + .q6 = 0x1.2aaaaaaaaaaabp0, + .q7 = 0x1.2492492492492p0, + .q8 = 0x1.2p0, + .q9 = 0x1.1c71c71c71c72p0, + /* Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */ + .r5 = -0x1.e79e79e79e79ep-3, + .r6 = -0x1.b6db6db6db6dbp-3, + .r7 = -0x1.8e38e38e38e39p-3, + .r8 = -0x1.6c16c16c16c17p-3, + .r9 = -0x1.4f2094f2094f2p-3, +}; + +/* Optimized double-precision vector erfc(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5 + + p6(r) d^6 + ... + p10(r) d^10 + + Polynomials p6(r) to p10(r) are computed using recurrence relation + + 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0, + with p0 = 1, and p1(r) = -r. + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum measured error: 1.71 ULP + _ZGVsMxv_erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608 + want 0x1.e15fcbea3e7adp-608. */ +svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg) +{ + const struct data *dat = ptr_barrier (&data); + + svfloat64_t a = svabs_x (pg, x); + + /* Clamp input at |x| <= 3487/128. */ + a = svmin_x (pg, a, dat->max); + + /* Reduce x to the nearest multiple of 1/128. 
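
The shift trick used here (and in erf/erff) can be checked in scalar code; nearest_128th_ref is a hypothetical name:

#include <stdint.h>
#include <string.h>

static double
nearest_128th_ref (double a, uint64_t *idx)
{
  // Adding 0x1p45 to a (0 <= a < 32) leaves only 7 fractional bits, so
  // the FP add itself rounds a to the nearest 1/128, and the low
  // mantissa bits of the sum hold the integer round(128*a).
  double z = a + 0x1p45;
  uint64_t bits;
  memcpy (&bits, &z, sizeof bits);
  *idx = bits & 0xfff; // erf masks the index; erfc instead clamps it
		       // with a saturating add on an offset index.
  return z - 0x1p45;   // r: a rounded to a multiple of 1/128.
}
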
*/ + svfloat64_t shift = sv_f64 (dat->shift); + svfloat64_t z = svadd_x (pg, a, shift); + + /* Saturate index for the NaN case. */ + svuint64_t i = svqadd (svreinterpret_u64 (z), dat->off_idx); + + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ + i = svadd_x (pg, i, i); + const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr; + svfloat64_t erfcr = svld1_gather_index (pg, p, i); + svfloat64_t scale = svld1_gather_index (pg, p + 1, i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + svfloat64_t r = svsub_x (pg, z, shift); + svfloat64_t d = svsub_x (pg, a, r); + svfloat64_t d2 = svmul_x (pg, d, d); + svfloat64_t r2 = svmul_x (pg, r, r); + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p9(r) * d^9. */ + svfloat64_t p1 = r; + svfloat64_t third = sv_f64 (dat->p20); + svfloat64_t twothird = svmul_x (pg, third, 2.0); + svfloat64_t sixth = svmul_x (pg, third, 0.5); + svfloat64_t p2 = svmls_x (pg, third, r2, twothird); + svfloat64_t p3 = svmad_x (pg, r2, third, -0.5); + p3 = svmul_x (pg, r, p3); + svfloat64_t p4 = svmla_x (pg, sv_f64 (dat->p41), r2, dat->p42); + p4 = svmls_x (pg, sv_f64 (dat->p40), r2, p4); + svfloat64_t p5 = svmla_x (pg, sv_f64 (dat->p51), r2, dat->p52); + p5 = svmla_x (pg, sixth, r2, p5); + p5 = svmul_x (pg, r, p5); + /* Compute p_i using recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ + svfloat64_t qr5 = svld1rq (svptrue_b64 (), &dat->q5); + svfloat64_t qr6 = svld1rq (svptrue_b64 (), &dat->q6); + svfloat64_t qr7 = svld1rq (svptrue_b64 (), &dat->q7); + svfloat64_t qr8 = svld1rq (svptrue_b64 (), &dat->q8); + svfloat64_t qr9 = svld1rq (svptrue_b64 (), &dat->q9); + svfloat64_t p6 = svmla_x (pg, p4, p5, svmul_lane (r, qr5, 0)); + p6 = svmul_lane (p6, qr5, 1); + svfloat64_t p7 = svmla_x (pg, p5, p6, svmul_lane (r, qr6, 0)); + p7 = svmul_lane (p7, qr6, 1); + svfloat64_t p8 = svmla_x (pg, p6, p7, svmul_lane (r, qr7, 0)); + p8 = svmul_lane (p8, qr7, 1); + svfloat64_t p9 = svmla_x (pg, p7, p8, svmul_lane (r, qr8, 0)); + p9 = svmul_lane (p9, qr8, 1); + svfloat64_t p10 = svmla_x (pg, p8, p9, svmul_lane (r, qr9, 0)); + p10 = svmul_lane (p10, qr9, 1); + /* Compute polynomial in d using pairwise Horner scheme. */ + svfloat64_t p90 = svmla_x (pg, p9, d, p10); + svfloat64_t p78 = svmla_x (pg, p7, d, p8); + svfloat64_t p56 = svmla_x (pg, p5, d, p6); + svfloat64_t p34 = svmla_x (pg, p3, d, p4); + svfloat64_t p12 = svmla_x (pg, p1, d, p2); + svfloat64_t y = svmla_x (pg, p78, d2, p90); + y = svmla_x (pg, p56, d2, y); + y = svmla_x (pg, p34, d2, y); + y = svmla_x (pg, p12, d2, y); + + y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y)); + + /* Offset equals 2.0 if sign, else 0.0. */ + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t off = svreinterpret_f64 (svlsr_x (pg, sign, 1)); + /* Handle sign and scale back in a single fma. 
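
The two bit tricks can be verified in scalar code; erfc_finalize_ref is a hypothetical name and the constants are the ones defined in the data struct above:

#include <stdint.h>
#include <string.h>

static double
erfc_finalize_ref (double y_scaled, int negative)
{
  uint64_t sign = negative ? 0x8000000000000000ull : 0;
  // Shifting the sign bit right once yields asuint64(2.0) or 0, giving
  // the offset in erfc(-x) = 2 - erfc(x).
  uint64_t off_bits = sign >> 1;
  // OR-ing the sign into asuint64(0x1p-128) yields +/-2^-128, undoing
  // the scaling of the table entries with the correct sign.
  uint64_t fac_bits = sign | 0x37f0000000000000ull;
  double off, fac;
  memcpy (&off, &off_bits, sizeof off);
  memcpy (&fac, &fac_bits, sizeof fac);
  return off + fac * y_scaled; // single fma in the vector code
}
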
*/ + svfloat64_t fac = svreinterpret_f64 (svorr_x (pg, sign, dat->table_scale)); + + return svmla_x (pg, off, fac, y); +} + +TEST_SIG (SV, D, 1, erfc, -6.0, 28.0) +TEST_ULP (SV_NAME_D1 (erfc), 1.21) +TEST_DISABLE_FENV (SV_NAME_D1 (erfc)) +TEST_SYM_INTERVAL (SV_NAME_D1 (erfc), 0.0, 0x1p-26, 40000) +TEST_INTERVAL (SV_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) +TEST_INTERVAL (SV_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) +TEST_INTERVAL (SV_NAME_D1 (erfc), 28.0, inf, 40000) +TEST_INTERVAL (SV_NAME_D1 (erfc), 6.0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/erfcf.c b/math/aarch64/sve/erfcf.c new file mode 100644 index 000000000000..936881332291 --- /dev/null +++ b/math/aarch64/sve/erfcf.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint32_t off_idx, off_arr; + float max, shift; + float third, two_thirds, two_over_fifteen, two_over_five, tenth; +} data = { + /* Set an offset so the range of the index used for lookup is 644, and it can + be clamped using a saturated add. */ + .off_idx = 0xb7fffd7b, /* 0xffffffff - asuint(shift) - 644. */ + .off_arr = 0xfffffd7b, /* 0xffffffff - 644. */ + .max = 10.0625f, /* 644/64. */ + .shift = 0x1p17f, + .third = 0x1.555556p-2f, + .two_thirds = 0x1.555556p-1f, + .two_over_fifteen = 0x1.111112p-3f, + .two_over_five = -0x1.99999ap-2f, + .tenth = -0x1.99999ap-4f, +}; + +#define SignMask 0x80000000 +#define TableScale 0x28000000 /* 0x1p-47. */ + +/* Optimized single-precision vector erfcf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/64. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). + _ZGVsMxv_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 + want 0x1.f51216p-120. */ +svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) +{ + const struct data *dat = ptr_barrier (&data); + + svfloat32_t a = svabs_x (pg, x); + + /* Clamp input at |x| <= 10.0 + 4/64. */ + a = svmin_x (pg, a, dat->max); + + /* Reduce x to the nearest multiple of 1/64. */ + svfloat32_t shift = sv_f32 (dat->shift); + svfloat32_t z = svadd_x (pg, a, shift); + + /* Saturate index for the NaN case. */ + svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx); + + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ + i = svlsl_x (svptrue_b32 (), i, 1); + const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr; + svfloat32_t erfcr = svld1_gather_index (pg, p, i); + svfloat32_t scale = svld1_gather_index (pg, p + 1, i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). 
*/ + svfloat32_t r = svsub_x (pg, z, shift); + svfloat32_t d = svsub_x (pg, a, r); + svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + + svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third); + + svfloat32_t p1 = r; + svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1); + svfloat32_t p3 + = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); + svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2); + p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4); + + svfloat32_t y = svmla_x (pg, p3, d, p4); + y = svmla_x (pg, p2, d, y); + y = svmla_x (pg, p1, d, y); + + /* Solves the |x| = inf/nan case. */ + y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y)); + + /* Offset equals 2.0f if sign, else 0.0f. */ + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), SignMask); + svfloat32_t off = svreinterpret_f32 (svlsr_x (pg, sign, 1)); + /* Handle sign and scale back in a single fma. */ + svfloat32_t fac = svreinterpret_f32 (svorr_x (pg, sign, TableScale)); + + return svmla_x (pg, off, fac, y); +} + +TEST_SIG (SV, F, 1, erfc, -4.0, 10.0) +TEST_ULP (SV_NAME_F1 (erfc), 1.14) +TEST_DISABLE_FENV (SV_NAME_F1 (erfc)) +TEST_SYM_INTERVAL (SV_NAME_F1 (erfc), 0.0, 0x1p-26, 40000) +TEST_INTERVAL (SV_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) +TEST_INTERVAL (SV_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) +TEST_INTERVAL (SV_NAME_F1 (erfc), 10.0625, inf, 40000) +TEST_INTERVAL (SV_NAME_F1 (erfc), -4.0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/erff.c b/math/aarch64/sve/erff.c new file mode 100644 index 000000000000..c8c87499a63f --- /dev/null +++ b/math/aarch64/sve/erff.c @@ -0,0 +1,91 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float min, max, scale, shift, third; +} data = { + .min = 0x1.cp-7f, /* 1/64 - 1/512. */ + .max = 3.9375, /* 4 - 8/128. */ + .scale = 0x1.20dd76p+0f, /* 2/sqrt(pi). */ + .shift = 0x1p16f, + .third = 0x1.555556p-2f, /* 1/3. */ +}; + +#define SignMask (0x80000000) + +/* Single-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2] + + Values of erf(r) and scale are read from lookup tables. + For |x| < 0x1.cp-7, the algorithm sets r = 0, erf(r) = 0, and scale = 2 / + sqrt(pi), so it simply boils down to a Taylor series expansion near 0. For + |x| > 3.9375, erf(|x|) rounds to 1.0f. + + Maximum error on each interval: + - [0, 0x1.cp-7]: 1.93 ULP + _ZGVsMxv_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 want 0x1.fd6868p-9 + - [0x1.cp-7, 4.0]: 1.26 ULP + _ZGVsMxv_erff(0x1.1d002ep+0) got 0x1.c4eb9ap-1 want 0x1.c4eb98p-1. */ +svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg) +{ + const struct data *dat = ptr_barrier (&data); + + /* |x| > 1/64 - 1/512. */ + svbool_t a_gt_min = svacgt (pg, x, dat->min); + + /* |x| >= 4.0 - 8/128. */ + svbool_t a_ge_max = svacge (pg, x, dat->max); + svfloat32_t a = svabs_x (pg, x); + + svfloat32_t shift = sv_f32 (dat->shift); + svfloat32_t z = svadd_x (pg, a, shift); + svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff); + i = svadd_x (pg, i, i); + + /* r and erf(r) set to 0 for |x| below min. 
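+
+ With r == 0 and erf(r) == 0 the expression below collapses to
+ scale * (d - d^3/3), i.e. the first two Taylor terms of erf about 0. A
+ scalar sketch of that small-|x| path:
+
+ float scale = 0x1.20dd76p+0f;               // 2/sqrt(pi)
+ float d = fabsf (x);                        // d == a, since r == 0
+ float y = scale * (d - d * d * d / 3.0f);   // erf(|x|) for tiny |x|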
*/
+ svfloat32_t r = svsub_z (a_gt_min, z, shift);
+ svfloat32_t erfr
+ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i);
+
+ /* scale set to 2/sqrt(pi) for |x| below min. */
+ svfloat32_t scale
+ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i);
+ scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));
+
+ /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
+ svfloat32_t d = svsub_x (pg, a, r);
+ svfloat32_t d2 = svmul_x (pg, d, d);
+ svfloat32_t y = svmla_x (pg, r, d, dat->third);
+ y = svmla_x (pg, erfr, scale, svmls_x (pg, d, d2, y));
+
+ /* Solves the |x| = inf case. */
+ y = svsel (a_ge_max, sv_f32 (1.0f), y);
+
+ /* Copy sign. */
+ svuint32_t ix = svreinterpret_u32 (x);
+ svuint32_t iy = svreinterpret_u32 (y);
+ svuint32_t sign = svand_x (pg, ix, SignMask);
+ return svreinterpret_f32 (svorr_x (pg, sign, iy));
+}
+
+TEST_SIG (SV, F, 1, erf, -4.0, 4.0)
+TEST_ULP (SV_NAME_F1 (erf), 1.43)
+TEST_DISABLE_FENV (SV_NAME_F1 (erf))
+TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, 0x1.cp-7, 40000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0x1.cp-7, 3.9375, 40000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 3.9375, inf, 40000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, inf, 4000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/exp.c b/math/aarch64/sve/exp.c
new file mode 100644
index 000000000000..b021e64ffedf
--- /dev/null
+++ b/math/aarch64/sve/exp.c
@@ -0,0 +1,141 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2023-2025, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ double c0, c2;
+ double c1, c3;
+ double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+
+} data = {
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* 1/ln2. */
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ /* 1.5*2^46+1023. This value is further explained below. */
+ .shift = 0x1.800000000ffc0p+46,
+ .thres = 704.0,
+};
+
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Update of both special and non-special cases, if any special case is
+ detected. */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
+{
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to 0x6, 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b
+ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
+
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ ie. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows.
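+
+ A scalar sketch of this split (asuint64/asdouble are hypothetical bit-cast
+ helpers):
+
+ uint64_t b = n <= 0 ? 0x6000000000000000 : 0;  // exponent offset 2^513
+ double s1 = asdouble (0x7000000000000000 - b); // 2^769 or 2^-767
+ double s2 = asdouble (asuint64 (s) - 0x3010000000000000 + b);
+ double res = fabs (n) > 1280 ? s1 * s1 : s1 * (s2 + s2 * y);
+
+ When |n| > 1280 the result is s1 * s1, which rounds to inf for n > 0 and
+ to 0 for n <= 0, giving the correctly signed overflow or underflow;
+ otherwise s1 * s2 == s exactly, so s1 * (s2 + s2 * y) == s * (1 + y)
+ without the intermediate products over- or underflowing.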
*/
+ svbool_t p_cmp = svacgt (pg, n, 1280.0);
+
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
+
+ return svsel (p_cmp, r1, r0);
+}
+
+/* SVE exp algorithm. Maximum measured error is 1.01ulps:
+ SV_NAME_D1 (exp)(0x1.4619d7b04da41p+6) got 0x1.885d9acc41da7p+117
+ want 0x1.885d9acc41da6p+117. */
+svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svbool_t special = svacgt (pg, x, d->thres);
+
+ /* Use a modified version of the shift used for flooring, such that x/ln2 is
+ rounded to a multiple of 2^-6=1/64, shift = 1.5 * 2^52 * 2^-6 = 1.5 *
+ 2^46.
+
+ n is not an integer but can be written as n = m + i/64, with i and m
+ integer, 0 <= i < 64 and m <= n.
+
+ Bits 5:0 of z will be zero every time x/ln2 reaches a new integer value
+ (n=m, i=0), and are incremented every time z (or n) is incremented by 1/64.
+ FEXPA expects i in bits 5:0 of the input so it can be used as an index
+ into the FEXPA hardwired table T[i] = 2^(i/64) for i = 0:63, which will in
+ turn populate the mantissa of the output. Therefore, we use u=asuint(z) as
+ input to FEXPA.
+
+ We add 1023 to the modified shift value in order to set bits 16:6 of u to
+ 1, such that once these bits are moved to the exponent of the output of
+ FEXPA, we get the exponent of 2^n right, i.e. we get 2^m. */
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = svmls_lane (x, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+ svfloat64_t y = svmla_x (pg, r, p04, r2);
+
+ /* s = 2^n, computed using FEXPA. FEXPA does not propagate NaNs, so for
+ consistent NaN handling we have to manually propagate them. This comes at
+ significant performance cost. */
+ svfloat64_t s = svexpa (u);
+
+ /* Assemble result as exp(x) = 2^n * exp(r). If |x| > Thresh the
+ multiplication may overflow, so use special case routine. */
+
+ if (unlikely (svptest_any (pg, special)))
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+ /* Copy sign to s. */
+ s = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (s)));
+ return special_case (pg, s, y, n);
+ }
+
+ /* No special case. */
+ return svmla_x (pg, s, s, y);
+}
+
+TEST_SIG (SV, D, 1, exp, -9.9, 9.9)
+TEST_ULP (SV_NAME_D1 (exp), 1.46)
+TEST_DISABLE_FENV (SV_NAME_D1 (exp))
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0, 0x1p-23, 40000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p-23, 1, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 1, 0x1p23, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p23, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/exp10.c b/math/aarch64/sve/exp10.c
new file mode 100644
index 000000000000..3d6af334e155
--- /dev/null
+++ b/math/aarch64/sve/exp10.c
@@ -0,0 +1,131 @@
+/*
+ * Double-precision SVE 10^x function.
+ *
+ * Copyright (c) 2023-2025, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define SpecialBound 307.0 /* floor (log10 (2^1023)). */
+
+static const struct data
+{
+ double c1, c3, c2, c4, c0;
+ double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
+} data = {
+ /* Coefficients generated using Remez algorithm.
+ rel error: 0x1.9fcb9b3p-60
+ abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
+ max ulp err 0.52 +0.5. */
+ .c0 = 0x1.26bb1bbb55516p1,
+ .c1 = 0x1.53524c73cd32ap1,
+ .c2 = 0x1.0470591daeafbp1,
+ .c3 = 0x1.2bd77b1361ef6p0,
+ .c4 = 0x1.142b5d54e9621p-1,
+ /* 1.5*2^46+1023. This value is further explained below. */
+ .shift = 0x1.800000000ffc0p+46,
+ .log10_2 = 0x1.a934f0979a371p1, /* 1/log10(2). */
+ .log2_10_hi = 0x1.34413509f79ffp-2, /* log10(2). */
+ .log2_10_lo = -0x1.9dc1da994fd21p-59,
+ .scale_thres = 1280.0,
+ .special_bound = SpecialBound,
+};
+
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Update of both special and non-special cases, if any special case is
+ detected. */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
+ const struct data *d)
+{
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to 0x6, 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);
+
+ /* Set s1 to generate overflow depending on sign of exponent n. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacgt (pg, n, d->scale_thres);
+
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
+
+ return svsel (p_cmp, r1, r0);
+}
+
+/* Fast vector implementation of exp10 using FEXPA instruction.
+ Maximum measured error is 1.02 ulp.
+ SV_NAME_D1 (exp10)(-0x1.2862fec805e58p+2) got 0x1.885a89551d782p-16
+ want 0x1.885a89551d781p-16. */
+svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t no_big_scale = svacle (pg, x, d->special_bound);
+ svbool_t special = svnot_z (pg, no_big_scale);
+
+ /* n = round(x/(log10(2)/N)). */
+ svfloat64_t shift = sv_f64 (d->shift);
+ svfloat64_t z = svmla_x (pg, shift, x, d->log10_2);
+ svfloat64_t n = svsub_x (pg, z, shift);
+
+ /* r = x - n*log10(2)/N. */
+ svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, log2_10, 0);
+ r = svmls_lane (r, n, log2_10, 1);
+
+ /* scale = 2^(n/N), computed using FEXPA. FEXPA does not propagate NaNs, so
+ for consistent NaN handling we have to manually propagate them. This
+ comes at significant performance cost. */
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t scale = svexpa (u);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+ /* Approximate exp10(r) using polynomial.
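+
+ The reduction above mirrors the FEXPA shift trick described in exp.c. As
+ a scalar sketch (asuint64 is a hypothetical bit-cast helper):
+
+ double shift = 0x1.800000000ffc0p+46;        // 1.5*2^46 + 1023
+ double z = shift + x * 0x1.a934f0979a371p1;  // ulp of z is 1/64
+ double n = z - shift;                        // x/log10(2) to nearest 1/64
+ double r = x - n * 0x1.34413509f79ffp-2;     // r = x - n * log10(2)
+ uint64_t u = asuint64 (z);  // bits 5:0 hold i; FEXPA(u) ~ 2^(m + i/64)
+
+ so that 10^x = 2^n * 10^r, with 10^r = 1 + poly(r) evaluated below.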
*/ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + svfloat64_t p14 = svmla_x (pg, p12, p34, r2); + + svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14); + + /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound + multiplication may overflow, so use special case routine. */ + if (unlikely (svptest_any (pg, special))) + { + /* FEXPA zeroes the sign bit, however the sign is meaningful to the + special case function so needs to be copied. + e = sign bit of u << 46. */ + svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000); + /* Copy sign to scale. */ + scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); + return special_case (pg, scale, y, n, d); + } + + /* No special case. */ + return svmla_x (pg, scale, scale, y); +} + +#if WANT_EXP10_TESTS +TEST_SIG (SV, D, 1, exp10, -9.9, 9.9) +TEST_ULP (SV_NAME_D1 (exp10), 0.52) +TEST_DISABLE_FENV (SV_NAME_D1 (exp10)) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 0, SpecialBound, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), SpecialBound, inf, 1000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/exp10f.c b/math/aarch64/sve/exp10f.c new file mode 100644 index 000000000000..8679df87702f --- /dev/null +++ b/math/aarch64/sve/exp10f.c @@ -0,0 +1,101 @@ +/* + * Single-precision SVE 10^x function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" + +/* For x < -Thres, the result is subnormal and not handled correctly by + FEXPA. */ +#define Thres 37.9 + +static const struct data +{ + float log2_10_lo, c0, c2, c4; + float c1, c3, log10_2; + float shift, log2_10_hi, thres; +} data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. + rel error: 0x1.89dafa3p-24 + abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] + maxerr: 0.52 +0.5 ulp. */ + .c0 = 0x1.26bb16p+1f, + .c1 = 0x1.5350d2p+1f, + .c2 = 0x1.04744ap+1f, + .c3 = 0x1.2d8176p+0f, + .c4 = 0x1.12b41ap-1f, + /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ + .shift = 0x1.803f8p17f, + .log10_2 = 0x1.a934fp+1, + .log2_10_hi = 0x1.344136p-2, + .log2_10_lo = -0x1.ec10cp-27, + .thres = Thres, +}; + +static inline svfloat32_t +sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) +{ + /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ + + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); + + /* n = round(x/(log10(2)/N)). */ + svfloat32_t shift = sv_f32 (d->shift); + svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); + svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); + + /* r = x - n*log10(2)/N. */ + svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); + r = svmls_lane (r, n, lane_consts, 0); + + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp10(r)-1. 
*/ + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_lane (r, lane_consts, 1); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); +} + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t special, const struct data *d) +{ + return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d), + special); +} + +/* Single-precision SVE exp10f routine. Implements the same algorithm + as AdvSIMD exp10f. + Worst case error is 1.02 ULPs. + _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 + want 0x1.ba5f9cp-1. */ +svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacgt (pg, x, d->thres); + if (unlikely (svptest_any (special, special))) + return special_case (x, special, d); + return sv_exp10f_inline (x, pg, d); +} + +#if WANT_EXP10_TESTS +TEST_SIG (SV, F, 1, exp10, -9.9, 9.9) +TEST_ULP (SV_NAME_F1 (exp10), 0.52) +TEST_DISABLE_FENV (SV_NAME_F1 (exp10)) +TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), 0, Thres, 50000) +TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), Thres, inf, 50000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/exp2.c b/math/aarch64/sve/exp2.c new file mode 100644 index 000000000000..adbe40c648ac --- /dev/null +++ b/math/aarch64/sve/exp2.c @@ -0,0 +1,111 @@ +/* + * Double-precision SVE 2^x function. + * + * Copyright (c) 2023-2025, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define N (1 << V_EXP_TABLE_BITS) + +#define BigBound 1022 +#define UOFlowBound 1280 + +static const struct data +{ + double c0, c2; + double c1, c3; + double shift, big_bound, uoflow_bound; +} data = { + /* Coefficients are computed using Remez algorithm with + minimisation of the absolute error. */ + .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3, + .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7, + .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound, + .big_bound = BigBound, +}; + +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +/* Update of both special and non-special cases, if any special case is + detected. */ +static inline svfloat64_t +special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, + const struct data *d) +{ + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x6, 0 otherwise. */ + svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ + svuint64_t b = svdup_u64_z (p_sign, SpecialOffset); + + /* Set s1 to generate overflow depending on sign of exponent n. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. */ + svfloat64_t s2 = svreinterpret_f64 ( + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. 
*/ + svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, y); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + return svsel (p_cmp, r1, r0); +} + +/* Fast vector implementation of exp2. + Maximum measured error is 1.65 ulp. + _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 + want 0x1.f8db0d4df721dp-1. */ +svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t no_big_scale = svacle (pg, x, d->big_bound); + svbool_t special = svnot_z (pg, no_big_scale); + + /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */ + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t kd = svadd_x (pg, x, shift); + svuint64_t ki = svreinterpret_u64 (kd); + /* kd = k/N. */ + kd = svsub_x (pg, kd, shift); + svfloat64_t r = svsub_x (pg, x, kd); + + /* scale ~= 2^(k/N). */ + svuint64_t idx = svand_x (pg, ki, N - 1); + svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); + svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + /* Approximate exp2(r) using polynomial. */ + /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); + svfloat64_t p = svmla_x (pg, p01, p23, r2); + svfloat64_t y = svmul_x (svptrue_b64 (), r, p); + /* Assemble exp2(x) = exp2(r) * scale. */ + if (unlikely (svptest_any (pg, special))) + return special_case (pg, scale, y, kd, d); + return svmla_x (pg, scale, scale, y); +} + +TEST_SIG (SV, D, 1, exp2, -9.9, 9.9) +TEST_ULP (SV_NAME_D1 (exp2), 1.15) +TEST_DISABLE_FENV (SV_NAME_D1 (exp2)) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/exp2f.c b/math/aarch64/sve/exp2f.c new file mode 100644 index 000000000000..f4c1d0ae607e --- /dev/null +++ b/math/aarch64/sve/exp2f.c @@ -0,0 +1,83 @@ +/* + * Single-precision SVE 2^x function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Thres 0x1.5d5e2ap+6f + +static const struct data +{ + float c0, c2, c4, c1, c3; + float shift, thres; +} data = { + /* Coefficients copied from the polynomial in AdvSIMD variant. */ + .c0 = 0x1.62e422p-1f, + .c1 = 0x1.ebf9bcp-3f, + .c2 = 0x1.c6bd32p-5f, + .c3 = 0x1.3ce9e4p-7f, + .c4 = 0x1.59977ap-10f, + /* 1.5*2^17 + 127. */ + .shift = 0x1.803f8p17f, + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled + correctly by FEXPA. */ + .thres = Thres, +}; + +static inline svfloat32_t +sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) +{ + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
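+
+ (For the double-precision exp2 above, the scale is instead assembled from
+ the table; loosely, with asuint64/asdouble as hypothetical bit-casts:
+
+ uint64_t ki = asuint64 (kd);                  // fixed-point k: n = k/N
+ uint64_t sbits = __v_exp_data[ki & (N - 1)];  // mantissa of 2^(i/N)
+ uint64_t top = ki << (52 - V_EXP_TABLE_BITS); // integer part -> exponent
+ double scale = asdouble (sbits + top);        // scale = 2^(k/N)
+
+ which is only valid while -1023*N < k < 1024*N, hence the special-case
+ path.)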
*/
+ svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift);
+ svfloat32_t r = svsub_x (svptrue_b32 (), x, n);
+
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
+
+ /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
+ Evaluate polynomial using a hybrid scheme - offset ESTRIN by 1 for
+ coefficients 1 to 4, and apply most significant coefficient directly. */
+ svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
+ svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
+ svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+ as AdvSIMD exp2f.
+ Worst case error is 1.04 ULPs.
+ _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
+ want 0x1.ba6a64p-1. */
svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp2f_inline (x, pg, d);
+}
+
+TEST_SIG (SV, F, 1, exp2, -9.9, 9.9)
+TEST_ULP (SV_NAME_F1 (exp2), 0.54)
+TEST_DISABLE_FENV (SV_NAME_F1 (exp2))
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 50000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp2), Thres, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/expf.c b/math/aarch64/sve/expf.c
new file mode 100644
index 000000000000..11528abdbbaf
--- /dev/null
+++ b/math/aarch64/sve/expf.c
@@ -0,0 +1,50 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "sv_expf_inline.h"
+
+/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.5d5e2ap+6f
+
+static const struct data
+{
+ struct sv_expf_data d;
+ float thres;
+} data = {
+ .d = SV_EXPF_DATA,
+ .thres = Thres,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
+{
+ return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special);
+}
+
+/* Optimised single-precision SVE exp function.
+ Worst-case error is 1.04 ulp:
+ SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
+ want 0x1.ba74bap+4.
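+
+ (The hybrid scheme used by sv_exp2f_inline above evaluates, in scalar
+ terms,
+
+ float r2 = r * r;
+ float p12 = c1 + c2 * r;  // lane-indexed MLAs off one quad-word load
+ float p34 = c3 + c4 * r;
+ float poly = c0 * r + r2 * (p12 + r2 * p34);
+
+ i.e. c0 is applied directly and Estrin is offset by one coefficient so the
+ higher-order terms share the r2 factor.)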
*/
+svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t is_special_case = svacgt (pg, x, d->thres);
+ if (unlikely (svptest_any (pg, is_special_case)))
+ return special_case (x, is_special_case, &d->d);
+ return expf_inline (x, pg, &d->d);
+}
+
+TEST_SIG (SV, F, 1, exp, -9.9, 9.9)
+TEST_ULP (SV_NAME_F1 (exp), 0.55)
+TEST_DISABLE_FENV (SV_NAME_F1 (exp))
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0, Thres, 50000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (exp), Thres, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/expm1.c b/math/aarch64/sve/expm1.c
new file mode 100644
index 000000000000..f4fb8cb982f0
--- /dev/null
+++ b/math/aarch64/sve/expm1.c
@@ -0,0 +1,97 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "sv_poly_f64.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+#define SpecialBound 0x1.62b7d369a5aa9p+9
+#define ExponentBias 0x3ff0000000000000
+
+static const struct data
+{
+ double poly[11];
+ double shift, inv_ln2, special_bound;
+ /* To be loaded in one quad-word. */
+ double ln2_hi, ln2_lo;
+} data = {
+ /* Generated using fpminimax. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
+ 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .special_bound = SpecialBound,
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .ln2_hi = 0x1.62e42fefa39efp-1,
+ .ln2_lo = 0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+ return sv_call_f64 (expm1, x, y, pg);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.18 ULP:
+ _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+ want 0x1.a8b9ea8d66e2p-2. */
svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Large, NaN/Inf. */
+ svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat64_t shift = sv_f64 (d->shift);
+ svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
+ svint64_t i = svcvt_s64_x (pg, n);
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t f = svmls_lane (x, n, ln2, 0);
+ f = svmls_lane (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t f8 = svmul_x (pg, f4, f4);
+ svfloat64_t p
+ = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
+ svfloat64_t t = svreinterpret_f64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1).
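+
+ A scalar sketch of this reconstruction (asdouble is a hypothetical
+ bit-cast helper):
+
+ double t = asdouble (((uint64_t) (i + 1023)) << 52); // t = 2^i exactly
+ return p * t + (t - 1.0);                            // expm1(x)
+
+ Writing 2^i * (p + 1) - 1 in this form keeps the final step one fma plus
+ the exact term t - 1, rather than adding 1 and subtracting it again.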
*/ + svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, y, special); + + return y; +} + +TEST_SIG (SV, D, 1, expm1, -9.9, 9.9) +TEST_ULP (SV_NAME_D1 (expm1), 1.68) +TEST_DISABLE_FENV (SV_NAME_D1 (expm1)) +TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0, 0x1p-23, 1000) +TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0x1p-23, SpecialBound, 200000) +TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), SpecialBound, inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/expm1f.c b/math/aarch64/sve/expm1f.c new file mode 100644 index 000000000000..95f7c09a403d --- /dev/null +++ b/math/aarch64/sve/expm1f.c @@ -0,0 +1,91 @@ +/* + * Single-precision vector exp(x) - 1 function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Largest value of x for which expm1(x) should round to -1. */ +#define SpecialBound 0x1.5ebc4p+6f + +static const struct data +{ + /* These 4 are grouped together so they can be loaded as one quadword, then + used with _lane forms of svmla/svmls. */ + float c2, c4, ln2_hi, ln2_lo; + float c0, inv_ln2, c1, c3, special_bound; +} data = { + /* Generated using fpminimax. */ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, + .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f, + .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f, + .ln2_hi = 0x1.62e4p-1f, + +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t pg) +{ + return sv_call_f32 (expm1f, x, x, pg); +} + +/* Single-precision SVE exp(x) - 1. Maximum error is 1.52 ULP: + _ZGVsMxv_expm1f(0x1.8f4ebcp-2) got 0x1.e859dp-2 + want 0x1.e859d4p-2. */ +svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Large, NaN/Inf. */ + svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound)); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, pg); + + /* This vector is reliant on layout of data - it contains constants + that can be used with _lane forms of svmla/svmls. Values are: + [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */ + svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); + j = svrinta_x (pg, j); + + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); + f = svmls_lane (f, j, lane_constants, 3); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); + svfloat32_t p = svmla_x (pg, p12, f2, p34); + + p = svmla_x (pg, sv_f32 (d->c0), f, p); + p = svmla_x (pg, f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. 
*/ + svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); + return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); +} + +TEST_SIG (SV, F, 1, expm1, -9.9, 9.9) +TEST_ULP (SV_NAME_F1 (expm1), 1.02) +TEST_DISABLE_FENV (SV_NAME_F1 (expm1)) +TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), 0, SpecialBound, 100000) +TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), SpecialBound, inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/hypot.c b/math/aarch64/sve/hypot.c new file mode 100644 index 000000000000..2ed298623acc --- /dev/null +++ b/math/aarch64/sve/hypot.c @@ -0,0 +1,53 @@ +/* + * Double-precision SVE hypot(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64_t tiny_bound, thres; +} data = { + .tiny_bound = 0x0c80000000000000, /* asuint (0x1p-102). */ + .thres = 0x7300000000000000, /* asuint (inf) - tiny_bound. */ +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t sqsum, svfloat64_t x, svfloat64_t y, svbool_t pg, + svbool_t special) +{ + return sv_call2_f64 (hypot, x, y, svsqrt_x (pg, sqsum), special); +} + +/* SVE implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVsMxvv_hypot (-0x1.6a22d0412cdd3p+352, 0x1.d3d89bd66fb1ap+330) + got 0x1.6a22d0412cfp+352 + want 0x1.6a22d0412cf01p+352. */ +svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y); + + svbool_t special = svcmpge ( + pg, svsub_x (pg, svreinterpret_u64 (sqsum), d->tiny_bound), d->thres); + + if (unlikely (svptest_any (pg, special))) + return special_case (sqsum, x, y, pg, special); + return svsqrt_x (pg, sqsum); +} + +TEST_SIG (SV, D, 2, hypot, -10.0, 10.0) +TEST_ULP (SV_NAME_D2 (hypot), 0.71) +TEST_DISABLE_FENV (SV_NAME_D2 (hypot)) +TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/hypotf.c b/math/aarch64/sve/hypotf.c new file mode 100644 index 000000000000..b977b998986b --- /dev/null +++ b/math/aarch64/sve/hypotf.c @@ -0,0 +1,47 @@ +/* + * Single-precision SVE hypot(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define TinyBound 0x0c800000 /* asuint (0x1p-102). */ +#define Thres 0x73000000 /* 0x70000000 - TinyBound. */ + +static svfloat32_t NOINLINE +special_case (svfloat32_t sqsum, svfloat32_t x, svfloat32_t y, svbool_t pg, + svbool_t special) +{ + return sv_call2_f32 (hypotf, x, y, svsqrt_x (pg, sqsum), special); +} + +/* SVE implementation of single-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVsMxvv_hypotf (0x1.6a213cp-19, -0x1.32b982p-26) got 0x1.6a2346p-19 + want 0x1.6a2344p-19. 
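+
+ Both hypot variants detect all awkward inputs with one unsigned compare
+ on the bit pattern of x * x + y * y. A scalar sketch for the
+ single-precision case (asuint is a hypothetical bit-cast helper):
+
+ uint32_t ia = asuint (x * x + y * y);
+ int special = ia - TinyBound >= Thres; // wraps below TinyBound
+
+ The subtraction wraps around for very small sums (where x * x + y * y may
+ have underflowed, so the scalar fallback recomputes carefully), and
+ anything at or above inf/NaN also lands beyond Thres.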
*/ +svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y, + const svbool_t pg) +{ + svfloat32_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y); + + svbool_t special = svcmpge ( + pg, svsub_x (pg, svreinterpret_u32 (sqsum), TinyBound), Thres); + + if (unlikely (svptest_any (pg, special))) + return special_case (sqsum, x, y, pg, special); + + return svsqrt_x (pg, sqsum); +} + +TEST_SIG (SV, F, 2, hypot, -10.0, 10.0) +TEST_ULP (SV_NAME_F2 (hypot), 0.71) +TEST_DISABLE_FENV (SV_NAME_F2 (hypot)) +TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log.c b/math/aarch64/sve/log.c new file mode 100644 index 000000000000..c612df48c1fd --- /dev/null +++ b/math/aarch64/sve/log.c @@ -0,0 +1,97 @@ +/* + * Double-precision SVE log(x) function. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define N (1 << V_LOG_TABLE_BITS) +#define Max (0x7ff0000000000000) +#define Min (0x0010000000000000) +#define Thresh (0x7fe0000000000000) /* Max - Min. */ + +static const struct data +{ + double c0, c2; + double c1, c3; + double ln2, c4; + uint64_t off; +} data = { + .c0 = -0x1.ffffffffffff7p-2, + .c1 = 0x1.55555555170d4p-2, + .c2 = -0x1.0000000399c27p-2, + .c3 = 0x1.999b2e90e94cap-3, + .c4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + .off = 0x3fe6900900000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) +{ + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special); +} + +/* Double-precision SVE log routine. + Maximum measured error is 2.64 ulp: + SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6 + want 0x1.fffffffe88cafp+6. */ +svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, d->off); + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. + The actual value of i is double this due to table layout. */ + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); + /* Lookup in 2 global lists (length N). */ + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + /* hi = r + log(c) + k*Ln2. */ + svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2); + svfloat64_t r = svmad_x (pg, invc, z, -1); + svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0); + hi = svadd_x (pg, r, hi); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
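+
+ Gathering the steps above, the scalar equivalent is roughly (asuint64 and
+ asdouble are hypothetical bit-cast helpers; invc and logc are the two
+ values gathered at index i):
+
+ uint64_t tmp = asuint64 (x) - OFF;   // OFF = 0x3fe6900900000000
+ int64_t k = (int64_t) tmp >> 52;     // x = 2^k * z
+ double z = asdouble (asuint64 (x) - (tmp & (0xfffULL << 52)));
+ double r = z * invc - 1.0;           // z/c - 1, |r| small
+ double hi = r + logc + k * ln2;
+ double y = hi + r * r * poly (r);    // poly covers the A0..A4 terms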
*/ + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); + y = svmla_lane_f64 (y, r2, ln2_and_c4, 1); + y = svmla_x (pg, p, r2, y); + + if (unlikely (svptest_any (pg, special))) + return special_case (hi, tmp, y, r2, special, d); + return svmla_x (pg, hi, r2, y); +} + +TEST_SIG (SV, D, 1, log, 0.01, 11.1) +TEST_ULP (SV_NAME_D1 (log), 2.15) +TEST_DISABLE_FENV (SV_NAME_D1 (log)) +TEST_INTERVAL (SV_NAME_D1 (log), -0.0, -inf, 1000) +TEST_INTERVAL (SV_NAME_D1 (log), 0, 0x1p-149, 1000) +TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (log), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_D1 (log), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log10.c b/math/aarch64/sve/log10.c new file mode 100644 index 000000000000..5af142d79f55 --- /dev/null +++ b/math/aarch64/sve/log10.c @@ -0,0 +1,101 @@ +/* + * Double-precision SVE log10(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Min 0x0010000000000000 +#define Max 0x7ff0000000000000 +#define Thres 0x7fe0000000000000 /* Max - Min. */ +#define N (1 << V_LOG10_TABLE_BITS) + +static const struct data +{ + double c0, c2; + double c1, c3; + double invln10, log10_2; + double c4; + uint64_t off; +} data = { + .c0 = -0x1.bcb7b1526e506p-3, + .c1 = 0x1.287a7636be1d1p-3, + .c2 = -0x1.bcb7b158af938p-4, + .c3 = 0x1.63c78734e6d07p-4, + .c4 = -0x1.287461742fee4p-4, + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + .off = 0x3fe6900900000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) +{ + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special); +} + +/* Double-precision SVE log10 routine. + Maximum measured error is 2.46 ulps. + SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ +svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, d->off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + svfloat64_t z = svreinterpret_f64 ( + svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); + + /* log(x) = k*log(2) + log(c) + log(z/c). */ + svfloat64_t invc = svld1_gather_index (pg, &__v_log10_data.table[0].invc, i); + svfloat64_t logc + = svld1_gather_index (pg, &__v_log10_data.table[0].log10c, i); + + /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1): + r = z/c - 1 (we look up precomputed 1/c) + log(z/c) ~= P(r). 
*/
+ svfloat64_t r = svmad_x (pg, invc, z, -1.0);
+
+ /* hi = log(c) + k*log(2). */
+ svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10);
+ svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0);
+ svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+ y = svmla_x (pg, y, r2, d->c4);
+ y = svmla_x (pg, p, r2, y);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (hi, tmp, y, r2, special, d);
+ return svmla_x (pg, hi, r2, y);
+}
+
+TEST_SIG (SV, D, 1, log10, 0.01, 11.1)
+TEST_ULP (SV_NAME_D1 (log10), 1.97)
+TEST_DISABLE_FENV (SV_NAME_D1 (log10))
+TEST_INTERVAL (SV_NAME_D1 (log10), -0.0, -0x1p126, 100)
+TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log10), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log10), 100, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/log10f.c b/math/aarch64/sve/log10f.c
new file mode 100644
index 000000000000..6c3add451761
--- /dev/null
+++ b/math/aarch64/sve/log10f.c
@@ -0,0 +1,102 @@
+/*
+ * Single-precision SVE log10 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+ float poly_0246[4];
+ float poly_1357[4];
+ float ln2, inv_ln10;
+ uint32_t off, lower;
+} data = {
+ .poly_1357 = {
+ /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
+ 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane
+ variant of MLA intrinsic. */
+ 0x1.2879c8p-3f, 0x1.6408f8p-4f, 0x1.f0e514p-5f, 0x1.f5f76ap-5f
+ },
+ .poly_0246 = { -0x1.bcb79cp-3f, -0x1.bcd472p-4f, -0x1.246f8p-4f,
+ -0x1.0fc92cp-4f },
+ .ln2 = 0x1.62e43p-1f,
+ .inv_ln10 = 0x1.bcb7b2p-2f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
+};
+
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
+#define MantissaMask 0x007fffff
+
+static svfloat32_t NOINLINE
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
+{
+ return sv_call_f32 (
+ log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
+}
+
+/* Optimised implementation of SVE log10f using the same algorithm and
+ polynomial as AdvSIMD log10f.
+ Maximum error is 3.31 ulps:
+ SV_NAME_F1 (log10)(0x1.555c16p+0) got 0x1.ffe2fap-4
+ want 0x1.ffe2f4p-4. */
svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ svfloat32_t n = svcvt_f32_x (
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend.
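+
+ In scalar terms the head of this routine is (asuint/asfloat are
+ hypothetical bit-cast helpers):
+
+ uint32_t u_off = asuint (x) - 0x3f2aaaab;  // offset: 1+r in [2/3, 4/3)
+ int32_t n = (int32_t) u_off >> 23;         // arithmetic shift: exponent
+ float r = asfloat ((u_off & 0x007fffff) + 0x3f2aaaab) - 1.0f;
+
+ and the single unsigned compare u_off - lower >= Thres flags zero,
+ subnormal, negative, inf and NaN inputs for the scalar fallback in one go.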
*/ + svuint32_t ix = svand_x (pg, u_off, MantissaMask); + ix = svadd_x (pg, ix, d->off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f); + + /* y = log10(1+r) + n*log10(2) + log10(1+r) ~ r * InvLn(10) + P(r) + where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */ + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2); + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); + svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0); + svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1); + svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_0246[2]), r, p_1357, 2); + svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_0246[3]), r, p_1357, 3); + svfloat32_t q_47 = svmla_x (pg, q_45, r2, q_67); + svfloat32_t q_03 = svmla_x (pg, q_01, r2, q_23); + svfloat32_t y = svmla_x (pg, q_03, r4, q_47); + + /* Using hi = Log10(2)*n + r*InvLn(10) is faster but less accurate. */ + svfloat32_t hi = svmla_x (pg, r, n, d->ln2); + hi = svmul_x (pg, hi, d->inv_ln10); + + if (unlikely (svptest_any (pg, special))) + return special_case (u_off, hi, r2, y, special); + return svmla_x (svptrue_b32 (), hi, r2, y); +} + +TEST_SIG (SV, F, 1, log10, 0.01, 11.1) +TEST_ULP (SV_NAME_F1 (log10), 2.82) +TEST_DISABLE_FENV (SV_NAME_F1 (log10)) +TEST_INTERVAL (SV_NAME_F1 (log10), -0.0, -0x1p126, 100) +TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (log10), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_F1 (log10), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log1p.c b/math/aarch64/sve/log1p.c new file mode 100644 index 000000000000..e6b895b52908 --- /dev/null +++ b/math/aarch64/sve/log1p.c @@ -0,0 +1,118 @@ +/* + * Double-precision SVE log(1+x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + double poly[19]; + double ln2_hi, ln2_lo; + uint64_t hfrt2_top, onemhfrt2_top, inf, mone; +} data = { + /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20 + polynomial, however first 2 coefficients are 0 and 1 so are not stored. */ + .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6, }, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* top32(asuint64(sqrt(2)/2)) << 32. */ + .hfrt2_top = 0x3fe6a09e00000000, + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ + .onemhfrt2_top = 0x00095f6200000000, + .inf = 0x7ff0000000000000, + .mone = 0xbff0000000000000, +}; + +#define AbsMask 0x7fffffffffffffff +#define BottomMask 0xffffffff + +static svfloat64_t NOINLINE +special_case (svbool_t special, svfloat64_t x, svfloat64_t y) +{ + return sv_call_f64 (log1p, x, y, special); +} + +/* Vector approximation for log1p using polynomial on reduced interval. 
Maximum + observed error is 2.46 ULP: + _ZGVsMxv_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 + want 0x1.fd5565fb590f6p+2. */ +svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t ax = svand_x (pg, ix, AbsMask); + svbool_t special + = svorr_z (pg, svcmpge (pg, ax, d->inf), svcmpge (pg, ix, d->mone)); + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + /* Obtain correctly scaled k by manipulation in the exponent. + The scalar algorithm casts down to 32-bit at this point to calculate k and + u_red. We stay in double-width to obtain f and k, using the same constants + as the scalar algorithm but shifted left by 32. */ + svfloat64_t m = svadd_x (pg, x, 1); + svuint64_t mi = svreinterpret_u64 (m); + svuint64_t u = svadd_x (pg, mi, d->onemhfrt2_top); + + svint64_t ki = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), 0x3ff); + svfloat64_t k = svcvt_f64_x (pg, ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + svuint64_t utop + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top); + svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask)); + svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + + /* Correction term c/m. */ + svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m); + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... + Assembling this all correctly is dealt with at the final step. */ + svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2), + f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8); + svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly); + + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo); + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi); + svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); + + if (unlikely (svptest_any (pg, special))) + return special_case (special, x, y); + + return y; +} + +TEST_SIG (SV, D, 1, log1p, -0.9, 10.0) +TEST_ULP (SV_NAME_D1 (log1p), 1.97) +TEST_DISABLE_FENV (SV_NAME_D1 (log1p)) +TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.001, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (log1p), 1, inf, 10000) +TEST_INTERVAL (SV_NAME_D1 (log1p), -1, -inf, 10) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log1pf.c b/math/aarch64/sve/log1pf.c new file mode 100644 index 000000000000..77ae6218f931 --- /dev/null +++ b/math/aarch64/sve/log1pf.c @@ -0,0 +1,43 @@ +/* + * Single-precision vector log(x + 1) function. + * + * Copyright (c) 2023-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_log1pf_inline.h" + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t special) +{ + return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()), + special); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.27 ULP very close to 0.5. + _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2 + want 0x1.9f323ep-2. */ +svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) +{ + /* x < -1, Inf/Nan. */ + svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000); + special = svorn_z (pg, special, svcmpge (pg, x, -1)); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, special); + + return sv_log1pf_inline (x, pg); +} + +TEST_SIG (SV, F, 1, log1p, -0.9, 10.0) +TEST_ULP (SV_NAME_F1 (log1p), 0.77) +TEST_DISABLE_FENV (SV_NAME_F1 (log1p)) +TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0, 0x1p-23, 5000) +TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0x1p-23, 1, 5000) +TEST_INTERVAL (SV_NAME_F1 (log1p), 1, inf, 10000) +TEST_INTERVAL (SV_NAME_F1 (log1p), -1, -inf, 10) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log2.c b/math/aarch64/sve/log2.c new file mode 100644 index 000000000000..11c65c1b2963 --- /dev/null +++ b/math/aarch64/sve/log2.c @@ -0,0 +1,96 @@ +/* + * Double-precision SVE log2 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define N (1 << V_LOG2_TABLE_BITS) +#define Max (0x7ff0000000000000) +#define Min (0x0010000000000000) +#define Thresh (0x7fe0000000000000) /* Max - Min. */ + +static const struct data +{ + double c0, c2; + double c1, c3; + double invln2, c4; + uint64_t off; +} data = { + .c0 = -0x1.71547652b83p-1, + .c1 = 0x1.ec709dc340953p-2, + .c2 = -0x1.71547651c8f35p-2, + .c3 = 0x1.2777ebe12dda5p-2, + .c4 = -0x1.ec738d616fe26p-3, + .invln2 = 0x1.71547652b82fep0, + .off = 0x3fe6900900000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) +{ + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special); +} + +/* Double-precision SVE log2 routine. + Implements the same algorithm as AdvSIMD log10, with coefficients and table + entries scaled in extended precision. + The maximum observed error is 2.58 ULP: + SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/
+  svuint64_t tmp = svsub_x (pg, ix, d->off);
+  svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
+  i = svand_x (pg, i, (N - 1) << 1);
+  svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
+  svfloat64_t z = svreinterpret_f64 (
+      svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)));
+
+  svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i);
+  svfloat64_t log2c
+      = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i);
+
+  /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k.  */
+
+  svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2);
+  svfloat64_t r = svmad_x (pg, invc, z, -1.0);
+  svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0);
+  w = svadd_x (pg, k, w);
+
+  svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+  svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+  y = svmla_lane_f64 (y, r2, invln2_and_c4, 1);
+  y = svmla_x (pg, p, r2, y);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (w, tmp, y, r2, special, d);
+  return svmla_x (pg, w, r2, y);
+}
+
+TEST_SIG (SV, D, 1, log2, 0.01, 11.1)
+TEST_ULP (SV_NAME_D1 (log2), 2.09)
+TEST_DISABLE_FENV (SV_NAME_D1 (log2))
+TEST_INTERVAL (SV_NAME_D1 (log2), -0.0, -0x1p126, 1000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 0.0, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_D1 (log2), 100, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/log2f.c b/math/aarch64/sve/log2f.c
new file mode 100644
index 000000000000..312fd448226b
--- /dev/null
+++ b/math/aarch64/sve/log2f.c
@@ -0,0 +1,94 @@
+/*
+ * Single-precision vector/SVE log2 function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float poly_02468[5];
+  float poly_1357[4];
+  uint32_t off, lower;
+} data = {
+  .poly_1357 = {
+    /* Coefficients copied from the AdvSIMD routine, then rearranged so that
+       coeffs 1, 3, 5 and 7 can be loaded as a single quad-word, hence used
+       with _lane variant of MLA intrinsic.  */
+    -0x1.715458p-1f, -0x1.7171a4p-2f, -0x1.e5143ep-3f, -0x1.c675bp-3f
+  },
+  .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
+                  0x1.9d8ecap-3f, 0x1.9e495p-3f },
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .lower = 0x00800000 - 0x3f2aaaab
+};
+
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000.  */
+#define MantissaMask (0x007fffff)
+
+static svfloat32_t NOINLINE
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+              svbool_t cmp)
+{
+  return sv_call_f32 (
+      log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
+}
+
+/* Optimised implementation of SVE log2f, using the same algorithm
+   and polynomial as AdvSIMD log2f.
+   Maximum error is 2.48 ULPs:
+   SV_NAME_F1 (log2)(0x1.558174p+0) got 0x1.a9be84p-2
+                                    want 0x1.a9be8p-2.
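+
+   The single special-case comparison below folds zero, negative, subnormal,
+   inf and nan inputs into one unsigned test. A scalar sketch of the same
+   idea (names as in the data struct above):
+
+     uint32_t u_off = asuint (x) - 0x3f2aaaab;   // recentre near 2/3
+     bool special = (u_off - Lower) >= Thresh;   // wraps around for bad inputs
+
+   Any input whose recentred bits fall outside [Lower, Lower + Thresh) takes
+   the scalar fallback path.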
*/
+svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
+  svfloat32_t n = svcvt_f32_x (
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend.  */
+  svuint32_t u = svand_x (pg, u_off, MantissaMask);
+  u = svadd_x (pg, u, d->off);
+  svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
+
+  /* y = log2(1+r) + n.  */
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+
+  /* Evaluate polynomial using pairwise Horner scheme.  */
+  svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
+  svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_02468[0]), r, p_1357, 0);
+  svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_02468[1]), r, p_1357, 1);
+  svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_02468[2]), r, p_1357, 2);
+  svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_02468[3]), r, p_1357, 3);
+  svfloat32_t y = svmla_x (pg, q_67, r2, sv_f32 (d->poly_02468[4]));
+  y = svmla_x (pg, q_45, r2, y);
+  y = svmla_x (pg, q_23, r2, y);
+  y = svmla_x (pg, q_01, r2, y);
+
+  if (unlikely (svptest_any (pg, special)))
+    return special_case (u_off, n, r, y, special);
+  return svmla_x (svptrue_b32 (), n, r, y);
+}
+
+TEST_SIG (SV, F, 1, log2, 0.01, 11.1)
+TEST_ULP (SV_NAME_F1 (log2), 1.99)
+TEST_DISABLE_FENV (SV_NAME_F1 (log2))
+TEST_INTERVAL (SV_NAME_F1 (log2), -0.0, -0x1p126, 4000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 0.0, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 1.0, 100, 50000)
+TEST_INTERVAL (SV_NAME_F1 (log2), 100, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/logf.c b/math/aarch64/sve/logf.c
new file mode 100644
index 000000000000..2898e36974d6
--- /dev/null
+++ b/math/aarch64/sve/logf.c
@@ -0,0 +1,94 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float poly_0135[4];
+  float poly_246[3];
+  float ln2;
+  uint32_t off, lower;
+} data = {
+  .poly_0135 = {
+    /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
+       that coeffs 0, 1, 3 and 5 can be loaded as a single quad-word, hence used
+       with _lane variant of MLA intrinsic.  */
+    -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
+  },
+  .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
+  .ln2 = 0x1.62e43p-1f,
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around).  */
+  .lower = 0x00800000 - 0x3f2aaaab
+};
+
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000.  */
+#define Mask (0x007fffff)
+
+static svfloat32_t NOINLINE
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+              svbool_t cmp)
+{
+  return sv_call_f32 (
+      logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
+}
+
+/* Optimised implementation of SVE logf, using the same algorithm and
+   polynomial as the AdvSIMD routine.
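+   The odd/even split of the coefficient arrays above follows the same
+   pattern as log2f: packing coefficients 0, 1, 3 and 5 into one quad-word
+   means a single svld1rq broadcast can feed several _lane multiply-adds,
+   for instance
+
+     svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
+     svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
+
+   which keeps the whole polynomial in a handful of registers.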
Maximum error is 3.34 ULPs: + SV_NAME_F1 (log)(0x1.557298p+0) got 0x1.26edecp-2 + want 0x1.26ede6p-2. */ +svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t u_off = svreinterpret_u32 (x); + + u_off = svsub_x (pg, u_off, d->off); + svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + svfloat32_t n = svcvt_f32_x ( + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ + + svuint32_t u = svand_x (pg, u_off, Mask); + u = svadd_x (pg, u, d->off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); + + /* y = log(1+r) + n*ln2. */ + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */ + svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]); + svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1); + svfloat32_t q = svmla_lane (sv_f32 (d->poly_246[1]), r, p_0135, 2); + svfloat32_t y = svmla_lane (sv_f32 (d->poly_246[2]), r, p_0135, 3); + p = svmla_lane (p, r2, p_0135, 0); + + q = svmla_x (pg, q, r2, p); + y = svmla_x (pg, y, r2, q); + p = svmla_x (pg, r, n, d->ln2); + + if (unlikely (svptest_any (pg, cmp))) + return special_case (u_off, p, r2, y, cmp); + return svmla_x (pg, p, r2, y); +} + +TEST_SIG (SV, F, 1, log, 0.01, 11.1) +TEST_ULP (SV_NAME_F1 (log), 2.85) +TEST_DISABLE_FENV (SV_NAME_F1 (log)) +TEST_INTERVAL (SV_NAME_F1 (log), -0.0, -inf, 100) +TEST_INTERVAL (SV_NAME_F1 (log), 0, 0x1p-126, 100) +TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (log), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_F1 (log), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/modf.c b/math/aarch64/sve/modf.c new file mode 100644 index 000000000000..5944c7d37c4c --- /dev/null +++ b/math/aarch64/sve/modf.c @@ -0,0 +1,36 @@ +/* + * Double-precision SVE modf(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modf algorithm. Produces exact values in all rounding modes. */ +svfloat64_t SV_NAME_D1_L1 (modf) (svfloat64_t x, double *out_int, + const svbool_t pg) +{ + /* Get integer component of x. */ + svfloat64_t fint_comp = svrintz_x (pg, x); + + svst1_f64 (pg, out_int, fint_comp); + + /* Subtract integer component from input. */ + svfloat64_t remaining = svsub_f64_x (svptrue_b64 (), x, fint_comp); + + /* Return +0 for integer x. */ + svbool_t is_integer = svcmpeq (pg, x, fint_comp); + return svsel (is_integer, sv_f64 (0), remaining); +} + +TEST_ULP (_ZGVsMxvl8_modf_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_frac, 1, inf, 20000) + +TEST_ULP (_ZGVsMxvl8_modf_int, 0.0) +TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_int, 1, inf, 20000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/modff.c b/math/aarch64/sve/modff.c new file mode 100644 index 000000000000..ad7ce4e2c88f --- /dev/null +++ b/math/aarch64/sve/modff.c @@ -0,0 +1,36 @@ +/* + * Single-precision SVE modff(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modff algorithm. 
Produces exact values in all rounding modes.  */
+svfloat32_t SV_NAME_F1_L1 (modf) (svfloat32_t x, float *out_int,
+                                  const svbool_t pg)
+{
+  /* Get integer component of x.  */
+  svfloat32_t fint_comp = svrintz_x (pg, x);
+
+  svst1_f32 (pg, out_int, fint_comp);
+
+  /* Subtract integer component from input.  */
+  svfloat32_t remaining = svsub_f32_x (svptrue_b32 (), x, fint_comp);
+
+  /* Return +0 for integer x.  */
+  svbool_t is_integer = svcmpeq (pg, x, fint_comp);
+  return svsel (is_integer, sv_f32 (0), remaining);
+}
+
+TEST_ULP (_ZGVsMxvl4_modff_frac, 0.0)
+TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_frac, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_frac, 1, inf, 20000)
+
+TEST_ULP (_ZGVsMxvl4_modff_int, 0.0)
+TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_int, 0, 1, 20000)
+TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_int, 1, inf, 20000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/pow.c b/math/aarch64/sve/pow.c
new file mode 100644
index 000000000000..12b2fb42b2cb
--- /dev/null
+++ b/math/aarch64/sve/pow.c
@@ -0,0 +1,483 @@
+/*
+ * Double-precision SVE pow(x, y) function.
+ *
+ * Copyright (c) 2022-2025, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+/* This version shares a similar algorithm with the AOR scalar pow.
+
+   The core computation consists of computing pow(x, y) as
+
+     exp (y * log (x)).
+
+   The algorithms for exp and log are very similar to scalar exp and log.
+   The log relies on a table lookup of 3 values and an order-8 polynomial.
+   It returns a high and a low contribution that are then passed to the exp,
+   to minimise the loss of accuracy in both routines.
+   The exp is based on an 8-bit table lookup for scale and an order-4
+   polynomial.
+   The SVE algorithm drops the tail in the exp computation at the price of
+   a lower accuracy, slightly above 1 ULP.
+   The SVE algorithm also drops the special treatment of small (< 2^-65) and
+   large (> 2^63) finite values of |y|, as they only affect
+   non-round-to-nearest rounding modes.
+
+   Maximum measured error is 1.04 ULPs:
+   SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
+     got 0x1.f7116284221fcp-1
+    want 0x1.f7116284221fdp-1.  */
+
+/* Data is defined in v_pow_log_data.c.  */
+#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
+#define Off 0x3fe6955500000000
+
+/* Data is defined in v_pow_exp_data.c.  */
+#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
+#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
+#define SmallExp 0x3c9 /* top12(0x1p-54).  */
+#define BigExp 0x408   /* top12(512.).  */
+#define ThresExp 0x03f /* BigExp - SmallExp.  */
+#define HugeExp 0x409  /* top12(1024.).  */
+
+/* Constants associated with pow.  */
+#define SmallBoundX 0x1p-126
+#define SmallPowX 0x001 /* top12(0x1p-126).  */
+#define BigPowX 0x7ff   /* top12(INFINITY).  */
+#define ThresPowX 0x7fe /* BigPowX - SmallPowX.  */
+#define SmallPowY 0x3be /* top12(0x1.e7b6p-65).  */
+#define BigPowY 0x43e   /* top12(0x1.749p62).  */
+#define ThresPowY 0x080 /* BigPowY - SmallPowY.
*/ + +static const struct data +{ + double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo; + double log_c1, log_c3, log_c5, off; + double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo; + double exp_c0, exp_c1; +} data = { + .log_c0 = -0x1p-1, + .log_c1 = -0x1.555555555556p-1, + .log_c2 = 0x1.0000000000006p-1, + .log_c3 = 0x1.999999959554ep-1, + .log_c4 = -0x1.555555529a47ap-1, + .log_c5 = -0x1.2495b9b4845e9p0, + .log_c6 = 0x1.0002b8b263fc3p0, + .off = Off, + .exp_c0 = 0x1.fffffffffffd4p-2, + .exp_c1 = 0x1.5555571d6ef9p-3, + .exp_c2 = 0x1.5555576a5adcep-5, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP, + .ln2_over_n_hi = 0x1.62e42fefc0000p-9, + .ln2_over_n_lo = -0x1.c610ca86c3899p-45, +}; + +/* Check if x is an integer. */ +static inline svbool_t +sv_isint (svbool_t pg, svfloat64_t x) +{ + return svcmpeq (pg, svrintz_z (pg, x), x); +} + +/* Check if x is real not integer valued. */ +static inline svbool_t +sv_isnotint (svbool_t pg, svfloat64_t x) +{ + return svcmpne (pg, svrintz_z (pg, x), x); +} + +/* Check if x is an odd integer. */ +static inline svbool_t +sv_isodd (svbool_t pg, svfloat64_t x) +{ + svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5); + return sv_isnotint (pg, y); +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Top 12 bits (sign and exponent of each double float lane). */ +static inline svuint64_t +sv_top12 (svfloat64_t x) +{ + return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline int +zeroinfnan (uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline svbool_t +sv_zeroinfnan (svbool_t pg, svuint64_t i) +{ + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), + 2 * asuint64 (INFINITY) - 1); +} + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +specialcase (double tmp, uint64_t sbits, uint64_t ki) +{ + double scale; + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + return 0x1p1009 * (scale + scale * tmp); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. */ + scale = asdouble (sbits); + double y = scale + scale * tmp; + return 0x1p-1022 * y; +} + +/* Scalar fallback for special cases of SVE pow's exp. 
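+   It re-dispatches each flagged lane to the specialcase helper above. That
+   helper keeps the intermediate in range by splitting the scale factor; as
+   a worked example with illustrative numbers: if the true scale is 2^1040
+   (not representable in double), its bits arrive in sbits with the exponent
+   field overflowed, subtracting 1009 from the exponent yields the
+   representable 2^31, and the final multiply by 0x1p1009 rebuilds
+   2^1040 * (1 + tmp), overflowing to infinity only if the correctly rounded
+   result does.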
*/
+static inline svfloat64_t
+sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
+                     svfloat64_t y, svbool_t cmp)
+{
+  svbool_t p = svpfirst (cmp, svpfalse ());
+  while (svptest_any (cmp, p))
+    {
+      double sx1 = svclastb (p, 0, x1);
+      uint64_t su1 = svclastb (p, 0, u1);
+      uint64_t su2 = svclastb (p, 0, u2);
+      double elem = specialcase (sx1, su1, su2);
+      svfloat64_t y2 = sv_f64 (elem);
+      y = svsel (p, y2, y);
+      p = svpnext_b64 (cmp, p);
+    }
+  return y;
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+   15 bits of additional precision. IX is the bit representation of x, but
+   normalized in the subnormal range using the sign bit for the exponent.  */
+static inline svfloat64_t
+sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
+               const struct data *d)
+{
+  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  svuint64_t tmp = svsub_x (pg, ix, d->off);
+  svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
+                          sv_u64 (N_LOG - 1));
+  svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
+  svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
+  svfloat64_t z = svreinterpret_f64 (iz);
+  svfloat64_t kd = svcvt_f64_x (pg, k);
+
+  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
+  /* SVE lookup requires 3 separate lookup tables, as opposed to the scalar
+     version, which uses an array of structures. We also do the lookup earlier
+     in the code to make sure it finishes as early as possible.  */
+  svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
+  svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
+  svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
+
+  /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable.  */
+  svfloat64_t r = svmad_x (pg, z, invc, -1.0);
+  /* k*Ln2 + log(c) + r.  */
+
+  svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
+  svfloat64_t t2 = svadd_x (pg, t1, r);
+  svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
+  svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
+
+  /* Evaluation is optimized assuming superscalar pipelined execution.  */
+
+  svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
+  svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
+  svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
+  svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
+  /* k*Ln2 + log(c) + r + A[0]*r*r.  */
+  svfloat64_t hi = svadd_x (pg, t2, ar2);
+  svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
+  svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
+  /* p = log1p(r) - r - A[0]*r*r.  */
+  /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
+     A[6])))).
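+
+   The hi/lo pairs above follow the usual compensated-summation pattern.
+   For instance lo2 is a Fast2Sum step, which in scalar form is
+
+     double t2 = t1 + r;           // rounded sum
+     double lo2 = (t1 - t2) + r;   // error of that sum, exact if |t1| >= |r|
+
+   so lo2 recovers the low-order bits that rounding t2 discarded; lo1, lo3
+   and lo4 play the same role for the neighbouring operations.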
*/ + + svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4); + svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1); + svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0); + svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1); + svfloat64_t p = svmla_x (pg, a34, ar2, a56); + p = svmla_x (pg, a12, ar2, p); + p = svmul_x (svptrue_b64 (), ar3, p); + svfloat64_t lo = svadd_x ( + pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); + svfloat64_t y = svadd_x (pg, hi, lo); + *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); + return y; +} + +static inline svfloat64_t +sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits, + svuint64_t *ki, const struct data *d) +{ + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2); + svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t kd = svrinta_x (pg, z); + *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd)); + + svfloat64_t ln2_over_n_hilo + = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi); + svfloat64_t r = x; + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0); + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = svadd_x (pg, r, xtail); + /* 2^(k/N) ~= scale. */ + svuint64_t idx = svand_x (pg, *ki, N_EXP - 1); + svuint64_t top + = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); + *sbits = svadd_x (pg, *sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1); + *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp); + *tmp = svmla_x (pg, r, r2, *tmp); + svfloat64_t scale = svreinterpret_f64 (*sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + z = svmla_x (pg, scale, scale, *tmp); + return z; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline svfloat64_t +sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + svuint64_t sign_bias, const struct data *d) +{ + /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) + and other cases of large values of x (scale * (1 + TMP) oflow). */ + svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff); + /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */ + svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); + + svfloat64_t tmp; + svuint64_t sbits, ki; + if (unlikely (svptest_any (pg, uoflow))) + { + svfloat64_t z + = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); + + /* |x| is tiny (|x| <= 0x1p-54). */ + svbool_t uflow + = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); + uflow = svand_z (pg, uoflow, uflow); + /* |x| is huge (|x| >= 1024). */ + svbool_t oflow = svcmpge (pg, abstop, HugeExp); + oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); + + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow + or underflow. 
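+
+   The predicates above classify lanes by the top 12 bits alone; in scalar
+   form (a sketch, using the constants defined at the top of the file):
+
+     uint64_t abstop = (asuint64 (x) >> 52) & 0x7ff;
+     bool tiny = abstop < SmallExp;   // |x| <= 0x1p-54: exp(x) rounds to 1
+     bool huge = abstop >= HugeExp;   // |x| >= 1024: certain over/underflow
+
+   and lanes with 512 <= |x| < 1024 are the borderline ones handed to
+   sv_call_specialcase below.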
*/ + svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + + /* Update result with special and large cases. */ + z = sv_call_specialcase (tmp, sbits, ki, z, special); + + /* Handle underflow and overflow. */ + svbool_t x_is_neg = svcmplt (pg, x, 0); + svuint64_t sign_mask + = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); + svfloat64_t res_uoflow + = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); + res_uoflow = svreinterpret_f64 ( + svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); + /* Avoid spurious underflow for tiny x. */ + svfloat64_t res_spurious_uflow + = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); + + z = svsel (oflow, res_uoflow, z); + z = svsel (uflow, res_spurious_uflow, z); + return z; + } + + return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); +} + +static inline double +pow_sc (double x, double y) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + /* Special cases: |x| or |y| is 0, inf or nan. */ + if (unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + double_t x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + x2 = -x2; + return (iy >> 63) ? 1 / x2 : x2; + } + return x; +} + +svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* This preamble handles special case conditions used in the final scalar + fallbacks. It also updates ix and sign_bias, that are used in the core + computation too, i.e., exp( y * log (x) ). */ + svuint64_t vix0 = svreinterpret_u64 (x); + svuint64_t viy0 = svreinterpret_u64 (y); + + /* Negative x cases. */ + svbool_t xisneg = svcmplt (pg, x, 0); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ + svbool_t yint_or_xpos = pg; + svuint64_t sign_bias = sv_u64 (0); + svuint64_t vix = vix0; + if (unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ + yint_or_xpos = sv_isint (xisneg, y); + svbool_t yisodd_xisneg = sv_isodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ + vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); + } + + /* Small cases of x: |x| < 0x1p-126. */ + svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX); + if (unlikely (svptest_any (yint_or_xpos, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. */ + svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52); + svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0); + + svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); + vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); + vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52); + vix = svsel (topx_is_null, vix_norm, vix); + } + + /* y_hi = log(ix, &y_lo). */ + svfloat64_t vlo; + svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d); + + /* z = exp(y_hi, y_lo, sign_bias). 
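+   The product is formed in extended precision just below: a rounded high
+   part plus an exactly computed error term. Up to sign conventions, the
+   scalar equivalent is
+
+     double ehi = y * hi;             // rounded product
+     double emi = fma (y, hi, -ehi);  // exact rounding error of ehi
+     double elo = fma (y, lo, emi);   // fold in the low part of log(x)
+
+   and (ehi, elo) is then consumed by sv_exp_inline as argument plus tail.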
*/ + svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi); + svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi); + svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo); + svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d); + + /* Cases of finite y and finite negative x. */ + vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan (""))); + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0); + svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0); + svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial); + + /* Cases of zero/inf/nan x or y. */ + if (unlikely (svptest_any (svptrue_b64 (), special))) + vz = sv_call2_f64 (pow_sc, x, y, vz, special); + + return vz; +} + +TEST_SIG (SV, D, 2, pow) +TEST_ULP (SV_NAME_D2 (pow), 0.55) +TEST_DISABLE_FENV (SV_NAME_D2 (pow)) +/* Wide intervals spanning the whole domain but shared between x and y. */ +#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define EXPAND(str) str##000000000 +#define SHL52(str) EXPAND (str) +SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) +SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000) +SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000) +SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000) +SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000) +SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000) +SV_POW_INTERVAL2 (0, inf, 0, inf, 1000) +/* x~1 or y~1. */ +SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000) +SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) +SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) +/* around estimated argmaxs of ULP error. */ +SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) +SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) +/* x is negative, y is odd or even integer, or y is real not integer. */ +TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +/* |x| is inf, y is odd or even integer, or y is real not integer. */ +SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1) +SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1) +SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1) +SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1) +/* 0.0^y. */ +SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000) +/* 1.0^y. */ +TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/powf.c b/math/aarch64/sve/powf.c new file mode 100644 index 000000000000..8457e83e7495 --- /dev/null +++ b/math/aarch64/sve/powf.c @@ -0,0 +1,363 @@ +/* + * Single-precision SVE powf function. + * + * Copyright (c) 2023-2025, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* The following data is used in the SVE pow core computation + and special case detection. 
*/
+#define Tinvc __v_powf_data.invc
+#define Tlogc __v_powf_data.logc
+#define Texp __v_powf_data.scale
+#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
+#define Norm 0x1p23f /* 0x4b000000.  */
+
+/* Overall ULP error bound for pow is 2.6 ulp
+   ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
+static const struct data
+{
+  double log_poly[4];
+  double exp_poly[3];
+  float uflow_bound, oflow_bound, small_bound;
+  uint32_t sign_bias, subnormal_bias, off;
+} data = {
+  /* rel err: 1.5 * 2^-30. Each coefficient is multiplied by the value of
+     V_POWF_EXP2_N.  */
+  .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
+                -0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
+  /* rel err: 1.69 * 2^-34.  */
+  .exp_poly = {
+    0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3.  */
+    0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2.  */
+    0x1.62e42ff0c52d6p-6,  /* A3 / V_POWF_EXP2_N.  */
+  },
+  .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N.  */
+  .oflow_bound = 0x1p+12f,     /* 128.0 * V_POWF_EXP2_N.  */
+  .small_bound = 0x1p-126f,
+  .off = 0x3f35d000,
+  .sign_bias = SignBias,
+  .subnormal_bias = 0x0b800000, /* 23 << 23.  */
+};
+
+#define A(i) sv_f64 (d->log_poly[i])
+#define C(i) sv_f64 (d->exp_poly[i])
+
+/* Check if x is an integer.  */
+static inline svbool_t
+svisint (svbool_t pg, svfloat32_t x)
+{
+  return svcmpeq (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is real, not integer valued.  */
+static inline svbool_t
+svisnotint (svbool_t pg, svfloat32_t x)
+{
+  return svcmpne (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is an odd integer.  */
+static inline svbool_t
+svisodd (svbool_t pg, svfloat32_t x)
+{
+  svfloat32_t y = svmul_x (pg, x, 0.5f);
+  return svisnotint (pg, y);
+}
+
+/* Check if zero, inf or nan.  */
+static inline svbool_t
+sv_zeroinfnan (svbool_t pg, svuint32_t i)
+{
+  return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
+                  2u * 0x7f800000 - 1);
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
+   the bit representation of a non-zero finite floating-point value.  */
+static inline int
+checkint (uint32_t iy)
+{
+  int e = iy >> 23 & 0xff;
+  if (e < 0x7f)
+    return 0;
+  if (e > 0x7f + 23)
+    return 2;
+  if (iy & ((1 << (0x7f + 23 - e)) - 1))
+    return 0;
+  if (iy & (1 << (0x7f + 23 - e)))
+    return 1;
+  return 2;
+}
+
+/* Check if zero, inf or nan.  */
+static inline int
+zeroinfnan (uint32_t ix)
+{
+  return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
+}
+
+/* A scalar subroutine used to fix main power special cases. Similar to the
+   preamble of scalar powf except that we do not update ix and sign_bias.
+   This is done in the preamble of the SVE powf.  */
+static inline float
+powf_specialcase (float x, float y, float z)
+{
+  uint32_t ix = asuint (x);
+  uint32_t iy = asuint (y);
+  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
+  if (unlikely (zeroinfnan (iy)))
+    {
+      if (2 * iy == 0)
+        return issignalingf_inline (x) ? x + y : 1.0f;
+      if (ix == 0x3f800000)
+        return issignalingf_inline (y) ? x + y : 1.0f;
+      if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
+        return x + y;
+      if (2 * ix == 2 * 0x3f800000)
+        return 1.0f;
+      if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
+        return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
+      return y * y;
+    }
+  if (unlikely (zeroinfnan (ix)))
+    {
+      float_t x2 = x * x;
+      if (ix & 0x80000000 && checkint (iy) == 1)
+        x2 = -x2;
+      return iy & 0x80000000 ? 1 / x2 : x2;
+    }
+  /* We need a return here in case x < 0 and y is integer, but all other tests
+     need to be run.
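+   For reference, checkint classifies y from its bit pattern alone, e.g.
+
+     checkint (asuint (5.0f));   // 1: odd integer
+     checkint (asuint (4.0f));   // 2: even integer
+     checkint (asuint (0.5f));   // 0: not an integer
+
+   and it is this classification that decides the sign of the result for
+   negative x.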
*/ + return z; +} + +/* Scalar fallback for special case routines with custom signature. */ +static svfloat32_t NOINLINE +sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y) +{ + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1)); + svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2)); + svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial); + + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + float sx1 = svclastb (p, 0, x1); + float sx2 = svclastb (p, 0, x2); + float elem = svclastb (p, 0, y); + elem = powf_specialcase (sx1, sx2, elem); + svfloat32_t y2 = sv_f32 (elem); + y = svsel (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +/* Compute core for half of the lanes in double precision. */ +static inline svfloat64_t +sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, + svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx, + const struct data *d) +{ + svfloat64_t invc = svld1_gather_index (pg, Tinvc, i); + svfloat64_t logc = svld1_gather_index (pg, Tlogc, i); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc); + svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k)); + + /* Polynomial to approximate log1p(r)/ln2. */ + svfloat64_t logx = A (0); + logx = svmad_x (pg, r, logx, A (1)); + logx = svmad_x (pg, r, logx, A (2)); + logx = svmad_x (pg, r, logx, A (3)); + logx = svmad_x (pg, r, logx, y0); + *pylogx = svmul_x (pg, y, logx); + + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx); + svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd)); + + r = svsub_x (pg, *pylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + svuint64_t t = svld1_gather_index ( + svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1)); + svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias); + t = svadd_x (svptrue_b64 (), t, + svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS)); + svfloat64_t s = svreinterpret_f64 (t); + + svfloat64_t p = C (0); + p = svmla_x (pg, C (1), p, r); + p = svmla_x (pg, C (2), p, r); + p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r)); + + return p; +} + +/* Widen vector to double precision and compute core on both halves of the + vector. Lower cost of promotion by considering all lanes active. */ +static inline svfloat32_t +sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, + svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx, + const struct data *d) +{ + const svbool_t ptrue = svptrue_b64 (); + + /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two + in order to perform core computation in double precision. 
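+     Schematically, for a 128-bit vector (illustrative lane counts):
+
+       iz (4 x f32):          [ z0 z1 z2 z3 ]
+       svunpklo / svunpkhi -> [ z0 z1 ] and [ z2 z3 ]  (2 x f64 each)
+       ... core runs twice in double precision ...
+       svcvt_f32 + svuzp1  -> [ r0 r1 r2 r3 ]          (back to 4 x f32)
+
+     Promoting to double here is what lets powf meet its error bound without
+     carrying an explicit extended-precision tail in single precision.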
*/ + const svbool_t pg_lo = svunpklo (pg); + const svbool_t pg_hi = svunpkhi (pg); + svfloat64_t y_lo + = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); + svfloat64_t y_hi + = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); + svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz))); + svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz))); + svuint64_t i_lo = svunpklo (i); + svuint64_t i_hi = svunpkhi (i); + svint64_t k_lo = svunpklo (k); + svint64_t k_hi = svunpkhi (k); + svuint64_t sign_bias_lo = svunpklo (sign_bias); + svuint64_t sign_bias_hi = svunpkhi (sign_bias); + + /* Compute each part in double precision. */ + svfloat64_t ylogx_lo, ylogx_hi; + svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo, + sign_bias_lo, &ylogx_lo, d); + svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi, + sign_bias_hi, &ylogx_hi, d); + + /* Convert back to single-precision and interleave. */ + svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo); + svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi); + *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32); + svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo); + svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi); + return svuzp1 (lo_32, hi_32); +} + +/* Implementation of SVE powf. + Provides the same accuracy as AdvSIMD powf, since it relies on the same + algorithm. The theoretical maximum error is under 2.60 ULPs. + Maximum measured error is 2.57 ULPs: + SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127 + want 0x1.fff862p+127. */ +svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint32_t vix0 = svreinterpret_u32 (x); + svuint32_t viy0 = svreinterpret_u32 (y); + + /* Negative x cases. */ + svbool_t xisneg = svcmplt (pg, x, sv_f32 (0)); + + /* Set sign_bias and ix depending on sign of x and nature of y. */ + svbool_t yint_or_xpos = pg; + svuint32_t sign_bias = sv_u32 (0); + svuint32_t vix = vix0; + if (unlikely (svptest_any (pg, xisneg))) + { + /* Determine nature of y. */ + yint_or_xpos = svisint (xisneg, y); + svbool_t yisodd_xisneg = svisodd (xisneg, y); + /* ix set to abs(ix) if y is integer. */ + vix = svand_m (yint_or_xpos, vix0, 0x7fffffff); + /* Set to SignBias if x is negative and y is odd. */ + sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); + } + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (pg, vix0); + svbool_t yspecial = sv_zeroinfnan (pg, viy0); + svbool_t cmp = svorr_z (pg, xspecial, yspecial); + + /* Small cases of x: |x| < 0x1p-126. */ + svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound); + if (unlikely (svptest_any (yint_or_xpos, xsmall))) + { + /* Normalize subnormal x so exponent becomes negative. */ + svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); + vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff); + vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias); + vix = svsel (xsmall, vix_norm, vix); + } + /* Part of core computation carried in working precision. 
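+     In scalar form, the index extraction that follows is roughly
+
+       uint32_t tmp = ix - OFF;          // recentre near sqrt(2)/2
+       uint32_t i = (tmp >> (23 - V_POWF_LOG2_TABLE_BITS)) & (V_POWF_LOG2_N - 1);
+       uint32_t top = tmp & 0xff800000;  // sign and exponent bits of tmp
+       uint32_t iz = ix - top;           // mantissa recentred near 1.0
+       int32_t k = (int32_t) top >> (23 - V_POWF_EXP2_TABLE_BITS);
+
+     with k pre-scaled by 2^V_POWF_EXP2_TABLE_BITS so the exp2 stage can
+     reuse it directly.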
*/
+  svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
+  svuint32_t i = svand_x (
+      yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+      V_POWF_LOG2_N - 1);
+  svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
+  svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
+  svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
+                         (23 - V_POWF_EXP2_TABLE_BITS));
+
+  /* Compute core in extended precision and return intermediate ylogx results
+     to handle cases of underflow and overflow in exp.  */
+  svfloat32_t ylogx;
+  svfloat32_t ret
+      = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
+
+  /* Handle exp special cases of underflow and overflow.  */
+  svuint32_t sign
+      = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+  svfloat32_t ret_oflow
+      = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
+  svfloat32_t ret_uflow = svreinterpret_f32 (sign);
+  ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
+  ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
+
+  /* Cases of finite y and finite negative x.  */
+  ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
+
+  if (unlikely (svptest_any (cmp, cmp)))
+    return sv_call_powf_sc (x, y, ret);
+
+  return ret;
+}
+
+TEST_SIG (SV, F, 2, pow)
+TEST_ULP (SV_NAME_F2 (pow), 2.08)
+TEST_DISABLE_FENV (SV_NAME_F2 (pow))
+/* Wide intervals spanning the whole domain but shared between x and y.  */
+#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n)                              \
+  TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n)                    \
+  TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n)                  \
+  TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n)                  \
+  TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
+SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
+SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
+/* x~1 or y~1.  */
+SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
+SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
+SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
+/* around estimated argmaxs of ULP error.  */
+SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
+SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
+/* x is negative, y is odd or even integer, or y is real not integer.  */
+TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+/* |x| is inf, y is odd or even integer, or y is real not integer.  */
+SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
+SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
+SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
+SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
+/* 0.0^y.  */
+SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
+/* 1.0^y.  */
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/sin.c b/math/aarch64/sve/sin.c
new file mode 100644
index 000000000000..7e22515ceb79
--- /dev/null
+++ b/math/aarch64/sve/sin.c
@@ -0,0 +1,98 @@
+/*
+ * Double-precision SVE sin(x) function.
+ *
+ * Copyright (c) 2019-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  double inv_pi, pi_1, pi_2, pi_3, shift, range_val;
+  double poly[7];
+} data = {
+  .poly = { -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, -0x1.a01a019936f27p-13,
+            0x1.71de37a97d93ep-19, -0x1.ae633919987c6p-26,
+            0x1.60e277ae07cecp-33, -0x1.9e9540300a1p-41, },
+
+  .inv_pi = 0x1.45f306dc9c883p-2,
+  .pi_1 = 0x1.921fb54442d18p+1,
+  .pi_2 = 0x1.1a62633145c06p-53,
+  .pi_3 = 0x1.c1cd129024e09p-106,
+  .shift = 0x1.8p52,
+  .range_val = 0x1p23,
+};
+
+#define C(i) sv_f64 (d->poly[i])
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
+{
+  return sv_call_f64 (sin, x, y, cmp);
+}
+
+/* A fast SVE implementation of sin.
+   Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
+   is 2.87 ULP:
+   _ZGVsMxv_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
+                                       want 0x1.fffffffa7dc05p-1
+   Maximum observed error in the entire non-special domain ([-2^23, 2^23])
+   is 3.22 ULP:
+   _ZGVsMxv_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
+                                        want 0x1.ffdcd125c84f8p-3.  */
+svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* Load some values in quad-word chunks to minimise memory access.  */
+  const svbool_t ptrue = svptrue_b64 ();
+  svfloat64_t shift = sv_f64 (d->shift);
+  svfloat64_t inv_pi_and_pi1 = svld1rq (ptrue, &d->inv_pi);
+  svfloat64_t pi2_and_pi3 = svld1rq (ptrue, &d->pi_2);
+
+  /* n = rint(x/pi).  */
+  svfloat64_t n = svmla_lane (shift, x, inv_pi_and_pi1, 0);
+  svuint64_t odd = svlsl_x (pg, svreinterpret_u64 (n), 63);
+  n = svsub_x (pg, n, shift);
+
+  /* r = x - n*pi (range reduction into -pi/2 .. pi/2).  */
+  svfloat64_t r = x;
+  r = svmls_lane (r, n, inv_pi_and_pi1, 1);
+  r = svmls_lane (r, n, pi2_and_pi3, 0);
+  r = svmls_lane (r, n, pi2_and_pi3, 1);
+
+  /* sin(r) poly approx.  */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t r3 = svmul_x (pg, r2, r);
+  svfloat64_t r4 = svmul_x (pg, r2, r2);
+
+  svfloat64_t t1 = svmla_x (pg, C (4), C (5), r2);
+  svfloat64_t t2 = svmla_x (pg, C (2), C (3), r2);
+  svfloat64_t t3 = svmla_x (pg, C (0), C (1), r2);
+
+  svfloat64_t y = svmla_x (pg, t1, C (6), r4);
+  y = svmla_x (pg, t2, y, r4);
+  y = svmla_x (pg, t3, y, r4);
+  y = svmla_x (pg, r, y, r3);
+
+  svbool_t cmp = svacle (pg, x, d->range_val);
+  cmp = svnot_z (pg, cmp);
+  if (unlikely (svptest_any (pg, cmp)))
+    return special_case (x,
                         svreinterpret_f64 (sveor_z (
                             svnot_z (pg, cmp), svreinterpret_u64 (y), odd)),
                         cmp);
+
+  /* Copy sign.  */
+  return svreinterpret_f64 (sveor_z (pg, svreinterpret_u64 (y), odd));
+}
+
+TEST_SIG (SV, D, 1, sin, -3.1, 3.1)
+TEST_ULP (SV_NAME_D1 (sin), 2.73)
+TEST_DISABLE_FENV (SV_NAME_D1 (sin))
+TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0, 0x1p23, 1000000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0x1p23, inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/sincos.c b/math/aarch64/sve/sincos.c
new file mode 100644
index 000000000000..26b8bb3c6a5a
--- /dev/null
+++ b/math/aarch64/sve/sincos.c
@@ -0,0 +1,73 @@
+/*
+ * Double-precision vector sincos function.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincos declaration. If building
+   pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+   be linked against the scalar sincos from math/.
*/ +#define _GNU_SOURCE + +#include "sv_math.h" +#include "sv_sincos_common.h" +#include "test_defs.h" + +#include <math.h> + +/* sincos not available for all scalar libm implementations. */ +#ifndef __GLIBC__ +static void +sincos (double x, double *out_sin, double *out_cos) +{ + *out_sin = sin (x); + *out_cos = cos (x); +} +#endif + +static void NOINLINE +special_case (svfloat64_t x, svbool_t special, double *out_sin, + double *out_cos) +{ + svbool_t p = svptrue_pat_b64 (SV_VL1); + for (int i = 0; i < svcntd (); i++) + { + if (svptest_any (special, p)) + sincos (svlastb (p, x), out_sin + i, out_cos + i); + p = svpnext_b64 (svptrue_b64 (), p); + } +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + sv_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +void +_ZGVsMxvl8l8_sincos (svfloat64_t x, double *out_sin, double *out_cos, + svbool_t pg) +{ + const struct sv_sincos_data *d = ptr_barrier (&sv_sincos_data); + svbool_t special = check_ge_rangeval (pg, x, d); + + svfloat64x2_t sc = sv_sincos_inline (pg, x, d); + + svst1 (pg, out_sin, svget2 (sc, 0)); + svst1 (pg, out_cos, svget2 (sc, 1)); + + if (unlikely (svptest_any (pg, special))) + special_case (x, special, out_sin, out_cos); +} + +TEST_DISABLE_FENV (_ZGVsMxv_sincos_sin) +TEST_DISABLE_FENV (_ZGVsMxv_sincos_cos) +TEST_ULP (_ZGVsMxv_sincos_sin, 2.73) +TEST_ULP (_ZGVsMxv_sincos_cos, 2.73) +#define SV_SINCOS_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxv_sincos_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxv_sincos_cos, lo, hi, n) +SV_SINCOS_INTERVAL (0, 0x1p-63, 50000) +SV_SINCOS_INTERVAL (0x1p-63, 0x1p23, 500000) +SV_SINCOS_INTERVAL (0x1p23, inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sincosf.c b/math/aarch64/sve/sincosf.c new file mode 100644 index 000000000000..f3e956ee62e2 --- /dev/null +++ b/math/aarch64/sve/sincosf.c @@ -0,0 +1,74 @@ +/* + * Single-precision vector sincos function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincosf declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/. */ +#define _GNU_SOURCE + +#include "sv_math.h" +#include "sv_sincosf_common.h" +#include "test_defs.h" + +#include <math.h> + +/* sincos not available for all scalar libm implementations. */ +#ifndef __GLIBC__ +static void +sincosf (float x, float *out_sin, float *out_cos) +{ + *out_sin = sinf (x); + *out_cos = cosf (x); +} +#endif + +static void NOINLINE +special_case (svfloat32_t x, svbool_t special, float *out_sin, float *out_cos) +{ + svbool_t p = svptrue_pat_b32 (SV_VL1); + for (int i = 0; i < svcntw (); i++) + { + if (svptest_any (special, p)) + sincosf (svlastb (p, x), out_sin + i, out_cos + i); + p = svpnext_b32 (svptrue_b32 (), p); + } +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. 
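+
+   Out-of-range lanes are repaired by the special_case helper above, which
+   walks the vector one lane at a time: svptrue_pat_b32 (SV_VL1) selects
+   lane 0, svpnext_b32 advances the single-lane predicate, and only lanes
+   flagged in special trigger a scalar sincosf call that overwrites the
+   matching out_sin[i]/out_cos[i] slots. For example, with 4 lanes and
+   special = { 0, 1, 0, 1 }, scalar calls happen for i = 1 and i = 3 only.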
*/
+void
+_ZGVsMxvl4l4_sincosf (svfloat32_t x, float *out_sin, float *out_cos,
+                      svbool_t pg)
+{
+  const struct sv_sincosf_data *d = ptr_barrier (&sv_sincosf_data);
+  svbool_t special = check_ge_rangeval (pg, x, d);
+
+  svfloat32x2_t sc = sv_sincosf_inline (pg, x, d);
+
+  svst1_f32 (pg, out_sin, svget2 (sc, 0));
+  svst1_f32 (pg, out_cos, svget2 (sc, 1));
+
+  if (unlikely (svptest_any (pg, special)))
+    special_case (x, special, out_sin, out_cos);
+}
+
+TEST_DISABLE_FENV (_ZGVsMxv_sincosf_sin)
+TEST_DISABLE_FENV (_ZGVsMxv_sincosf_cos)
+TEST_ULP (_ZGVsMxv_sincosf_sin, 1.17)
+TEST_ULP (_ZGVsMxv_sincosf_cos, 1.31)
+#define SV_SINCOSF_INTERVAL(lo, hi, n)                                        \
+  TEST_SYM_INTERVAL (_ZGVsMxv_sincosf_sin, lo, hi, n)                         \
+  TEST_SYM_INTERVAL (_ZGVsMxv_sincosf_cos, lo, hi, n)
+SV_SINCOSF_INTERVAL (0, 0x1p-31, 50000)
+SV_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000)
+SV_SINCOSF_INTERVAL (0x1p20, inf, 10000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/sincospi.c b/math/aarch64/sve/sincospi.c
new file mode 100644
index 000000000000..d06ca8cc4165
--- /dev/null
+++ b/math/aarch64/sve/sincospi.c
@@ -0,0 +1,47 @@
+/*
+ * Double-precision SVE sincospi(x, *y, *z) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+#include "sv_sincospi_common.h"
+
+/* Double-precision vector function allowing calculation of both sinpi and
+   cospi in one function call, using shared argument reduction and polynomials.
+   Worst-case error for sin is 3.09 ULP:
+   _ZGVsMxvl8l8_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1
+                                                    want 0x1.fd54d0b327cf4p-1.
+   Worst-case error for cos is 3.16 ULP:
+   _ZGVsMxvl8l8_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1
+                                                    want 0x1.fd2da484ff402p-1.
+ */
+void
+_ZGVsMxvl8l8_sincospi (svfloat64_t x, double *out_sin, double *out_cos,
+                       svbool_t pg)
+{
+  const struct sv_sincospi_data *d = ptr_barrier (&sv_sincospi_data);
+
+  svfloat64x2_t sc = sv_sincospi_inline (pg, x, d);
+
+  svst1 (pg, out_sin, svget2 (sc, 0));
+  svst1 (pg, out_cos, svget2 (sc, 1));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (_ZGVsMxvl8l8_sincospi_sin)
+TEST_DISABLE_FENV (_ZGVsMxvl8l8_sincospi_cos)
+TEST_ULP (_ZGVsMxvl8l8_sincospi_sin, 2.59)
+TEST_ULP (_ZGVsMxvl8l8_sincospi_cos, 2.66)
+# define SV_SINCOSPI_INTERVAL(lo, hi, n)                                      \
+  TEST_SYM_INTERVAL (_ZGVsMxvl8l8_sincospi_sin, lo, hi, n)                    \
+  TEST_SYM_INTERVAL (_ZGVsMxvl8l8_sincospi_cos, lo, hi, n)
+SV_SINCOSPI_INTERVAL (0, 0x1p-63, 10000)
+SV_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000)
+SV_SINCOSPI_INTERVAL (0.5, 0x1p53, 50000)
+SV_SINCOSPI_INTERVAL (0x1p53, inf, 10000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/sincospif.c b/math/aarch64/sve/sincospif.c
new file mode 100644
index 000000000000..20476f9346e9
--- /dev/null
+++ b/math/aarch64/sve/sincospif.c
@@ -0,0 +1,46 @@
+/*
+ * Single-precision SVE sincospi(x, *y, *z) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_defs.h"
+#include "mathlib.h"
+#include "sv_sincospif_common.h"
+
+/* Single-precision vector function allowing calculation of both sinpi and
+   cospi in one function call, using shared argument reduction and polynomials.
+   Worst-case error for sin is 3.04 ULP:
+   _ZGVsMxvl4l4_sincospif_sin(0x1.b51b8p-2) got 0x1.f28b5ep-1 want
+   0x1.f28b58p-1.
+ Worst-case error for cos is 3.18 ULP: + _ZGVsMxvl4l4_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want + 0x1.f7cd5p-1. */ +void +_ZGVsMxvl4l4_sincospif (svfloat32_t x, float *out_sin, float *out_cos, + svbool_t pg) +{ + const struct sv_sincospif_data *d = ptr_barrier (&sv_sincospif_data); + + svfloat32x2_t sc = sv_sincospif_inline (pg, x, d); + + svst1 (pg, out_sin, svget2 (sc, 0)); + svst1 (pg, out_cos, svget2 (sc, 1)); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVsMxvl4l4_sincospif_sin) +TEST_DISABLE_FENV (_ZGVsMxvl4l4_sincospif_cos) +TEST_ULP (_ZGVsMxvl4l4_sincospif_sin, 2.54) +TEST_ULP (_ZGVsMxvl4l4_sincospif_cos, 2.68) +# define SV_SINCOSPIF_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxvl4l4_sincospif_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxvl4l4_sincospif_cos, lo, hi, n) +SV_SINCOSPIF_INTERVAL (0, 0x1p-31, 10000) +SV_SINCOSPIF_INTERVAL (0x1p-31, 0.5, 50000) +SV_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000) +SV_SINCOSPIF_INTERVAL (0x1p31, inf, 10000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sinf.c b/math/aarch64/sve/sinf.c new file mode 100644 index 000000000000..62127194d60f --- /dev/null +++ b/math/aarch64/sve/sinf.c @@ -0,0 +1,95 @@ +/* + * Single-precision SVE sin(x) function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float poly[4]; + /* Pi-related values to be loaded as one quad-word and used with + svmla_lane. */ + float negpi1, negpi2, negpi3, invpi; + float shift; +} data = { + .poly = { + /* Non-zero coefficients from the degree 9 Taylor series expansion of + sin. */ + -0x1.555548p-3f, 0x1.110df4p-7f, -0x1.9f42eap-13f, 0x1.5b2e76p-19f + }, + .negpi1 = -0x1.921fb6p+1f, + .negpi2 = 0x1.777a5cp-24f, + .negpi3 = 0x1.ee59dap-49f, + .invpi = 0x1.45f306p-2f, + .shift = 0x1.8p+23f +}; + +#define RangeVal 0x49800000 /* asuint32 (0x1p20f). */ +#define C(i) sv_f32 (d->poly[i]) + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) +{ + return sv_call_f32 (sinf, x, y, cmp); +} + +/* A fast SVE implementation of sinf. + Maximum error: 1.89 ULPs. + This maximum error is achieved at multiple values in [-2^18, 2^18] + but one example is: + SV_NAME_F1 (sin)(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */ +svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t sign + = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax)); + svbool_t cmp = svcmpge (pg, svreinterpret_u32 (ax), RangeVal); + + /* pi_vals are a quad-word of helper values - the first 3 elements contain + -pi in extended precision, the last contains 1 / pi. */ + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->negpi1); + + /* n = rint(|x|/pi). */ + svfloat32_t n = svmla_lane (sv_f32 (d->shift), ax, pi_vals, 3); + svuint32_t odd = svlsl_x (pg, svreinterpret_u32 (n), 31); + n = svsub_x (pg, n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + svfloat32_t r; + r = svmla_lane (ax, n, pi_vals, 0); + r = svmla_lane (r, n, pi_vals, 1); + r = svmla_lane (r, n, pi_vals, 2); + + /* sin(r) approx using a degree 9 polynomial from the Taylor series + expansion. Note that only the odd terms of this are non-zero. 
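+     Written out, with C0..C3 the coefficients above, the evaluation below
+     computes
+
+       sin(r) ~ r + C0*r^3 + C1*r^5 + C2*r^7 + C3*r^9
+              = r + r^3 * (C0 + r^2 * (C1 + r^2 * (C2 + r^2 * C3)))
+
+     with the final r^3 term folded into a fused multiply-add.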
*/ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t y; + y = svmla_x (pg, C (2), r2, C (3)); + y = svmla_x (pg, C (1), r2, y); + y = svmla_x (pg, C (0), r2, y); + y = svmla_x (pg, r, r, svmul_x (pg, y, r2)); + + /* sign = y^sign^odd. */ + sign = sveor_x (pg, sign, odd); + + if (unlikely (svptest_any (pg, cmp))) + return special_case (x, + svreinterpret_f32 (sveor_x ( + svnot_z (pg, cmp), svreinterpret_u32 (y), sign)), + cmp); + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); +} + +TEST_SIG (SV, F, 1, sin, -3.1, 3.1) +TEST_ULP (SV_NAME_F1 (sin), 1.40) +TEST_DISABLE_FENV (SV_NAME_F1 (sin)) +TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0, 0x1p23, 1000000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0x1p23, inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sinh.c b/math/aarch64/sve/sinh.c new file mode 100644 index 000000000000..8a35c1c38525 --- /dev/null +++ b/math/aarch64/sve/sinh.c @@ -0,0 +1,105 @@ +/* + * Double-precision SVE sinh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64_t poly[11]; + float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift; + uint64_t halff; + int64_t onef; + uint64_t large_bound; +} data = { + /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ + .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, + 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, + + .inv_ln2 = 0x1.71547652b82fep0, + .m_ln2_hi = -0x1.62e42fefa39efp-1, + .m_ln2_lo = -0x1.abc9e3b39803fp-56, + .shift = 0x1.8p52, + + .halff = 0x3fe0000000000000, + .onef = 0x3ff0000000000000, + /* 2^9. expm1 helper overflows for large input. */ + .large_bound = 0x4080000000000000, +}; + +static inline svfloat64_t +expm1_inline (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Reduce argument: + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where i = round(x / ln2) + and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ + svfloat64_t j + = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); + svint64_t i = svcvt_s64_x (pg, j); + svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi); + f = svmla_x (pg, f, j, d->m_ln2_lo); + /* Approximate expm1(f) using polynomial. */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t f4 = svmul_x (pg, f2, f2); + svfloat64_t f8 = svmul_x (pg, f4, f4); + svfloat64_t p + = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); + /* t = 2^i. */ + svfloat64_t t = svscale_x (pg, sv_f64 (1), i); + /* expm1(x) ~= p * t + (t - 1). */ + return svmla_x (pg, svsub_x (pg, t, 1.0), p, t); +} + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svbool_t pg) +{ + return sv_call_f64 (sinh, x, x, pg); +} + +/* Approximation for SVE double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2 + want 0x1.ab929fc64bd63p-2. 
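+ The identity used below follows from t = expm1(|x|):
+ e^|x| - e^-|x| = (t + 1) - 1/(t + 1) = t + t / (t + 1),
+ hence sinh(x) = 0.5 * sign(x) * (t + t / (t + 1)). 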
*/ +svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svuint64_t sign + = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); + svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff)); + + svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound); + + /* Fall back to scalar variant for all lanes if any are special. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, pg); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. This allows us to + retain acceptable accuracy for very small inputs. */ + svfloat64_t t = expm1_inline (ax, pg); + t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); + return svmul_x (pg, t, halfsign); +} + +TEST_SIG (SV, D, 1, sinh, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (sinh), 2.08) +TEST_DISABLE_FENV (SV_NAME_D1 (sinh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0, 0x1p-26, 1000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p9, inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sinhf.c b/math/aarch64/sve/sinhf.c new file mode 100644 index 000000000000..82b7ee442780 --- /dev/null +++ b/math/aarch64/sve/sinhf.c @@ -0,0 +1,65 @@ +/* + * Single-precision SVE sinh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_expm1f_inline.h" + +static const struct data +{ + struct sv_expm1f_data expm1f_consts; + uint32_t halff, large_bound; +} data = { + .expm1f_consts = SV_EXPM1F_DATA, + .halff = 0x3f000000, + /* 0x1.61814ep+6, above which expm1f helper overflows. */ + .large_bound = 0x42b0c0a7, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t pg) +{ + return sv_call_f32 (sinhf, x, y, pg); +} + +/* Approximation for SVE single-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The maximum error is 2.26 ULP: + _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4 + want 0x1.e469e4p-4. */ +svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svfloat32_t ax = svabs_x (pg, x); + svuint32_t sign + = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax)); + svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff)); + + svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound); + + /* Up to the point that expm1f overflows, we can use it to calculate sinhf + using a slight rearrangement of the definition of sinh. This allows us to + retain acceptable accuracy for very small inputs. */ + svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts); + t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); + + /* Fall back to the scalar variant for any lanes which would cause + expm1f to overflow. 
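+ Note t is computed before this test: sv_call_f32 in special_case only
+ replaces the special lanes, so the remaining lanes keep the vector result. 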
*/ + if (unlikely (svptest_any (pg, special))) + return special_case (x, svmul_x (pg, t, halfsign), special); + + return svmul_x (svptrue_b32 (), t, halfsign); +} + +TEST_SIG (SV, F, 1, sinh, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (sinh), 1.76) +TEST_DISABLE_FENV (SV_NAME_F1 (sinh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0, 0x1.6a09e8p-32, 1000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x1.6a09e8p-32, 0x42b0c0a7, 100000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sinpi.c b/math/aarch64/sve/sinpi.c new file mode 100644 index 000000000000..8fad3678b172 --- /dev/null +++ b/math/aarch64/sve/sinpi.c @@ -0,0 +1,62 @@ +/* + * Double-precision SVE sinpi(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "mathlib.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" + +static const struct data +{ + double poly[10], range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, + 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 }, + .range_val = 0x1p63, +}; + +/* A fast SVE implementation of sinpi. + Maximum error 3.10 ULP: + _ZGVsMxv_sinpi(0x1.df1a14f1b235p-2) got 0x1.fd64f541606cp-1 + want 0x1.fd64f541606c3p-1. */ +svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* range reduction into -1/2 .. 1/2 + with n = rint(x) and r = x - n. */ + svfloat64_t n = svrinta_x (pg, x); + svfloat64_t r = svsub_x (pg, x, n); + + /* Result should be negated based on if n is odd or not. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n)); + svuint64_t sign = svlsl_z (cmp, intn, 63); + + /* y = sin(r). */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly); + y = svmul_x (pg, y, r); + + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); +} + +#if WANT_TRIGPI_TESTS +TEST_ULP (SV_NAME_D1 (sinpi), 2.61) +TEST_DISABLE_FENV (SV_NAME_D1 (sinpi)) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p51, inf, 10000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sinpif.c b/math/aarch64/sve/sinpif.c new file mode 100644 index 000000000000..b91768a29cb6 --- /dev/null +++ b/math/aarch64/sve/sinpif.c @@ -0,0 +1,58 @@ +/* + * Single-precision SVE sinpi(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "mathlib.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" + +static const struct data +{ + float poly[6], range_val; +} data = { + /* Taylor series coefficients for sin(pi * x). */ + .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f, + 0x1.50783p-4f, -0x1.e30750p-8f }, + .range_val = 0x1p31, +}; + +/* A fast SVE implementation of sinpif. 
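+ sinpif(x) computes sinf(pi * x) from the reduced argument r = x - rint(x),
+ so no explicit pi * x product is rounded at working precision. 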
+ Maximum error 2.48 ULP: + _ZGVsMxv_sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1 + want 0x1.fa8c02p-1. */ +svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* range reduction into -1/2 .. 1/2 + with n = rint(x) and r = x - n. */ + svfloat32_t n = svrinta_x (pg, x); + svfloat32_t r = svsub_x (pg, x, n); + + /* Result should be negated based on if n is odd or not. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint32_t intn = svreinterpret_u32 (svcvt_s32_z (pg, n)); + svuint32_t sign = svlsl_z (cmp, intn, 31); + + /* y = sin(r). */ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly); + y = svmul_x (pg, y, r); + + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); +} + +#if WANT_TRIGPI_TESTS +TEST_ULP (SV_NAME_F1 (sinpi), 1.99) +TEST_DISABLE_FENV (SV_NAME_F1 (sinpi)) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0.5, 0x1p22f, 10000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p22f, inf, 10000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sv_expf_inline.h b/math/aarch64/sve/sv_expf_inline.h new file mode 100644 index 000000000000..6054e65bb202 --- /dev/null +++ b/math/aarch64/sve/sv_expf_inline.h @@ -0,0 +1,66 @@ +/* + * SVE helper for single-precision routines which calculate exp(x) and do + * not need special-case handling + * + * Copyright (c) 2023-2025, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_SV_EXPF_INLINE_H +#define MATH_SV_EXPF_INLINE_H + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +struct sv_expf_data +{ + float c1, c3, inv_ln2; + float ln2_lo, c0, c2, c4; + float ln2_hi, shift; +}; + +/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ +#define SV_EXPF_DATA \ + { \ + /* Coefficients copied from the polynomial in AdvSIMD variant. */ \ + .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \ + .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + .shift = 0x1.803f8p17f, \ + } + +static inline svfloat32_t +expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +{ + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo); + + /* n = round(x/(ln2/N)). */ + svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift); + svfloat32_t n = svsub_x (pg, z, d->shift); + + /* r = x - n*ln2/N. */ + svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x); + r = svmls_lane (r, n, lane_consts, 0); + + /* scale = 2^(n/N). */ + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. 
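+ The schedule below is pairwise: poly = C0 r + r2 * ((C1 + C2 r) + r2 * (C3 + C4 r)),
+ which should shorten the dependency chain relative to plain Horner. 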
*/ + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_lane (r, lane_consts, 1); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); +} + +#endif // MATH_SV_EXPF_INLINE_H diff --git a/math/aarch64/sve/sv_expm1f_inline.h b/math/aarch64/sve/sv_expm1f_inline.h new file mode 100644 index 000000000000..35892f519690 --- /dev/null +++ b/math/aarch64/sve/sv_expm1f_inline.h @@ -0,0 +1,69 @@ +/* + * SVE helper for single-precision routines which calculate exp(x) - 1 and do + * not need special-case handling + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_SV_EXPM1F_INLINE_H +#define MATH_SV_EXPM1F_INLINE_H + +#include "sv_math.h" + +struct sv_expm1f_data +{ + /* These 4 are grouped together so they can be loaded as one quadword, then + used with _lane forms of svmla/svmls. */ + float32_t c2, c4, ln2_hi, ln2_lo; + float c0, inv_ln2, c1, c3, special_bound; +}; + +/* Coefficients generated using fpminimax. */ +#define SV_EXPM1F_DATA \ + { \ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \ + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \ + \ + .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \ + } + +static inline svfloat32_t +expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) +{ + /* This vector is reliant on layout of data - it contains constants + that can be used with _lane forms of svmla/svmls. Values are: + [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */ + svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); + j = svrinta_x (pg, j); + + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); + f = svmls_lane (f, j, lane_constants, 3); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); + svfloat32_t p = svmla_x (pg, p12, f2, p34); + p = svmla_x (pg, sv_f32 (d->c0), f, p); + p = svmla_x (pg, f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); + return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); +} + +#endif // MATH_SV_EXPM1F_INLINE_H diff --git a/math/aarch64/sve/sv_log1p_inline.h b/math/aarch64/sve/sv_log1p_inline.h new file mode 100644 index 000000000000..86a5bb1456f6 --- /dev/null +++ b/math/aarch64/sve/sv_log1p_inline.h @@ -0,0 +1,96 @@ +/* + * Helper for SVE double-precision routines which calculate log(1 + x) and do + * not need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef MATH_SV_LOG1P_INLINE_H +#define MATH_SV_LOG1P_INLINE_H + +#include "sv_math.h" +#include "sv_poly_f64.h" + +static const struct sv_log1p_data +{ + double poly[19], ln2[2]; + uint64_t hf_rt2_top; + uint64_t one_m_hf_rt2_top; + uint32_t bottom_mask; + int64_t one_top; +} sv_log1p_data = { + /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. + */ + .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6 }, + .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, + .hf_rt2_top = 0x3fe6a09e00000000, + .one_m_hf_rt2_top = 0x00095f6200000000, + .bottom_mask = 0xffffffff, + .one_top = 0x3ff +}; + +static inline svfloat64_t +sv_log1p_inline (svfloat64_t x, const svbool_t pg) +{ + /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which + differs from v_log1p_2u5.c by: + - No special-case handling - this should be dealt with by the caller. + - Pairwise Horner polynomial evaluation for improved accuracy. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using svsel, for improved accuracy when the argument to log1p is close + to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. + See sv_log1p_2u1.c for details of the algorithm. */ + const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data); + svfloat64_t m = svadd_x (pg, x, 1); + svuint64_t mi = svreinterpret_u64 (m); + svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top); + + svint64_t ki + = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top); + svfloat64_t k = svcvt_f64_x (pg, ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + svuint64_t utop + = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hf_rt2_top); + svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, d->bottom_mask)); + svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + + /* Correction term c/m. */ + svfloat64_t c = svsub_x (pg, x, svsub_x (pg, m, 1)); + svfloat64_t cm; + +#ifndef WANT_SV_LOG1P_K0_SHORTCUT +# error \ + "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +#elif WANT_SV_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ + svbool_t knot0 = svcmpne (pg, k, 0); + cm = svdiv_z (knot0, c, m); + if (likely (!svptest_all (pg, knot0))) + { + f = svsel (knot0, f, x); + } +#else + /* No shortcut. */ + cm = svdiv_x (pg, c, m); +#endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + svfloat64_t f2 = svmul_x (pg, f, f); + svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. 
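+ k * log2 is added in two pieces (ln2[0] and ln2[1]) onto separate partial
+ sums below, reducing the rounding error of accumulating one wide term. 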
*/ + svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]); + svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]); + + return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); +} +#endif // MATH_SV_LOG1P_INLINE_H diff --git a/math/aarch64/sve/sv_log1pf_inline.h b/math/aarch64/sve/sv_log1pf_inline.h new file mode 100644 index 000000000000..238079c61a5b --- /dev/null +++ b/math/aarch64/sve/sv_log1pf_inline.h @@ -0,0 +1,83 @@ +/* + * Helper for SVE routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_SV_LOG1PF_INLINE_H +#define MATH_SV_LOG1PF_INLINE_H + +#define SignExponentMask 0xff800000 + +static const struct sv_log1pf_data +{ + float c0, c2, c4, c6; + float c1, c3, c5, c7; + float ln2, exp_bias, quarter; + uint32_t four, three_quarters; +} sv_log1pf_data = { + /* Do not store first term of polynomial, which is -0.5, as + this can be fmov-ed directly instead of including it in + the main load-and-mla polynomial schedule. */ + .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f, + .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f, + .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f, + .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000, + .three_quarters = 0x3f400000, +}; + +static inline svfloat32_t +sv_log1pf_inline (svfloat32_t x, svbool_t pg) +{ + const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data); + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + svfloat32_t m = svadd_x (pg, x, 1); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + svint32_t k + = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), + sv_s32 (SignExponentMask)); + + /* Scale x by exponent manipulation. */ + svfloat32_t m_scale = svreinterpret_f32 ( + svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); + svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2); + m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2)); + + /* Evaluate polynomial on reduced interval. */ + svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale); + + svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1); + svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0); + svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1); + svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2); + svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3); + + svfloat32_t p = svmla_x (pg, p45, p67, ms2); + p = svmla_x (pg, p23, p, ms2); + p = svmla_x (pg, p01, p, ms2); + + p = svmad_x (pg, m_scale, p, -0.5); + p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. 
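+ Concretely, scale_back = float(k) * 2^-23, and the result is then
+ reconstructed below as p + scale_back * ln2 using the same lane constants. 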
*/ + svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1); + return svmla_lane_f32 (p, scale_back, fconst, 0); +} + +#endif // MATH_SV_LOG1PF_INLINE_H diff --git a/math/aarch64/sve/sv_log_inline.h b/math/aarch64/sve/sv_log_inline.h new file mode 100644 index 000000000000..a1b169a0b727 --- /dev/null +++ b/math/aarch64/sve/sv_log_inline.h @@ -0,0 +1,83 @@ +/* + * Double-precision vector log(x) function - inline version + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "math_config.h" + +#ifndef SV_LOG_INLINE_POLY_ORDER +# error Cannot use inline log helper without specifying poly order (options are 4 or 5) +#endif + +#if SV_LOG_INLINE_POLY_ORDER == 4 +# define POLY \ + { \ + -0x1.ffffffffcbad3p-2, 0x1.555555578ed68p-2, -0x1.0000d3a1e7055p-2, \ + 0x1.999392d02a63ep-3 \ + } +#elif SV_LOG_INLINE_POLY_ORDER == 5 +# define POLY \ + { \ + -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, \ + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 \ + } +#else +# error Can only choose order 4 or 5 for log poly +#endif + +struct sv_log_inline_data +{ + double poly[SV_LOG_INLINE_POLY_ORDER]; + double ln2; + uint64_t off, sign_exp_mask; +}; + +#define SV_LOG_CONSTANTS \ + { \ + .poly = POLY, .ln2 = 0x1.62e42fefa39efp-1, \ + .sign_exp_mask = 0xfff0000000000000, .off = 0x3fe6900900000000 \ + } + +#define P(i) sv_f64 (d->poly[i]) +#define N (1 << V_LOG_TABLE_BITS) + +static inline svfloat64_t +sv_log_inline (svbool_t pg, svfloat64_t x, const struct sv_log_inline_data *d) +{ + svuint64_t ix = svreinterpret_u64 (x); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, d->off); + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. + The actual value of i is double this due to table layout. */ + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); + svint64_t k + = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */ + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); + + /* Lookup in 2 global lists (length N). */ + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + svfloat64_t r = svmad_x (pg, invc, z, -1); + svfloat64_t kd = svcvt_f64_x (pg, k); + /* hi = r + log(c) + k*Ln2. */ + svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = svmla_x (pg, P (2), r, P (3)); + svfloat64_t p = svmla_x (pg, P (0), r, P (1)); +#if SV_LOG_INLINE_POLY_ORDER == 5 + y = svmla_x (pg, y, r2, P (4)); +#endif + y = svmla_x (pg, p, r2, y); + return svmla_x (pg, hi, r2, y); +} diff --git a/math/aarch64/sve/sv_math.h b/math/aarch64/sve/sv_math.h new file mode 100644 index 000000000000..db688a893032 --- /dev/null +++ b/math/aarch64/sve/sv_math.h @@ -0,0 +1,145 @@ +/* + * Wrapper functions for SVE ACLE. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef SV_MATH_H +#define SV_MATH_H + +/* Enable SVE in this translation unit. 
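+ (With GCC a plain target pragma is used instead, so CLOSE_SVE_ATTR
+ expands to nothing there.)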
Note, because this is 'pushed' in + clang, any file including sv_math.h will have to pop it back off again by + ending the source file with CLOSE_SVE_ATTR. It is important that sv_math.h + is included first so that all functions have the target attribute. */ +#ifdef __clang__ +# pragma clang attribute push(__attribute__((target("sve"))), \ + apply_to = any(function)) +# define CLOSE_SVE_ATTR _Pragma("clang attribute pop") +#else +# pragma GCC target("+sve") +# define CLOSE_SVE_ATTR +#endif + +#include <arm_sve.h> +#include <stdbool.h> + +#include "math_config.h" + +#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f +#define SV_NAME_D1(fun) _ZGVsMxv_##fun +#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f +#define SV_NAME_D2(fun) _ZGVsMxvv_##fun +#define SV_NAME_F1_L1(fun) _ZGVsMxvl4_##fun##f +#define SV_NAME_D1_L1(fun) _ZGVsMxvl8_##fun +#define SV_NAME_F1_L2(fun) _ZGVsMxvl4l4_##fun##f + +/* Double precision. */ +static inline svint64_t +sv_s64 (int64_t x) +{ + return svdup_s64 (x); +} + +static inline svuint64_t +sv_u64 (uint64_t x) +{ + return svdup_u64 (x); +} + +static inline svfloat64_t +sv_f64 (double x) +{ + return svdup_f64 (x); +} + +static inline svfloat64_t +sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + double elem = svclastb (p, 0, x); + elem = (*f) (elem); + svfloat64_t y2 = sv_f64 (elem); + y = svsel (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +static inline svfloat64_t +sv_call2_f64 (double (*f) (double, double), svfloat64_t x1, svfloat64_t x2, + svfloat64_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + double elem1 = svclastb (p, 0, x1); + double elem2 = svclastb (p, 0, x2); + double ret = (*f) (elem1, elem2); + svfloat64_t y2 = sv_f64 (ret); + y = svsel (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +static inline svuint64_t +sv_mod_n_u64_x (svbool_t pg, svuint64_t x, uint64_t y) +{ + svuint64_t q = svdiv_x (pg, x, y); + return svmls_x (pg, x, q, y); +} + +/* Single precision. */ +static inline svint32_t +sv_s32 (int32_t x) +{ + return svdup_s32 (x); +} + +static inline svuint32_t +sv_u32 (uint32_t x) +{ + return svdup_u32 (x); +} + +static inline svfloat32_t +sv_f32 (float x) +{ + return svdup_f32 (x); +} + +static inline svfloat32_t +sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + float elem = svclastb (p, 0, x); + elem = (*f) (elem); + svfloat32_t y2 = sv_f32 (elem); + y = svsel (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +static inline svfloat32_t +sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2, + svfloat32_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + float elem1 = svclastb (p, 0, x1); + float elem2 = svclastb (p, 0, x2); + float ret = (*f) (elem1, elem2); + svfloat32_t y2 = sv_f32 (ret); + y = svsel (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} +#endif diff --git a/math/aarch64/sve/sv_poly_f32.h b/math/aarch64/sve/sv_poly_f32.h new file mode 100644 index 000000000000..2d73014a4b45 --- /dev/null +++ b/math/aarch64/sve/sv_poly_f32.h @@ -0,0 +1,26 @@ +/* + * Helpers for evaluating polynomials on single-precision SVE input, using + * various schemes. + * + * Copyright (c) 2023-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_POLY_SVE_F32_H +#define MATH_POLY_SVE_F32_H + +#include <arm_sve.h> + +/* Wrap SVE f32 helpers: evaluation of some scheme/order has form: + sv_[scheme]_[order]_f32_x. */ +#define VTYPE svfloat32_t +#define STYPE float +#define VWRAP(f) sv_##f##_f32_x +#define DUP svdup_f32 +#include "sv_poly_generic.h" +#undef DUP +#undef VWRAP +#undef STYPE +#undef VTYPE + +#endif diff --git a/math/aarch64/sve/sv_poly_f64.h b/math/aarch64/sve/sv_poly_f64.h new file mode 100644 index 000000000000..f92be9bf8e9c --- /dev/null +++ b/math/aarch64/sve/sv_poly_f64.h @@ -0,0 +1,26 @@ +/* + * Helpers for evaluating polynomials on double-precision SVE input, using + * various schemes. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_POLY_SVE_F64_H +#define MATH_POLY_SVE_F64_H + +#include <arm_sve.h> + +/* Wrap SVE f64 helpers: evaluation of some scheme/order has form: + sv_[scheme]_[order]_f64_x. */ +#define VTYPE svfloat64_t +#define STYPE double +#define VWRAP(f) sv_##f##_f64_x +#define DUP svdup_f64 +#include "sv_poly_generic.h" +#undef DUP +#undef VWRAP +#undef STYPE +#undef VTYPE + +#endif diff --git a/math/aarch64/sve/sv_poly_generic.h b/math/aarch64/sve/sv_poly_generic.h new file mode 100644 index 000000000000..a1fc59baa8d3 --- /dev/null +++ b/math/aarch64/sve/sv_poly_generic.h @@ -0,0 +1,331 @@ +/* + * Helpers for evaluating polynomials with various schemes - specific to SVE + * but precision-agnostic. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef VTYPE +# error Cannot use poly_generic without defining VTYPE +#endif +#ifndef STYPE +# error Cannot use poly_generic without defining STYPE +#endif +#ifndef VWRAP +# error Cannot use poly_generic without defining VWRAP +#endif +#ifndef DUP +# error Cannot use poly_generic without defining DUP +#endif + +static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + /* At order 3, Estrin and Pairwise Horner are identical. 
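+ Both reduce to p(x) = (c0 + c1 x) + x2 * (c2 + c3 x), as computed below. 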
*/ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + return svmla_x (pg, p01, p23, x2); +} + +static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + return svmla_x (pg, p03, x4, poly[4]); +} +static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + return svmla_x (pg, p03, p45, x4); +} +static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + VTYPE p46 = svmla_x (pg, p45, x2, poly[6]); + return svmla_x (pg, p03, p46, x4); +} +static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + const STYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); + VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 4); + return svmla_x (pg, p03, p47, x4); +} +static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]); +} +static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, + VTYPE x8, const STYPE *poly) +{ + VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8); +} +static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); + VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8); +} +static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 8); + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8); +} +static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_4) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_5) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_6) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), + VWRAP (estrin_7) (pg, x, x2, x4, poly + 8), x8); +} +static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16, + poly[16]); +} +static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); + return svmla_x (pg, VWRAP (estrin_15) (pg, 
x, x2, x4, x8, poly), p16_17, + x16); +} +static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); + VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]); + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18, + x16); +} +static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2, + VTYPE x4, VTYPE x8, VTYPE x16, + const STYPE *poly) +{ + return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), + VWRAP (pairwise_poly_3) (pg, x, x2, poly + 16), x16); +} + +static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]); + p = svmad_x (pg, x, p, poly[1]); + p = svmad_x (pg, x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]); + p = svmad_x (pg, x, p, poly[2]); + p = svmad_x (pg, x, p, poly[1]); + p = svmad_x (pg, x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE +VWRAP (horner_10) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE +VWRAP (horner_11) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_10) (pg, x, poly + 1), poly[0]); +} +static inline VTYPE +VWRAP (horner_12) (svbool_t pg, VTYPE x, const STYPE *poly) +{ + return svmad_x (pg, x, VWRAP (horner_11) (pg, x, poly + 1), poly[0]); +} + +static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + VTYPE p; + p = svmla_x (pg, p23, x2, poly[4]); + p = svmla_x (pg, p01, x2, p); + return p; +} +static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); + VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); + VTYPE p; + p = svmla_x (pg, p23, x2, p45); + p = svmla_x (pg, p01, x2, p); + return p; +} +static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p26); +} +static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p27); +} +static inline VTYPE 
VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p28); +} +static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p29); +} +static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_10); +} +static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_11); +} +static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_12); +} +static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_13); +} +static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_14); +} +static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_15); +} +static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_16); +} +static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_17); +} +static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly) +{ + VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly + 2); + VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); + return svmla_x (pg, p01, x2, p2_18); +} + +static inline VTYPE VWRAP (lw_pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly_even, + const STYPE *poly_odd) +{ + VTYPE c13 = svld1rq (pg, poly_odd); + + VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0); + VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1); + VTYPE p45 = svmla_x (pg, DUP (poly_even[2]), x, poly_odd[2]); + + VTYPE p; + p = svmla_x (pg, p23, x2, p45); + p = svmla_x (pg, p01, x2, p); + return p; +} +static inline VTYPE VWRAP (lw_pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly_even, + const STYPE *poly_odd) +{ + VTYPE c13 = svld1rq (pg, poly_odd); + + VTYPE p49 = VWRAP (lw_pw_horner_5) (pg, x, x2, poly_even + 2, poly_odd + 2); + VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1); + + VTYPE p29 = 
svmla_x (pg, p23, x2, p49); + VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0); + + return svmla_x (pg, p01, x2, p29); +} diff --git a/math/aarch64/sve/sv_sincos_common.h b/math/aarch64/sve/sv_sincos_common.h new file mode 100644 index 000000000000..2a537da157b0 --- /dev/null +++ b/math/aarch64/sve/sv_sincos_common.h @@ -0,0 +1,85 @@ +/* + * Core approximation for double-precision vector sincos + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f64.h" + +static const struct sv_sincos_data +{ + double sin_poly[7], cos_poly[6], pio2[3]; + double inv_pio2, shift, range_val; +} sv_sincos_data = { + .inv_pio2 = 0x1.45f306dc9c882p-1, + .pio2 = { 0x1.921fb50000000p+0, 0x1.110b460000000p-26, + 0x1.1a62633145c07p-54 }, + .shift = 0x1.8p52, + .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */ + -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, + -0x1.a01a019936f27p-13, 0x1.71de37a97d93ep-19, + -0x1.ae633919987c6p-26, 0x1.60e277ae07cecp-33, + -0x1.9e9540300a1p-41 }, + .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */ + 0x1.555555555554cp-5, -0x1.6c16c16c1521fp-10, + 0x1.a01a019cbf62ap-16, -0x1.27e4f812b681ep-22, + 0x1.1ee9f152a57cdp-29, -0x1.8fb131098404bp-37 }, + .range_val = 0x1p23, }; + +static inline svbool_t +check_ge_rangeval (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d) +{ + svbool_t in_bounds = svaclt (pg, x, d->range_val); + return svnot_z (pg, in_bounds); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +static inline svfloat64x2_t +sv_sincos_inline (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d) +{ + /* q = nearest integer to 2 * x / pi. */ + svfloat64_t q = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_pio2), + d->shift); + svint64_t n = svcvt_s64_x (pg, q); + + /* Reduce x such that r is in [ -pi/4, pi/4 ]. */ + svfloat64_t r = x; + r = svmls_x (pg, r, q, d->pio2[0]); + r = svmls_x (pg, r, q, d->pio2[1]); + r = svmls_x (pg, r, q, d->pio2[2]); + + svfloat64_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r2, r), + r4 = svmul_x (pg, r2, r2); + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + svfloat64_t s = sv_pw_horner_6_f64_x (pg, r2, r4, d->sin_poly); + s = svmla_x (pg, r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ + svfloat64_t c = sv_pw_horner_5_f64_x (pg, r2, r4, d->cos_poly); + c = svmad_x (pg, c, r2, -0.5); + c = svmad_x (pg, c, r2, 1); + + svuint64_t un = svreinterpret_u64 (n); + /* If odd quadrant, swap cos and sin. */ + svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 63), 0); + svfloat64_t ss = svsel (swap, s, c); + svfloat64_t cc = svsel (swap, c, s); + + /* Fix signs according to quadrant. + ss = asdouble(asuint64(ss) ^ ((n & 2) << 62)) + cc = asdouble(asuint64(cc) ^ (((n + 1) & 2) << 62)). 
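+ For example n = 2 (x near pi): (2 & 2) << 62 sets the sign bit of ss and
+ ((2 + 1) & 2) << 62 that of cc, giving sin(x) = -sin(r), cos(x) = -cos(r). 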
*/ + svuint64_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 62); + svuint64_t cos_sign = svlsl_x ( + pg, svand_x (pg, svreinterpret_u64 (svadd_x (pg, n, 1)), 2), 62); + ss = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ss), sin_sign)); + cc = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (cc), cos_sign)); + + return svcreate2 (ss, cc); +} diff --git a/math/aarch64/sve/sv_sincosf_common.h b/math/aarch64/sve/sv_sincosf_common.h new file mode 100644 index 000000000000..bda89ed24680 --- /dev/null +++ b/math/aarch64/sve/sv_sincosf_common.h @@ -0,0 +1,81 @@ +/* + * Core approximation for single-precision vector sincos + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" + +const static struct sv_sincosf_data +{ + float poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val; +} sv_sincosf_data = { + .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */ + -0x1.555546p-3, 0x1.11076p-7, -0x1.994eb4p-13 }, + .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */ + 0x1.55554ap-5, -0x1.6c0c1ap-10, 0x1.99e0eep-16 }, + .pio2 = { 0x1.921fb6p+0f, -0x1.777a5cp-25f, -0x1.ee59dap-50f }, + .inv_pio2 = 0x1.45f306p-1f, + .shift = 0x1.8p23, + .range_val = 0x1p20 +}; + +static inline svbool_t +check_ge_rangeval (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d) +{ + svbool_t in_bounds = svaclt (pg, x, d->range_val); + return svnot_z (pg, in_bounds); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +static inline svfloat32x2_t +sv_sincosf_inline (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d) +{ + /* n = rint ( x / (pi/2) ). */ + svfloat32_t q = svmla_x (pg, sv_f32 (d->shift), x, d->inv_pio2); + q = svsub_x (pg, q, d->shift); + svint32_t n = svcvt_s32_x (pg, q); + + /* Reduce x such that r is in [ -pi/4, pi/4 ]. */ + svfloat32_t r = x; + r = svmls_x (pg, r, q, d->pio2[0]); + r = svmls_x (pg, r, q, d->pio2[1]); + r = svmls_x (pg, r, q, d->pio2[2]); + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + svfloat32_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r, r2); + svfloat32_t s = svmla_x (pg, sv_f32 (d->poly_sin[1]), r2, d->poly_sin[2]); + s = svmad_x (pg, r2, s, d->poly_sin[0]); + s = svmla_x (pg, r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ + svfloat32_t r4 = svmul_x (pg, r2, r2); + svfloat32_t p = svmla_x (pg, sv_f32 (d->poly_cos[1]), r2, d->poly_cos[2]); + svfloat32_t c = svmad_x (pg, sv_f32 (d->poly_cos[0]), r2, -0.5); + c = svmla_x (pg, c, r4, p); + c = svmad_x (pg, r2, c, 1); + + svuint32_t un = svreinterpret_u32 (n); + /* If odd quadrant, swap cos and sin. */ + svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 31), 0); + svfloat32_t ss = svsel (swap, s, c); + svfloat32_t cc = svsel (swap, c, s); + + /* Fix signs according to quadrant. + ss = asfloat(asuint(ss) ^ ((n & 2) << 30)) + cc = asfloat(asuint(cc) ^ (((n + 1) & 2) << 30)). 
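+ For example n = 1 (x near pi/2): the odd quadrant swaps kernels, so
+ ss = c(r) with no sign flip (1 & 2 = 0), while cc = s(r) is negated since
+ ((1 + 1) & 2) << 30 sets the sign bit: cos(x) = -sin(r). 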
*/ + svuint32_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 30); + svuint32_t cos_sign = svlsl_x ( + pg, svand_x (pg, svreinterpret_u32 (svadd_x (pg, n, 1)), 2), 30); + ss = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ss), sin_sign)); + cc = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (cc), cos_sign)); + + return svcreate2 (ss, cc); +} diff --git a/math/aarch64/sve/sv_sincospi_common.h b/math/aarch64/sve/sv_sincospi_common.h new file mode 100644 index 000000000000..672ebbc8e855 --- /dev/null +++ b/math/aarch64/sve/sv_sincospi_common.h @@ -0,0 +1,76 @@ +/* + * Core approximation for double-precision SVE sincospi + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f64.h" + +static const struct sv_sincospi_data +{ + double c0, c2, c4, c6, c8; + double c1, c3, c5, c7, c9; + double range_val; +} sv_sincospi_data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .c0 = 0x1.921fb54442d184p1, + .c1 = -0x1.4abbce625be53p2, + .c2 = 0x1.466bc6775ab16p1, + .c3 = -0x1.32d2cce62dc33p-1, + .c4 = 0x1.507834891188ep-4, + .c5 = -0x1.e30750a28c88ep-8, + .c6 = 0x1.e8f48308acda4p-12, + .c7 = -0x1.6fc0032b3c29fp-16, + .c8 = 0x1.af86ae521260bp-21, + .c9 = -0x1.012a9870eeb7dp-25, + /* Exclusive upper bound for a signed integer. */ + .range_val = 0x1p63 +}; + +/* Double-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.09 ULP: + _ZGVsMxvl8l8_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1 + want 0x1.fd54d0b327cf4p-1. + Worst-case error for cos is 3.16 ULP: + _ZGVsMxvl8l8_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1 + want 0x1.fd2da484ff402p-1. + */ +static inline svfloat64x2_t +sv_sincospi_inline (svbool_t pg, svfloat64_t x, + const struct sv_sincospi_data *d) +{ + const svbool_t pt = svptrue_b64 (); + + /* r = x - rint(x). */ + /* pt hints unpredicated instruction. */ + svfloat64_t rx = svrinta_x (pg, x); + svfloat64_t sr = svsub_x (pt, x, rx); + + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + svfloat64_t cr = svsubr_x (pg, svabs_x (pg, sr), 0.5); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + /* pt hints unpredicated instruction. */ + svfloat64_t sr2 = svmul_x (pt, sr, sr); + svfloat64_t cr2 = svmul_x (pt, cr, cr); + svfloat64_t sr4 = svmul_x (pt, sr2, sr2); + svfloat64_t cr4 = svmul_x (pt, cr2, cr2); + + /* If rint(x) is odd, the sign of the result should be inverted for sinpi and + re-introduced for cospi. cmp filters rxs that saturate to max sint. 
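+ For example x = 2.5: rint(x) = 3 is odd, sr = -0.5, and the sign flip
+ gives sinpi(2.5) = -sin(-pi/2) = 1, matching sin(2.5 * pi). 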
*/ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint64_t odd = svlsl_x (pt, svreinterpret_u64 (svcvt_s64_z (pg, rx)), 63); + sr = svreinterpret_f64 (sveor_x (pt, svreinterpret_u64 (sr), odd)); + cr = svreinterpret_f64 (sveor_m (cmp, svreinterpret_u64 (cr), odd)); + + svfloat64_t sinpix = svmul_x ( + pt, sv_lw_pw_horner_9_f64_x (pg, sr2, sr4, &(d->c0), &(d->c1)), sr); + svfloat64_t cospix = svmul_x ( + pt, sv_lw_pw_horner_9_f64_x (pg, cr2, cr4, &(d->c0), &(d->c1)), cr); + + return svcreate2 (sinpix, cospix); +} diff --git a/math/aarch64/sve/sv_sincospif_common.h b/math/aarch64/sve/sv_sincospif_common.h new file mode 100644 index 000000000000..4b9101de74ed --- /dev/null +++ b/math/aarch64/sve/sv_sincospif_common.h @@ -0,0 +1,82 @@ +/* + * Helper for single-precision SVE sincospi + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f32.h" + +const static struct sv_sincospif_data +{ + float c0, c2, c4; + float c1, c3, c5; + float range_val; +} sv_sincospif_data = { + /* Taylor series coefficients for sin(pi * x). */ + .c0 = 0x1.921fb6p1f, + .c1 = -0x1.4abbcep2f, + .c2 = 0x1.466bc6p1f, + .c3 = -0x1.32d2ccp-1f, + .c4 = 0x1.50783p-4f, + .c5 = -0x1.e30750p-8f, + /* Exclusive upper bound for a signed integer. */ + .range_val = 0x1p31f, +}; + +/* Single-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.04 ULP: + _ZGVsMxvl4l4_sincospif_sin(0x1.b51b8p-2) got 0x1.f28b5ep-1 want + 0x1.f28b58p-1. + Worst-case error for cos is 3.18 ULP: + _ZGVsMxvl4l4_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want + 0x1.f7cd5p-1. */ +static inline svfloat32x2_t +sv_sincospif_inline (svbool_t pg, svfloat32_t x, + const struct sv_sincospif_data *d) +{ + const svbool_t pt = svptrue_b32 (); + + /* r = x - rint(x). */ + svfloat32_t rx = svrinta_x (pg, x); + svfloat32_t sr = svsub_x (pt, x, rx); + + /* cospi(x) = sinpi(0.5 - abs(r)) for values -1/2 .. 1/2. */ + svfloat32_t cr = svsubr_x (pt, svabs_x (pg, sr), 0.5f); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + svfloat32_t sr2 = svmul_x (pt, sr, sr); + svfloat32_t sr4 = svmul_x (pt, sr2, sr2); + svfloat32_t cr2 = svmul_x (pt, cr, cr); + svfloat32_t cr4 = svmul_x (pt, cr2, cr2); + + /* If rint(x) is odd, the sign of the result should be inverted for sinpi and + re-introduced for cospi. cmp filters rxs that saturate to max sint. 
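+ For example x = 0.25: rint(x) = 0, sr = 0.25, cr = 0.5 - 0.25 = 0.25, so
+ cospi(0.25) = sinpi(0.25) = sin(pi/4), as the identity above requires. 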
*/ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint32_t odd = svlsl_x (pt, svreinterpret_u32 (svcvt_s32_z (pg, rx)), 31); + sr = svreinterpret_f32 (sveor_x (pt, svreinterpret_u32 (sr), odd)); + cr = svreinterpret_f32 (sveor_m (cmp, svreinterpret_u32 (cr), odd)); + + svfloat32_t c135 = svld1rq_f32 (svptrue_b32 (), &d->c1); + + svfloat32_t sp01 = svmla_lane (sv_f32 (d->c0), sr2, c135, 0); + svfloat32_t sp23 = svmla_lane (sv_f32 (d->c2), sr2, c135, 1); + svfloat32_t sp45 = svmla_lane (sv_f32 (d->c4), sr2, c135, 2); + + svfloat32_t cp01 = svmla_lane (sv_f32 (d->c0), cr2, c135, 0); + svfloat32_t cp23 = svmla_lane (sv_f32 (d->c2), cr2, c135, 1); + svfloat32_t cp45 = svmla_lane (sv_f32 (d->c4), cr2, c135, 2); + + svfloat32_t sp = svmla_x (pg, sp23, sr4, sp45); + svfloat32_t cp = svmla_x (pg, cp23, cr4, cp45); + + sp = svmla_x (pg, sp01, sr4, sp); + cp = svmla_x (pg, cp01, cr4, cp); + + svfloat32_t sinpix = svmul_x (pt, sp, sr); + svfloat32_t cospix = svmul_x (pt, cp, cr); + + return svcreate2 (sinpix, cospix); +} diff --git a/math/aarch64/sve/tan.c b/math/aarch64/sve/tan.c new file mode 100644 index 000000000000..1dfc5c422d5e --- /dev/null +++ b/math/aarch64/sve/tan.c @@ -0,0 +1,131 @@ +/* + * Double-precision SVE tan(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + double c2, c4, c6, c8; + double poly_1357[4]; + double c0, inv_half_pi; + double half_pi_hi, half_pi_lo, range_val; +} data = { + /* Polynomial generated with FPMinimax. */ + .c2 = 0x1.ba1ba1bb46414p-5, + .c4 = 0x1.226e5e5ecdfa3p-7, + .c6 = 0x1.7ea75d05b583ep-10, + .c8 = 0x1.4e4fd14147622p-12, + .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6, + 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 }, + .c0 = 0x1.5555555555556p-2, + .inv_half_pi = 0x1.45f306dc9c883p-1, + .half_pi_hi = 0x1.921fb54442d18p0, + .half_pi_lo = 0x1.1a62633145c07p-54, + .range_val = 0x1p23, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg, + svbool_t special) +{ + svbool_t use_recip = svcmpeq ( + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); + svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d); + return sv_call_f64 (tan, x, y, special); +} + +/* Vector approximation for double-precision tan. + Maximum measured error is 3.48 ULP: + _ZGVsMxv_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) +{ + const struct data *dat = ptr_barrier (&data); + svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0); + /* q = nearest integer to 2 * x / pi. */ + svfloat64_t q = svmul_lane (x, half_pi_c0, 1); + q = svrinta_x (pg, q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + svfloat64_t r = x; + svfloat64_t half_pi = svld1rq (svptrue_b64 (), &dat->half_pi_hi); + r = svmls_lane (r, q, half_pi, 0); + r = svmls_lane (r, q, half_pi, 1); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. */ + r = svmul_x (svptrue_b64 (), r, 0.5); + + /* Approximate tan(r) using order 8 polynomial. 
+     tan(x) is odd, so the polynomial has the form:
+     tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+     Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+     Then compute the approximation by:
+     tan(r) ~= r + r^3 * (C0 + r^2 * P(r)).  */
+
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2);
+  svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4);
+  /* Use the coefficient array offset by 1 to evaluate from C1 onwards.  */
+  svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2);
+  svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6);
+
+  svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0);
+  svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1);
+  svfloat64_t p03 = svmla_x (pg, p01, p23, r4);
+
+  svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0);
+  svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1);
+  svfloat64_t p47 = svmla_x (pg, p45, p67, r4);
+
+  svfloat64_t p = svmla_x (pg, p03, p47, r8);
+
+  svfloat64_t z = svmul_x (svptrue_b64 (), p, r);
+  z = svmul_x (svptrue_b64 (), r2, z);
+  z = svmla_lane (z, r, half_pi_c0, 0);
+  p = svmla_x (pg, r, r2, z);
+
+  /* Recombination uses double-angle formula:
+     tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
+     and reciprocity around pi/2:
+     tan(x) = 1 / (tan(pi/2 - x))
+     to assemble result using change-of-sign and conditional selection of
+     numerator/denominator dependent on odd/even-ness of q (quadrant).  */
+
+  /* Invert condition to catch NaNs and Infs as well as large values.  */
+  svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
+
+  if (unlikely (svptest_any (pg, special)))
+    {
+      return special_case (x, p, q, pg, special);
+    }
+  svbool_t use_recip = svcmpeq (
+      pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
+
+  svfloat64_t n = svmad_x (pg, p, p, -1);
+  svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
+  svfloat64_t swap = n;
+  n = svneg_m (n, use_recip, d);
+  d = svsel (use_recip, swap, d);
+  return svdiv_x (pg, n, d);
+}
+
+TEST_SIG (SV, D, 1, tan, -3.1, 3.1)
+TEST_ULP (SV_NAME_D1 (tan), 2.99)
+TEST_DISABLE_FENV (SV_NAME_D1 (tan))
+TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0, 0x1p23, 500000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0x1p23, inf, 5000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/tanf.c b/math/aarch64/sve/tanf.c
new file mode 100644
index 000000000000..d34fc2fc1a4e
--- /dev/null
+++ b/math/aarch64/sve/tanf.c
@@ -0,0 +1,117 @@
+/*
+ * Single-precision vector tan(x) function.
+ *
+ * Copyright (c) 2020-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float pio2_1, pio2_2, pio2_3, invpio2;
+  float c1, c3, c5;
+  float c0, c2, c4, range_val, shift;
+} data = {
+  /* Coefficients generated using:
+     poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2),
+                      deg,
+                      [|single ...|],
+                      [a*a;b*b]);
+     optimize relative error
+     final prec : 23 bits
+     deg : 5
+     a : 0x1p-126 ^ 2
+     b : ((pi) / 0x1p2) ^ 2
+     dirty rel error: 0x1.f7c2e4p-25
+     dirty abs error: 0x1.f7c2ecp-25.  */
+  .c0 = 0x1.55555p-2,         .c1 = 0x1.11166p-3,
+  .c2 = 0x1.b88a78p-5,        .c3 = 0x1.7b5756p-6,
+  .c4 = 0x1.4ef4cep-8,        .c5 = 0x1.0e1e74p-7,
+
+  .pio2_1 = 0x1.921fb6p+0f,   .pio2_2 = -0x1.777a5cp-25f,
+  .pio2_3 = -0x1.ee59dap-50f, .invpio2 = 0x1.45f306p-1f,
+  .range_val = 0x1p15f,       .shift = 0x1.8p+23f
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (tanf, x, y, cmp);
+}
+
+/* Fast implementation of SVE tanf.
+   Maximum error is 3.45 ULP:
+   SV_NAME_F1 (tan)(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+                                    want 0x1.ff9850p-1.  */
+svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1);
+  svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1);
+
+  /* n = rint(x/(pi/2)).  */
+  svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3));
+  /* n is already a signed integer, simply convert it.  */
+  svint32_t in = svcvt_s32_x (pg, n);
+  /* Determine if x lives in an interval where |tan(x)| grows to infinity.  */
+  svint32_t alt = svand_x (pg, in, 1);
+  svbool_t pred_alt = svcmpne (pg, alt, 0);
+  /* r = x - n * (pi/2) (range reduction into 0 .. pi/4).  */
+  svfloat32_t r;
+  r = svmls_lane (x, n, pi_vals, 0);
+  r = svmls_lane (r, n, pi_vals, 1);
+  r = svmls_lane (r, n, pi_vals, 2);
+
+  /* If x lives in an interval where |tan(x)|
+     - is finite, then use a polynomial approximation of the form
+       tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
+     - grows to infinity then use symmetries of tangent and the identity
+       tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use
+       the same polynomial approximation of tan as above.  */
+
+  /* Perform additional reduction if required.  */
+  svfloat32_t z = svneg_m (r, pred_alt, r);
+
+  /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4],
+     using Estrin on z^2.  */
+  svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r);
+  svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0);
+  svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1);
+  svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2);
+
+  svfloat32_t z4 = svmul_x (pg, z2, z2);
+  svfloat32_t p = svmla_x (pg, p01, z4, p23);
+
+  svfloat32_t z8 = svmul_x (pg, z4, z4);
+  p = svmla_x (pg, p, z8, p45);
+
+  svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2));
+
+  /* Determine whether input is too large to use the fast path.  */
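+  /* Lanes with |x| >= range_val (0x1p15) fall back to the scalar routine,
+     since the three-term pi/2 reduction above is not accurate enough for
+     them.  */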
+  svbool_t cmp = svacge (pg, x, d->range_val);
+
+  /* Transform the result back, if necessary.  */
+  svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
+
+  /* No need to pass pg to special_case here since cmp is a strict subset
+     of pg, guaranteed by the svacge above.  */
+  if (unlikely (svptest_any (pg, cmp)))
+    return special_case (x, svsel (pred_alt, inv_y, y), cmp);
+
+  return svsel (pred_alt, inv_y, y);
+}
+
+TEST_SIG (SV, F, 1, tan, -3.1, 3.1)
+TEST_ULP (SV_NAME_F1 (tan), 2.96)
+TEST_DISABLE_FENV (SV_NAME_F1 (tan))
+TEST_INTERVAL (SV_NAME_F1 (tan), -0.0, -0x1p126, 100)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-149, 0x1p-126, 4000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-126, 0x1p-23, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-23, 0.7, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0.7, 1.5, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 1.5, 100, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 100, 0x1p17, 50000)
+TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p17, inf, 50000)
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/tanh.c b/math/aarch64/sve/tanh.c
new file mode 100644
index 000000000000..41f64cb4b2c7
--- /dev/null
+++ b/math/aarch64/sve/tanh.c
@@ -0,0 +1,98 @@
+/*
+ * Double-precision SVE tanh(x) function.
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "sv_poly_f64.h"
+#include "mathlib.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+static const struct data
+{
+  float64_t poly[11];
+  float64_t inv_ln2, ln2_hi, ln2_lo, shift;
+  uint64_t thresh, tiny_bound;
+} data = {
+  /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2].  */
+  .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+            0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+            0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+            0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+            0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+  .inv_ln2 = 0x1.71547652b82fep0,
+  .ln2_hi = -0x1.62e42fefa39efp-1,
+  .ln2_lo = -0x1.abc9e3b39803fp-56,
+  .shift = 0x1.8p52,
+
+  .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27).  */
+  /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound).  */
+  .thresh = 0x01f241bf835f9d5f,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+  /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+     the scalar variant of tanh.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  svfloat64_t j
+      = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+  svint64_t i = svcvt_s64_x (pg, j);
+  svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
+  f = svmla_x (pg, f, j, d->ln2_lo);
+
+  /* Approximate expm1(f) using polynomial.  */
+  svfloat64_t f2 = svmul_x (pg, f, f);
+  svfloat64_t f4 = svmul_x (pg, f2, f2);
+  svfloat64_t p = svmla_x (
+      pg, f, f2,
+      sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
+
+  /* t = 2 ^ i.  */
+  svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+  /* expm1(x) = p * t + (t - 1).  */
+  return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+  return sv_call_f64 (tanh, x, y, special);
+}
+
+/* SVE approximation for double-precision tanh(x), using a simplified
+   version of expm1. The greatest observed error is 2.77 ULP:
+   _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+                                       want -0x1.bd6a21a163624p-3.  */
+svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+
+  /* Trigger special-cases for tiny, boring and infinity/NaN.
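+     A single unsigned comparison catches all three: ia - tiny_bound wraps
+     around for tiny |x| < 0x1p-27, and exceeds thresh both for |x| above
+     0x1.241bf835f9d5fp+4, where tanh(x) is effectively +/-1, and for
+     infinity/NaN.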
*/ + svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh); + + svfloat64_t u = svadd_x (pg, x, x); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + svfloat64_t q = expm1_inline (u, pg, d); + svfloat64_t qp2 = svadd_x (pg, q, 2); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, svdiv_x (pg, q, qp2), special); + return svdiv_x (pg, q, qp2); +} + +TEST_SIG (SV, D, 1, tanh, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (tanh), 2.27) +TEST_DISABLE_FENV (SV_NAME_D1 (tanh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0, 0x1p-27, 5000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/tanhf.c b/math/aarch64/sve/tanhf.c new file mode 100644 index 000000000000..9007e7badb0d --- /dev/null +++ b/math/aarch64/sve/tanhf.c @@ -0,0 +1,68 @@ +/* + * Single-precision SVE tanh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_expm1f_inline.h" + +/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */ +#define BoringBound 0x1.205966p+3f + +static const struct data +{ + struct sv_expm1f_data expm1f_consts; + uint32_t onef, special_bound; + float boring_bound; +} data = { + .expm1f_consts = SV_EXPM1F_DATA, + .onef = 0x3f800000, + .special_bound = 0x7f800000, + .boring_bound = BoringBound, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring, + svfloat32_t boring, svfloat32_t q, svbool_t special) +{ + svfloat32_t y + = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0))); + return sv_call_f32 (tanhf, x, y, special); +} + +/* Approximation for single-precision SVE tanh(x), using a simplified + version of expm1f. The maximum error is 2.57 ULP: + _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5 + want 0x1.fb71aap-5. */ +svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef)); + svbool_t special = svcmpgt (pg, iax, d->special_bound); + svbool_t is_boring = svacgt (pg, x, d->boring_bound); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg, + &d->expm1f_consts); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, pg, is_boring, boring, q, special); + svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); + return svsel_f32 (is_boring, boring, y); +} + +TEST_SIG (SV, F, 1, tanh, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (tanh), 2.07) +TEST_DISABLE_FENV (SV_NAME_F1 (tanh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0, 0x1p-23, 1000) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1p-23, BoringBound, 100000) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), BoringBound, inf, 100) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/tanpi.c b/math/aarch64/sve/tanpi.c new file mode 100644 index 000000000000..d9e7d2487d53 --- /dev/null +++ b/math/aarch64/sve/tanpi.c @@ -0,0 +1,89 @@ +/* + * Double-precision vector tanpi(x) function. + * + * Copyright (c) 2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_sig.h"
+#include "test_defs.h"
+
+const static struct v_tanpi_data
+{
+  double c0, c2, c4, c6, c8, c10, c12;
+  double c1, c3, c5, c7, c9, c11, c13, c14;
+} tanpi_data = {
+  /* Coefficients for tan(pi * x) computed with fpminimax
+     on [ 0x1p-1022 0x1p-2 ]
+     approx rel error: 0x1.7eap-55
+     approx abs error: 0x1.7eap-55.  */
+  .c0 = 0x1.921fb54442d18p1, /* pi.  */
+  .c1 = 0x1.4abbce625be52p3,     .c2 = 0x1.466bc6775b0f9p5,
+  .c3 = 0x1.45fff9b426f5ep7,     .c4 = 0x1.45f4730dbca5cp9,
+  .c5 = 0x1.45f3265994f85p11,    .c6 = 0x1.45f4234b330cap13,
+  .c7 = 0x1.45dca11be79ebp15,    .c8 = 0x1.47283fc5eea69p17,
+  .c9 = 0x1.3a6d958cdefaep19,    .c10 = 0x1.927896baee627p21,
+  .c11 = -0x1.89333f6acd922p19,  .c12 = 0x1.5d4e912bb8456p27,
+  .c13 = -0x1.a854d53ab6874p29,  .c14 = 0x1.1b76de7681424p32,
+};
+
+/* Approximation for double-precision vector tanpi(x)
+   The maximum error is 3.06 ULP:
+   _ZGVsMxv_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3
+                                       want -0x1.fa30112702c95p+3.  */
+svfloat64_t SV_NAME_D1 (tanpi) (svfloat64_t x, const svbool_t pg)
+{
+  const struct v_tanpi_data *d = ptr_barrier (&tanpi_data);
+
+  svfloat64_t n = svrintn_x (pg, x);
+
+  /* inf produces nan that propagates.  */
+  svfloat64_t xr = svsub_x (pg, x, n);
+  svfloat64_t ar = svabd_x (pg, x, n);
+  svbool_t flip = svcmpgt (pg, ar, 0.25);
+  svfloat64_t r = svsel (flip, svsubr_x (pg, ar, 0.5), ar);
+
+  /* Order-14 pairwise Horner.  */
+  svfloat64_t r2 = svmul_x (pg, r, r);
+  svfloat64_t r4 = svmul_x (pg, r2, r2);
+
+  svfloat64_t c_1_3 = svld1rq (pg, &d->c1);
+  svfloat64_t c_5_7 = svld1rq (pg, &d->c5);
+  svfloat64_t c_9_11 = svld1rq (pg, &d->c9);
+  svfloat64_t c_13_14 = svld1rq (pg, &d->c13);
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r2, c_1_3, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r2, c_1_3, 1);
+  svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r2, c_5_7, 0);
+  svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), r2, c_5_7, 1);
+  svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), r2, c_9_11, 0);
+  svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), r2, c_9_11, 1);
+  svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), r2, c_13_14, 0);
+
+  svfloat64_t p = svmla_lane (p1213, r4, c_13_14, 1);
+  p = svmad_x (pg, p, r4, p1011);
+  p = svmad_x (pg, p, r4, p89);
+  p = svmad_x (pg, p, r4, p67);
+  p = svmad_x (pg, p, r4, p45);
+  p = svmad_x (pg, p, r4, p23);
+  p = svmad_x (pg, p, r4, p01);
+  p = svmul_x (pg, r, p);
+
+  svfloat64_t p_recip = svdivr_x (pg, p, 1.0);
+  svfloat64_t y = svsel (flip, p_recip, p);
+
+  svuint64_t sign
+      = sveor_x (pg, svreinterpret_u64 (xr), svreinterpret_u64 (ar));
+  return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (SV_NAME_D1 (tanpi))
+TEST_ULP (SV_NAME_D1 (tanpi), 2.57)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0.5, 1.0, 200000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 1.0, 0x1p23, 50000)
+TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0x1p23, inf, 50000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/sve/tanpif.c b/math/aarch64/sve/tanpif.c
new file mode 100644
index 000000000000..2ba968a799fe
--- /dev/null
+++ b/math/aarch64/sve/tanpif.c
@@ -0,0 +1,68 @@
+/*
+ * Single-precision vector tanpif(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "test_defs.h"
+#include "test_sig.h"
+
+const static struct v_tanpif_data
+{
+  float c0, c2, c4, c6;
+  float c1, c3, c5, c7;
+} tanpif_data = {
+  /* Coefficients for tan(pi * x).  */
+  .c0 = 0x1.921fb4p1f,  .c1 = 0x1.4abbcep3f, .c2 = 0x1.466b8p5f,
+  .c3 = 0x1.461c72p7f,  .c4 = 0x1.42e9d4p9f, .c5 = 0x1.69e2c4p11f,
+  .c6 = 0x1.e85558p11f, .c7 = 0x1.a52e08p16f,
+};
+
+/* Approximation for single-precision vector tanpif(x)
+   The maximum error is 3.34 ULP:
+   _ZGVsMxv_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
+                                 want 0x1.f70aa6p+2.  */
+svfloat32_t SV_NAME_F1 (tanpi) (svfloat32_t x, const svbool_t pg)
+{
+  const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
+  svfloat32_t odd_coeffs = svld1rq (pg, &d->c1);
+  svfloat32_t n = svrintn_x (pg, x);
+
+  /* inf produces nan that propagates.  */
+  svfloat32_t xr = svsub_x (pg, x, n);
+  svfloat32_t ar = svabd_x (pg, x, n);
+  svbool_t flip = svcmpgt (pg, ar, 0.25f);
+  svfloat32_t r = svsel (flip, svsub_x (pg, sv_f32 (0.5f), ar), ar);
+
+  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t r4 = svmul_x (pg, r2, r2);
+
+  /* Order-7 pairwise Horner.  */
+  svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), r2, odd_coeffs, 0);
+  svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), r2, odd_coeffs, 1);
+  svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), r2, odd_coeffs, 2);
+  svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), r2, odd_coeffs, 3);
+  svfloat32_t p = svmad_x (pg, p67, r4, p45);
+  p = svmad_x (pg, p, r4, p23);
+  p = svmad_x (pg, p, r4, p01);
+  svfloat32_t poly = svmul_x (pg, r, p);
+
+  svfloat32_t poly_recip = svdiv_x (pg, sv_f32 (1.0), poly);
+  svfloat32_t y = svsel (flip, poly_recip, poly);
+
+  svuint32_t sign
+      = sveor_x (pg, svreinterpret_u32 (xr), svreinterpret_u32 (ar));
+  return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
+
+#if WANT_TRIGPI_TESTS
+TEST_DISABLE_FENV (SV_NAME_F1 (tanpi))
+TEST_ULP (SV_NAME_F1 (tanpi), 2.84)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
+TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
+#endif
+CLOSE_SVE_ATTR
diff --git a/math/aarch64/tanpi_2u5.c b/math/aarch64/tanpi_2u5.c
new file mode 100644
index 000000000000..154b9faf454d
--- /dev/null
+++ b/math/aarch64/tanpi_2u5.c
@@ -0,0 +1,158 @@
+/*
+ * Double-precision scalar tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f64.h"
+
+#define SIGN_MASK 0x8000000000000000
+
+const static struct tanpi_data
+{
+  double tan_poly[14], cot_poly[9], pi, invpi;
+} tanpi_data = {
+  /* Coefficients for tan(pi * x).  */
+  .tan_poly = {
+    0x1.4abbce625be52p3,
+    0x1.466bc6775b0f9p5,
+    0x1.45fff9b426f5ep7,
+    0x1.45f4730dbca5cp9,
+    0x1.45f3265994f85p11,
+    0x1.45f4234b330cap13,
+    0x1.45dca11be79ebp15,
+    0x1.47283fc5eea69p17,
+    0x1.3a6d958cdefaep19,
+    0x1.927896baee627p21,
+    -0x1.89333f6acd922p19,
+    0x1.5d4e912bb8456p27,
+    -0x1.a854d53ab6874p29,
+    0x1.1b76de7681424p32,
+  },
+  /* Coefficients for cot(pi * x).  */
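+  /* cot(pi * r) is evaluated below as invpi / r + r * P(r^2), so the
+     polynomial carries only the remainder beyond the 1/(pi * r) pole.  */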
+  .cot_poly = {
+    -0x1.0c152382d7366p0,
+    -0x1.60c8539c1d316p-1,
+    -0x1.4b9a2f3516354p-1,
+    -0x1.47474060b6ba8p-1,
+    -0x1.464633ad9dcb1p-1,
+    -0x1.45ff229d7edd6p-1,
+    -0x1.46d8dbf492923p-1,
+    -0x1.3873892311c6bp-1,
+    -0x1.b2f3d0ff96d73p-1,
+  },
+  .pi = 0x1.921fb54442d18p1,
+  .invpi = 0x1.45f306dc9c883p-2,
+};
+
+/* Double-precision scalar tanpi(x) implementation.
+   Maximum error 2.19 ULP:
+   tanpi(0x1.68847e177a855p-2) got 0x1.fe9a0ff9bb9d7p+0
+                              want 0x1.fe9a0ff9bb9d5p+0.  */
+double
+arm_math_tanpi (double x)
+{
+  uint64_t xabs_12 = asuint64 (x) >> 52 & 0x7ff;
+
+  /* x >= 0x1p54.  */
+  if (unlikely (xabs_12 >= 0x434))
+    {
+      /* tanpi(+/-inf) and tanpi(+/-nan) = nan.  */
+      if (unlikely (xabs_12 == 0x7ff))
+	{
+	  return __math_invalid (x);
+	}
+
+      uint64_t x_sign = asuint64 (x) & SIGN_MASK;
+      return asdouble (x_sign);
+    }
+
+  const struct tanpi_data *d = ptr_barrier (&tanpi_data);
+
+  double rounded = round (x);
+  if (unlikely (rounded == x))
+    {
+      /* If x == 0, return with sign.  */
+      if (x == 0)
+	{
+	  return x;
+	}
+      /* Otherwise, return zero with alternating sign.  */
+      int64_t m = (int64_t) rounded;
+      if (x < 0)
+	{
+	  return m & 1 ? 0.0 : -0.0;
+	}
+      else
+	{
+	  return m & 1 ? -0.0 : 0.0;
+	}
+    }
+
+  double x_reduced = x - rounded;
+  double abs_x_reduced = 0.5 - fabs (x_reduced);
+
+  /* Prevent underflow exceptions. x <= 0x1p-63.  */
+  if (unlikely (xabs_12 < 0x3c0))
+    {
+      return d->pi * x;
+    }
+
+  double result, offset, scale;
+
+  /* Test 0.25 < abs_x < 0.5 independently of abs_x_reduced.  */
+  double x2 = x + x;
+  int64_t rounded_x2 = (int64_t) round (x2);
+  if (rounded_x2 & 1)
+    {
+      double r_x = abs_x_reduced;
+
+      double r_x2 = r_x * r_x;
+      double r_x4 = r_x2 * r_x2;
+
+      uint64_t sign = asuint64 (x_reduced) & SIGN_MASK;
+      r_x = asdouble (asuint64 (r_x) ^ sign);
+
+      /* Calculate the sign for half-integer inputs, where the result is
+	 +/-inf.  */
+      uint64_t is_finite = asuint64 (abs_x_reduced);
+      uint64_t is_odd = (rounded_x2 & 2) << 62;
+      uint64_t is_neg = rounded_x2 & SIGN_MASK;
+      uint64_t keep_sign = is_finite | (is_odd ^ is_neg);
+      offset = d->invpi / (keep_sign ? r_x : -r_x);
+      scale = r_x;
+
+      result = pw_horner_8_f64 (r_x2, r_x4, d->cot_poly);
+    }
+  else
+    {
+      double r_x2 = x_reduced * x_reduced;
+      double r_x4 = r_x2 * r_x2;
+
+      offset = d->pi * x_reduced;
+      scale = x_reduced * r_x2;
+
+      result = pw_horner_13_f64 (r_x2, r_x4, d->tan_poly);
+    }
+
+  return fma (scale, result, offset);
+}
+
+#if WANT_EXPERIMENTAL_MATH
+double
+tanpi (double x)
+{
+  return arm_math_tanpi (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_tanpi, 1.69)
+TEST_SYM_INTERVAL (arm_math_tanpi, 0, 0x1p-63, 50000)
+TEST_SYM_INTERVAL (arm_math_tanpi, 0x1p-63, 0.5, 100000)
+TEST_SYM_INTERVAL (arm_math_tanpi, 0.5, 0x1p53, 100000)
+TEST_SYM_INTERVAL (arm_math_tanpi, 0x1p53, inf, 100000)
+#endif
diff --git a/math/aarch64/tanpif_3u1.c b/math/aarch64/tanpif_3u1.c
new file mode 100644
index 000000000000..8cd66594c290
--- /dev/null
+++ b/math/aarch64/tanpif_3u1.c
@@ -0,0 +1,145 @@
+/*
+ * Single-precision scalar tanpi(x) function.
+ *
+ * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "mathlib.h"
+#include "math_config.h"
+#include "test_sig.h"
+#include "test_defs.h"
+#include "poly_scalar_f32.h"
+
+const static struct tanpif_data
+{
+  float tan_poly[6], cot_poly[4], pi, invpi;
+} tanpif_data = {
+  /* Coefficients for tan(pi * x).  */
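+  /* tan(pi * r) is evaluated below as pi * r + r^3 * P(r^2), with the
+     pi * r term supplied separately as the fma offset.  */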
+  .tan_poly = {
+    0x1.4abbc8p3,
+    0x1.467284p5,
+    0x1.44cf12p7,
+    0x1.596b5p9,
+    0x1.753858p10,
+    0x1.76ff52p14,
+  },
+  /* Coefficients for cot(pi * x).  */
+  .cot_poly = {
+    -0x1.0c1522p0,
+    -0x1.60ce32p-1,
+    -0x1.49cd42p-1,
+    -0x1.73f786p-1,
+  },
+  .pi = 0x1.921fb6p1f,
+  .invpi = 0x1.45f308p-2f,
+};
+
+/* Single-precision scalar tanpi(x) implementation.
+   Maximum error 2.56 ULP:
+   tanpif(0x1.4bf948p-1) got -0x1.fcc9ep+0
+                        want -0x1.fcc9e6p+0.  */
+float
+arm_math_tanpif (float x)
+{
+  uint32_t xabs_12 = asuint (x) >> 20 & 0x7f8;
+
+  /* x >= 0x1p24f.  */
+  if (unlikely (xabs_12 >= 0x4b1))
+    {
+      /* tanpif(+/-inf) and tanpif(+/-nan) = nan.  */
+      if (unlikely (xabs_12 == 0x7f8))
+	{
+	  return __math_invalidf (x);
+	}
+
+      uint32_t x_sign = asuint (x) & 0x80000000;
+      return asfloat (x_sign);
+    }
+
+  const struct tanpif_data *d = ptr_barrier (&tanpif_data);
+
+  /* Prevent underflow exceptions. x <= 0x1p-31.  */
+  if (unlikely (xabs_12 < 0x300))
+    {
+      return d->pi * x;
+    }
+
+  float rounded = roundf (x);
+  if (unlikely (rounded == x))
+    {
+      /* If x == 0, return with sign.  */
+      if (x == 0)
+	{
+	  return x;
+	}
+      /* Otherwise, return zero with alternating sign.  */
+      int32_t m = (int32_t) rounded;
+      if (x < 0)
+	{
+	  return m & 1 ? 0.0f : -0.0f;
+	}
+      else
+	{
+	  return m & 1 ? -0.0f : 0.0f;
+	}
+    }
+
+  float x_reduced = x - rounded;
+  float abs_x_reduced = 0.5f - asfloat (asuint (x_reduced) & 0x7fffffff);
+
+  float result, offset, scale;
+
+  /* Test 0.25 < abs_x < 0.5 independently of abs_x_reduced.  */
+  float x2 = x + x;
+  int32_t rounded_x2 = (int32_t) roundf (x2);
+  if (rounded_x2 & 1)
+    {
+      float r_x = abs_x_reduced;
+
+      float r_x2 = r_x * r_x;
+      float r_x4 = r_x2 * r_x2;
+
+      uint32_t sign = asuint (x_reduced) & 0x80000000;
+      r_x = asfloat (asuint (r_x) ^ sign);
+
+      /* Calculate the sign for half-integer inputs, where the result is
+	 +/-inf.  */
+      uint32_t is_finite = asuint (abs_x_reduced);
+      uint32_t is_odd = (rounded_x2 & 2) << 30;
+      uint32_t is_neg = rounded_x2 & 0x80000000;
+      uint32_t keep_sign = is_finite | (is_odd ^ is_neg);
+      offset = d->invpi / (keep_sign ? r_x : -r_x);
+      scale = r_x;
+
+      result = pairwise_poly_3_f32 (r_x2, r_x4, d->cot_poly);
+    }
+  else
+    {
+      float r_x = x_reduced;
+
+      float r_x2 = r_x * r_x;
+      float r_x4 = r_x2 * r_x2;
+
+      offset = d->pi * r_x;
+      scale = r_x * r_x2;
+
+      result = pw_horner_5_f32 (r_x2, r_x4, d->tan_poly);
+    }
+
+  return fmaf (scale, result, offset);
+}
+
+#if WANT_EXPERIMENTAL_MATH
+float
+tanpif (float x)
+{
+  return arm_math_tanpif (x);
+}
+#endif
+
+#if WANT_TRIGPI_TESTS
+TEST_ULP (arm_math_tanpif, 2.57)
+TEST_SYM_INTERVAL (arm_math_tanpif, 0, 0x1p-31f, 50000)
+TEST_SYM_INTERVAL (arm_math_tanpif, 0x1p-31f, 0.5, 100000)
+TEST_SYM_INTERVAL (arm_math_tanpif, 0.5, 0x1p23f, 100000)
+TEST_SYM_INTERVAL (arm_math_tanpif, 0x1p23f, inf, 100000)
+#endif
diff --git a/math/aarch64/v_erf_data.c b/math/aarch64/v_erf_data.c
new file mode 100644
index 000000000000..5400d6b8d0e3
--- /dev/null
+++ b/math/aarch64/v_erf_data.c
@@ -0,0 +1,788 @@
+/*
+ * Data for approximation of erf.
+ *
+ * Copyright (c) 2023-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Lookup table used in vector erf.
+   For each possible rounded input r (multiples of 1/128), between
+   r = 0.0 and r = 6.0 (769 values):
+   - the first entry __v_erf_data.tab.erf contains the values of erf(r),
+   - the second entry __v_erf_data.tab.scale contains the values of
+   2/sqrt(pi)*exp(-r^2).
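+   Entry i thus corresponds to r = i / 128, with the index obtained by
+   rounding 128 * |x| to the nearest integer.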
Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512. */ +const struct v_erf_data __v_erf_data = { + .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 }, + { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 }, + { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 }, + { 0x1.b137e0cf584dcp-6, 0x1.20b4d8bac36c1p+0 }, + { 0x1.20c5645dd2538p-5, 0x1.209546ad13ccfp+0 }, + { 0x1.68e5d3bbc9526p-5, 0x1.206cb4897b148p+0 }, + { 0x1.b0fafef135745p-5, 0x1.203b261cd0052p+0 }, + { 0x1.f902a77bd3821p-5, 0x1.2000a00ae3804p+0 }, + { 0x1.207d480e90658p-4, 0x1.1fbd27cdc72d3p+0 }, + { 0x1.44703e87e8593p-4, 0x1.1f70c3b4f2cc7p+0 }, + { 0x1.68591a1e83b5dp-4, 0x1.1f1b7ae44867fp+0 }, + { 0x1.8c36beb8a8d23p-4, 0x1.1ebd5552f795bp+0 }, + { 0x1.b0081148a873ap-4, 0x1.1e565bca400d4p+0 }, + { 0x1.d3cbf7e70a4b3p-4, 0x1.1de697e413d28p+0 }, + { 0x1.f78159ec8bb50p-4, 0x1.1d6e14099944ap+0 }, + { 0x1.0d939005f65e5p-3, 0x1.1cecdb718d61cp+0 }, + { 0x1.1f5e1a35c3b89p-3, 0x1.1c62fa1e869b6p+0 }, + { 0x1.311fc15f56d14p-3, 0x1.1bd07cdd189acp+0 }, + { 0x1.42d7fc2f64959p-3, 0x1.1b357141d95d5p+0 }, + { 0x1.548642321d7c6p-3, 0x1.1a91e5a748165p+0 }, + { 0x1.662a0bdf7a89fp-3, 0x1.19e5e92b964abp+0 }, + { 0x1.77c2d2a765f9ep-3, 0x1.19318bae53a04p+0 }, + { 0x1.895010fdbdbfdp-3, 0x1.1874ddcdfce24p+0 }, + { 0x1.9ad142662e14dp-3, 0x1.17aff0e56ec10p+0 }, + { 0x1.ac45e37fe2526p-3, 0x1.16e2d7093cd8cp+0 }, + { 0x1.bdad72110a648p-3, 0x1.160da304ed92fp+0 }, + { 0x1.cf076d1233237p-3, 0x1.153068581b781p+0 }, + { 0x1.e05354b96ff36p-3, 0x1.144b3b337c90cp+0 }, + { 0x1.f190aa85540e2p-3, 0x1.135e3075d076bp+0 }, + { 0x1.015f78a3dcf3dp-2, 0x1.12695da8b5bdep+0 }, + { 0x1.09eed6982b948p-2, 0x1.116cd8fd67618p+0 }, + { 0x1.127631eb8de32p-2, 0x1.1068b94962e5ep+0 }, + { 0x1.1af54e232d609p-2, 0x1.0f5d1602f7e41p+0 }, + { 0x1.236bef825d9a2p-2, 0x1.0e4a073dc1b91p+0 }, + { 0x1.2bd9db0f7827fp-2, 0x1.0d2fa5a70c168p+0 }, + { 0x1.343ed6989b7d9p-2, 0x1.0c0e0a8223359p+0 }, + { 0x1.3c9aa8b84bedap-2, 0x1.0ae54fa490722p+0 }, + { 0x1.44ed18d9f6462p-2, 0x1.09b58f724416bp+0 }, + { 0x1.4d35ef3e5372ep-2, 0x1.087ee4d9ad247p+0 }, + { 0x1.5574f4ffac98ep-2, 0x1.07416b4fbfe7cp+0 }, + { 0x1.5da9f415ff23fp-2, 0x1.05fd3ecbec297p+0 }, + { 0x1.65d4b75b00471p-2, 0x1.04b27bc403d30p+0 }, + { 0x1.6df50a8dff772p-2, 0x1.03613f2812dafp+0 }, + { 0x1.760aba57a76bfp-2, 0x1.0209a65e29545p+0 }, + { 0x1.7e15944d9d3e4p-2, 0x1.00abcf3e187a9p+0 }, + { 0x1.861566f5fd3c0p-2, 0x1.fe8fb01a47307p-1 }, + { 0x1.8e0a01cab516bp-2, 0x1.fbbbbef34b4b2p-1 }, + { 0x1.95f3353cbb146p-2, 0x1.f8dc092d58ff8p-1 }, + { 0x1.9dd0d2b721f39p-2, 0x1.f5f0cdaf15313p-1 }, + { 0x1.a5a2aca209394p-2, 0x1.f2fa4c16c0019p-1 }, + { 0x1.ad68966569a87p-2, 0x1.eff8c4b1375dbp-1 }, + { 0x1.b522646bbda68p-2, 0x1.ecec7870ebca7p-1 }, + { 0x1.bccfec24855b8p-2, 0x1.e9d5a8e4c934ep-1 }, + { 0x1.c4710406a65fcp-2, 0x1.e6b4982f158b9p-1 }, + { 0x1.cc058392a6d2dp-2, 0x1.e38988fc46e72p-1 }, + { 0x1.d38d4354c3bd0p-2, 0x1.e054be79d3042p-1 }, + { 0x1.db081ce6e2a48p-2, 0x1.dd167c4cf9d2ap-1 }, + { 0x1.e275eaf25e458p-2, 0x1.d9cf06898cdafp-1 }, + { 0x1.e9d68931ae650p-2, 0x1.d67ea1a8b5368p-1 }, + { 0x1.f129d471eabb1p-2, 0x1.d325927fb9d89p-1 }, + { 0x1.f86faa9428f9dp-2, 0x1.cfc41e36c7df9p-1 }, + { 0x1.ffa7ea8eb5fd0p-2, 0x1.cc5a8a3fbea40p-1 }, + { 0x1.03693a371519cp-1, 0x1.c8e91c4d01368p-1 }, + { 0x1.06f794ab2cae7p-1, 0x1.c5701a484ef9dp-1 }, + { 0x1.0a7ef5c18edd2p-1, 0x1.c1efca49a5011p-1 }, + { 0x1.0dff4f247f6c6p-1, 0x1.be68728e29d5dp-1 }, + { 0x1.1178930ada115p-1, 0x1.bada596f25436p-1 }, + { 0x1.14eab43841b55p-1, 
0x1.b745c55905bf8p-1 }, + { 0x1.1855a5fd3dd50p-1, 0x1.b3aafcc27502ep-1 }, + { 0x1.1bb95c3746199p-1, 0x1.b00a46237d5bep-1 }, + { 0x1.1f15cb50bc4dep-1, 0x1.ac63e7ecc1411p-1 }, + { 0x1.226ae840d4d70p-1, 0x1.a8b8287ec6a09p-1 }, + { 0x1.25b8a88b6dd7fp-1, 0x1.a5074e2157620p-1 }, + { 0x1.28ff0240d52cdp-1, 0x1.a1519efaf889ep-1 }, + { 0x1.2c3debfd7d6c1p-1, 0x1.9d97610879642p-1 }, + { 0x1.2f755ce9a21f4p-1, 0x1.99d8da149c13fp-1 }, + { 0x1.32a54cb8db67bp-1, 0x1.96164fafd8de3p-1 }, + { 0x1.35cdb3a9a144dp-1, 0x1.925007283d7aap-1 }, + { 0x1.38ee8a84beb71p-1, 0x1.8e86458169af8p-1 }, + { 0x1.3c07ca9cb4f9ep-1, 0x1.8ab94f6caa71dp-1 }, + { 0x1.3f196dcd0f135p-1, 0x1.86e9694134b9ep-1 }, + { 0x1.42236e79a5fa6p-1, 0x1.8316d6f48133dp-1 }, + { 0x1.4525c78dd5966p-1, 0x1.7f41dc12c9e89p-1 }, + { 0x1.4820747ba2dc2p-1, 0x1.7b6abbb7aaf19p-1 }, + { 0x1.4b13713ad3513p-1, 0x1.7791b886e7403p-1 }, + { 0x1.4dfeba47f63ccp-1, 0x1.73b714a552763p-1 }, + { 0x1.50e24ca35fd2cp-1, 0x1.6fdb11b1e0c34p-1 }, + { 0x1.53be25d016a4fp-1, 0x1.6bfdf0beddaf5p-1 }, + { 0x1.569243d2b3a9bp-1, 0x1.681ff24b4ab04p-1 }, + { 0x1.595ea53035283p-1, 0x1.6441563c665d4p-1 }, + { 0x1.5c2348ecc4dc3p-1, 0x1.60625bd75d07bp-1 }, + { 0x1.5ee02e8a71a53p-1, 0x1.5c8341bb23767p-1 }, + { 0x1.61955607dd15dp-1, 0x1.58a445da7c74cp-1 }, + { 0x1.6442bfdedd397p-1, 0x1.54c5a57629db0p-1 }, + { 0x1.66e86d0312e82p-1, 0x1.50e79d1749ac9p-1 }, + { 0x1.69865ee075011p-1, 0x1.4d0a6889dfd9fp-1 }, + { 0x1.6c1c9759d0e5fp-1, 0x1.492e42d78d2c5p-1 }, + { 0x1.6eab18c74091bp-1, 0x1.4553664273d24p-1 }, + { 0x1.7131e5f496a5ap-1, 0x1.417a0c4049fd0p-1 }, + { 0x1.73b1021fc0cb8p-1, 0x1.3da26d759aef5p-1 }, + { 0x1.762870f720c6fp-1, 0x1.39ccc1b136d5ap-1 }, + { 0x1.78983697dc96fp-1, 0x1.35f93fe7d1b3dp-1 }, + { 0x1.7b00578c26037p-1, 0x1.32281e2fd1a92p-1 }, + { 0x1.7d60d8c979f7bp-1, 0x1.2e5991bd4cbfcp-1 }, + { 0x1.7fb9bfaed8078p-1, 0x1.2a8dcede3673bp-1 }, + { 0x1.820b1202f27fbp-1, 0x1.26c508f6bd0ffp-1 }, + { 0x1.8454d5f25760dp-1, 0x1.22ff727dd6f7bp-1 }, + { 0x1.8697120d92a4ap-1, 0x1.1f3d3cf9ffe5ap-1 }, + { 0x1.88d1cd474a2e0p-1, 0x1.1b7e98fe26217p-1 }, + { 0x1.8b050ef253c37p-1, 0x1.17c3b626c7a11p-1 }, + { 0x1.8d30debfc572ep-1, 0x1.140cc3173f007p-1 }, + { 0x1.8f5544bd00c04p-1, 0x1.1059ed7740313p-1 }, + { 0x1.91724951b8fc6p-1, 0x1.0cab61f084b93p-1 }, + { 0x1.9387f53df5238p-1, 0x1.09014c2ca74dap-1 }, + { 0x1.959651980da31p-1, 0x1.055bd6d32e8d7p-1 }, + { 0x1.979d67caa6631p-1, 0x1.01bb2b87c6968p-1 }, + { 0x1.999d4192a5715p-1, 0x1.fc3ee5d1524b0p-2 }, + { 0x1.9b95e8fd26abap-1, 0x1.f511a91a67d2ap-2 }, + { 0x1.9d8768656cc42p-1, 0x1.edeeee0959518p-2 }, + { 0x1.9f71ca72cffb6p-1, 0x1.e6d6ffaa65a25p-2 }, + { 0x1.a1551a16aaeafp-1, 0x1.dfca26f5bbf88p-2 }, + { 0x1.a331628a45b92p-1, 0x1.d8c8aace11e63p-2 }, + { 0x1.a506af4cc00f4p-1, 0x1.d1d2cfff91594p-2 }, + { 0x1.a6d50c20fa293p-1, 0x1.cae8d93f1d7b6p-2 }, + { 0x1.a89c850b7d54dp-1, 0x1.c40b0729ed547p-2 }, + { 0x1.aa5d265064366p-1, 0x1.bd3998457afdap-2 }, + { 0x1.ac16fc7143263p-1, 0x1.b674c8ffc6283p-2 }, + { 0x1.adca142b10f98p-1, 0x1.afbcd3afe8ab6p-2 }, + { 0x1.af767a741088bp-1, 0x1.a911f096fbc26p-2 }, + { 0x1.b11c3c79bb424p-1, 0x1.a27455e14c93cp-2 }, + { 0x1.b2bb679ead19cp-1, 0x1.9be437a7de946p-2 }, + { 0x1.b4540978921eep-1, 0x1.9561c7f23a47bp-2 }, + { 0x1.b5e62fce16095p-1, 0x1.8eed36b886d93p-2 }, + { 0x1.b771e894d602ep-1, 0x1.8886b1e5ecfd1p-2 }, + { 0x1.b8f741ef54f83p-1, 0x1.822e655b417e6p-2 }, + { 0x1.ba764a2af2b78p-1, 0x1.7be47af1f5d89p-2 }, + { 0x1.bbef0fbde6221p-1, 0x1.75a91a7f4d2edp-2 }, + { 0x1.bd61a1453ab44p-1, 0x1.6f7c69d7d3ef8p-2 }, + { 0x1.bece0d82d1a5cp-1, 
0x1.695e8cd31867ep-2 }, + { 0x1.c034635b66e23p-1, 0x1.634fa54fa285fp-2 }, + { 0x1.c194b1d49a184p-1, 0x1.5d4fd33729015p-2 }, + { 0x1.c2ef0812fc1bdp-1, 0x1.575f3483021c3p-2 }, + { 0x1.c443755820d64p-1, 0x1.517de540ce2a3p-2 }, + { 0x1.c5920900b5fd1p-1, 0x1.4babff975a04cp-2 }, + { 0x1.c6dad2829ec62p-1, 0x1.45e99bcbb7915p-2 }, + { 0x1.c81de16b14cefp-1, 0x1.4036d0468a7a2p-2 }, + { 0x1.c95b455cce69dp-1, 0x1.3a93b1998736cp-2 }, + { 0x1.ca930e0e2a825p-1, 0x1.35005285227f1p-2 }, + { 0x1.cbc54b476248dp-1, 0x1.2f7cc3fe6f423p-2 }, + { 0x1.ccf20ce0c0d27p-1, 0x1.2a09153529381p-2 }, + { 0x1.ce1962c0e0d8bp-1, 0x1.24a55399ea239p-2 }, + { 0x1.cf3b5cdaf0c39p-1, 0x1.1f518ae487dc8p-2 }, + { 0x1.d0580b2cfd249p-1, 0x1.1a0dc51a9934dp-2 }, + { 0x1.d16f7dbe41ca0p-1, 0x1.14da0a961fd14p-2 }, + { 0x1.d281c49d818d0p-1, 0x1.0fb6620c550afp-2 }, + { 0x1.d38eefdf64fddp-1, 0x1.0aa2d09497f2bp-2 }, + { 0x1.d4970f9ce00d9p-1, 0x1.059f59af7a906p-2 }, + { 0x1.d59a33f19ed42p-1, 0x1.00abff4dec7a3p-2 }, + { 0x1.d6986cfa798e7p-1, 0x1.f79183b101c5bp-3 }, + { 0x1.d791cad3eff01p-1, 0x1.edeb406d9c824p-3 }, + { 0x1.d8865d98abe01p-1, 0x1.e4652fadcb6b2p-3 }, + { 0x1.d97635600bb89p-1, 0x1.daff4969c0b04p-3 }, + { 0x1.da61623cb41e0p-1, 0x1.d1b982c501370p-3 }, + { 0x1.db47f43b2980dp-1, 0x1.c893ce1dcbef7p-3 }, + { 0x1.dc29fb60715afp-1, 0x1.bf8e1b1ca2279p-3 }, + { 0x1.dd0787a8bb39dp-1, 0x1.b6a856c3ed54fp-3 }, + { 0x1.dde0a90611a0dp-1, 0x1.ade26b7fbed95p-3 }, + { 0x1.deb56f5f12d28p-1, 0x1.a53c4135a6526p-3 }, + { 0x1.df85ea8db188ep-1, 0x1.9cb5bd549b111p-3 }, + { 0x1.e0522a5dfda73p-1, 0x1.944ec2e4f5630p-3 }, + { 0x1.e11a3e8cf4eb8p-1, 0x1.8c07329874652p-3 }, + { 0x1.e1de36c75ba58p-1, 0x1.83deeada4d25ap-3 }, + { 0x1.e29e22a89d766p-1, 0x1.7bd5c7df3fe9cp-3 }, + { 0x1.e35a11b9b61cep-1, 0x1.73eba3b5b07b7p-3 }, + { 0x1.e4121370224ccp-1, 0x1.6c205655be71fp-3 }, + { 0x1.e4c6372cd8927p-1, 0x1.6473b5b15a7a1p-3 }, + { 0x1.e5768c3b4a3fcp-1, 0x1.5ce595c455b0ap-3 }, + { 0x1.e62321d06c5e0p-1, 0x1.5575c8a468361p-3 }, + { 0x1.e6cc0709c8a0dp-1, 0x1.4e241e912c305p-3 }, + { 0x1.e7714aec96534p-1, 0x1.46f066040a832p-3 }, + { 0x1.e812fc64db369p-1, 0x1.3fda6bc016994p-3 }, + { 0x1.e8b12a44944a8p-1, 0x1.38e1fae1d6a9dp-3 }, + { 0x1.e94be342e6743p-1, 0x1.3206dceef5f87p-3 }, + { 0x1.e9e335fb56f87p-1, 0x1.2b48d9e5dea1cp-3 }, + { 0x1.ea7730ed0bbb9p-1, 0x1.24a7b84d38971p-3 }, + { 0x1.eb07e27a133aap-1, 0x1.1e233d434b813p-3 }, + { 0x1.eb9558e6b42cep-1, 0x1.17bb2c8d41535p-3 }, + { 0x1.ec1fa258c4beap-1, 0x1.116f48a6476ccp-3 }, + { 0x1.eca6ccd709544p-1, 0x1.0b3f52ce8c383p-3 }, + { 0x1.ed2ae6489ac1ep-1, 0x1.052b0b1a174eap-3 }, + { 0x1.edabfc7453e63p-1, 0x1.fe6460fef4680p-4 }, + { 0x1.ee2a1d004692cp-1, 0x1.f2a901ccafb37p-4 }, + { 0x1.eea5557137ae0p-1, 0x1.e723726b824a9p-4 }, + { 0x1.ef1db32a2277cp-1, 0x1.dbd32ac4c99b0p-4 }, + { 0x1.ef93436bc2daap-1, 0x1.d0b7a0f921e7cp-4 }, + { 0x1.f006135426b26p-1, 0x1.c5d0497c09e74p-4 }, + { 0x1.f0762fde45ee6p-1, 0x1.bb1c972f23e50p-4 }, + { 0x1.f0e3a5e1a1788p-1, 0x1.b09bfb7d11a83p-4 }, + { 0x1.f14e8211e8c55p-1, 0x1.a64de673e8837p-4 }, + { 0x1.f1b6d0fea5f4dp-1, 0x1.9c31c6df3b1b8p-4 }, + { 0x1.f21c9f12f0677p-1, 0x1.92470a61b6965p-4 }, + { 0x1.f27ff89525acfp-1, 0x1.888d1d8e510a3p-4 }, + { 0x1.f2e0e9a6a8b09p-1, 0x1.7f036c0107294p-4 }, + { 0x1.f33f7e43a706bp-1, 0x1.75a96077274bap-4 }, + { 0x1.f39bc242e43e6p-1, 0x1.6c7e64e7281cbp-4 }, + { 0x1.f3f5c1558b19ep-1, 0x1.6381e2980956bp-4 }, + { 0x1.f44d870704911p-1, 0x1.5ab342383d177p-4 }, + { 0x1.f4a31ebcd47dfp-1, 0x1.5211ebf41880bp-4 }, + { 0x1.f4f693b67bd77p-1, 0x1.499d478bca735p-4 }, + { 0x1.f547f10d60597p-1, 
0x1.4154bc68d75c3p-4 }, + { 0x1.f59741b4b97cfp-1, 0x1.3937b1b319259p-4 }, + { 0x1.f5e4907982a07p-1, 0x1.31458e6542847p-4 }, + { 0x1.f62fe80272419p-1, 0x1.297db960e4f63p-4 }, + { 0x1.f67952cff6282p-1, 0x1.21df9981f8e53p-4 }, + { 0x1.f6c0db3c34641p-1, 0x1.1a6a95b1e786fp-4 }, + { 0x1.f7068b7b10fd9p-1, 0x1.131e14fa1625dp-4 }, + { 0x1.f74a6d9a38383p-1, 0x1.0bf97e95f2a64p-4 }, + { 0x1.f78c8b812d498p-1, 0x1.04fc3a0481321p-4 }, + { 0x1.f7cceef15d631p-1, 0x1.fc4b5e32d6259p-5 }, + { 0x1.f80ba18636f07p-1, 0x1.eeea8c1b1db93p-5 }, + { 0x1.f848acb544e95p-1, 0x1.e1d4cf1e2450ap-5 }, + { 0x1.f88419ce4e184p-1, 0x1.d508f9a1ea64ep-5 }, + { 0x1.f8bdf1fb78370p-1, 0x1.c885df3451a07p-5 }, + { 0x1.f8f63e416ebffp-1, 0x1.bc4a54a84e834p-5 }, + { 0x1.f92d077f8d56dp-1, 0x1.b055303221015p-5 }, + { 0x1.f96256700da8ep-1, 0x1.a4a549829587ep-5 }, + { 0x1.f99633a838a57p-1, 0x1.993979e14fffdp-5 }, + { 0x1.f9c8a7989af0dp-1, 0x1.8e109c4622913p-5 }, + { 0x1.f9f9ba8d3c733p-1, 0x1.83298d717210ep-5 }, + { 0x1.fa2974addae45p-1, 0x1.78832c03aa2b1p-5 }, + { 0x1.fa57ddfe27376p-1, 0x1.6e1c5893c380bp-5 }, + { 0x1.fa84fe5e05c8dp-1, 0x1.63f3f5c4de13bp-5 }, + { 0x1.fab0dd89d1309p-1, 0x1.5a08e85af27e0p-5 }, + { 0x1.fadb831a9f9c3p-1, 0x1.505a174e9c929p-5 }, + { 0x1.fb04f6868a944p-1, 0x1.46e66be002240p-5 }, + { 0x1.fb2d3f20f9101p-1, 0x1.3dacd1a8d8ccdp-5 }, + { 0x1.fb54641aebbc9p-1, 0x1.34ac36ad8dafep-5 }, + { 0x1.fb7a6c834b5a2p-1, 0x1.2be38b6d92415p-5 }, + { 0x1.fb9f5f4739170p-1, 0x1.2351c2f2d1449p-5 }, + { 0x1.fbc3433260ca5p-1, 0x1.1af5d2e04f3f6p-5 }, + { 0x1.fbe61eef4cf6ap-1, 0x1.12ceb37ff9bc3p-5 }, + { 0x1.fc07f907bc794p-1, 0x1.0adb5fcfa8c75p-5 }, + { 0x1.fc28d7e4f9cd0p-1, 0x1.031ad58d56279p-5 }, + { 0x1.fc48c1d033c7ap-1, 0x1.f7182a851bca2p-6 }, + { 0x1.fc67bcf2d7b8fp-1, 0x1.e85c449e377f2p-6 }, + { 0x1.fc85cf56ecd38p-1, 0x1.da0005e5f28dfp-6 }, + { 0x1.fca2fee770c79p-1, 0x1.cc0180af00a8bp-6 }, + { 0x1.fcbf5170b578bp-1, 0x1.be5ecd2fcb5f9p-6 }, + { 0x1.fcdacca0bfb73p-1, 0x1.b1160991ff737p-6 }, + { 0x1.fcf57607a6e7cp-1, 0x1.a4255a00b9f03p-6 }, + { 0x1.fd0f5317f582fp-1, 0x1.978ae8b55ce1bp-6 }, + { 0x1.fd2869270a56fp-1, 0x1.8b44e6031383ep-6 }, + { 0x1.fd40bd6d7a785p-1, 0x1.7f5188610ddc8p-6 }, + { 0x1.fd58550773cb5p-1, 0x1.73af0c737bb45p-6 }, + { 0x1.fd6f34f52013ap-1, 0x1.685bb5134ef13p-6 }, + { 0x1.fd85621b0876dp-1, 0x1.5d55cb54cd53ap-6 }, + { 0x1.fd9ae142795e3p-1, 0x1.529b9e8cf9a1ep-6 }, + { 0x1.fdafb719e6a69p-1, 0x1.482b8455dc491p-6 }, + { 0x1.fdc3e835500b3p-1, 0x1.3e03d891b37dep-6 }, + { 0x1.fdd7790ea5bc0p-1, 0x1.3422fd6d12e2bp-6 }, + { 0x1.fdea6e062d0c9p-1, 0x1.2a875b5ffab56p-6 }, + { 0x1.fdfccb62e52d3p-1, 0x1.212f612dee7fbp-6 }, + { 0x1.fe0e9552ebdd6p-1, 0x1.181983e5133ddp-6 }, + { 0x1.fe1fcfebe2083p-1, 0x1.0f443edc5ce49p-6 }, + { 0x1.fe307f2b503d0p-1, 0x1.06ae13b0d3255p-6 }, + { 0x1.fe40a6f70af4bp-1, 0x1.fcab1483ea7fcp-7 }, + { 0x1.fe504b1d9696cp-1, 0x1.ec72615a894c4p-7 }, + { 0x1.fe5f6f568b301p-1, 0x1.dcaf3691fc448p-7 }, + { 0x1.fe6e1742f7cf6p-1, 0x1.cd5ec93c12431p-7 }, + { 0x1.fe7c466dc57a1p-1, 0x1.be7e5ac24963bp-7 }, + { 0x1.fe8a004c19ae6p-1, 0x1.b00b38d6b3575p-7 }, + { 0x1.fe97483db8670p-1, 0x1.a202bd6372dcep-7 }, + { 0x1.fea4218d6594ap-1, 0x1.94624e78e0fafp-7 }, + { 0x1.feb08f7146046p-1, 0x1.87275e3a6869dp-7 }, + { 0x1.febc950b3fa75p-1, 0x1.7a4f6aca256cbp-7 }, + { 0x1.fec835695932ep-1, 0x1.6dd7fe3358230p-7 }, + { 0x1.fed37386190fbp-1, 0x1.61beae53b72b7p-7 }, + { 0x1.fede5248e38f4p-1, 0x1.56011cc3b036dp-7 }, + { 0x1.fee8d486585eep-1, 0x1.4a9cf6bda3f4cp-7 }, + { 0x1.fef2fd00af31ap-1, 0x1.3f8ff5042a88ep-7 }, + { 0x1.fefcce6813974p-1, 
0x1.34d7dbc76d7e5p-7 }, + { 0x1.ff064b5afffbep-1, 0x1.2a727a89a3f14p-7 }, + { 0x1.ff0f766697c76p-1, 0x1.205dac02bd6b9p-7 }, + { 0x1.ff18520700971p-1, 0x1.1697560347b25p-7 }, + { 0x1.ff20e0a7ba8c2p-1, 0x1.0d1d69569b82dp-7 }, + { 0x1.ff2924a3f7a83p-1, 0x1.03ede1a45bfeep-7 }, + { 0x1.ff312046f2339p-1, 0x1.f60d8aa2a88f2p-8 }, + { 0x1.ff38d5cc4227fp-1, 0x1.e4cc4abf7d065p-8 }, + { 0x1.ff404760319b4p-1, 0x1.d4143a9dfe965p-8 }, + { 0x1.ff47772010262p-1, 0x1.c3e1a5f5c077cp-8 }, + { 0x1.ff4e671a85425p-1, 0x1.b430ecf4a83a8p-8 }, + { 0x1.ff55194fe19dfp-1, 0x1.a4fe83fb9db25p-8 }, + { 0x1.ff5b8fb26f5f6p-1, 0x1.9646f35a76623p-8 }, + { 0x1.ff61cc26c1578p-1, 0x1.8806d70b2fc36p-8 }, + { 0x1.ff67d08401202p-1, 0x1.7a3ade6c8b3e4p-8 }, + { 0x1.ff6d9e943c231p-1, 0x1.6cdfcbfc1e263p-8 }, + { 0x1.ff733814af88cp-1, 0x1.5ff2750fe7820p-8 }, + { 0x1.ff789eb6130c9p-1, 0x1.536fc18f7ce5cp-8 }, + { 0x1.ff7dd41ce2b4dp-1, 0x1.4754abacdf1dcp-8 }, + { 0x1.ff82d9e1a76d8p-1, 0x1.3b9e3f9d06e3fp-8 }, + { 0x1.ff87b1913e853p-1, 0x1.30499b503957fp-8 }, + { 0x1.ff8c5cad200a5p-1, 0x1.2553ee2a336bfp-8 }, + { 0x1.ff90dcaba4096p-1, 0x1.1aba78ba3af89p-8 }, + { 0x1.ff9532f846ab0p-1, 0x1.107a8c7323a6ep-8 }, + { 0x1.ff9960f3eb327p-1, 0x1.06918b6355624p-8 }, + { 0x1.ff9d67f51ddbap-1, 0x1.f9f9cfd9c3035p-9 }, + { 0x1.ffa14948549a7p-1, 0x1.e77448fb66bb9p-9 }, + { 0x1.ffa506302ebaep-1, 0x1.d58da68fd1170p-9 }, + { 0x1.ffa89fe5b3625p-1, 0x1.c4412bf4b8f0bp-9 }, + { 0x1.ffac17988ef4bp-1, 0x1.b38a3af2e55b4p-9 }, + { 0x1.ffaf6e6f4f5c0p-1, 0x1.a3645330550ffp-9 }, + { 0x1.ffb2a5879f35ep-1, 0x1.93cb11a30d765p-9 }, + { 0x1.ffb5bdf67fe6fp-1, 0x1.84ba3004a50d0p-9 }, + { 0x1.ffb8b8c88295fp-1, 0x1.762d84469c18fp-9 }, + { 0x1.ffbb970200110p-1, 0x1.6821000795a03p-9 }, + { 0x1.ffbe599f4f9d9p-1, 0x1.5a90b00981d93p-9 }, + { 0x1.ffc10194fcb64p-1, 0x1.4d78bba8ca5fdp-9 }, + { 0x1.ffc38fcffbb7cp-1, 0x1.40d564548fad7p-9 }, + { 0x1.ffc60535dd7f5p-1, 0x1.34a305080681fp-9 }, + { 0x1.ffc862a501fd7p-1, 0x1.28de11c5031ebp-9 }, + { 0x1.ffcaa8f4c9beap-1, 0x1.1d83170fbf6fbp-9 }, + { 0x1.ffccd8f5c66d1p-1, 0x1.128eb96be8798p-9 }, + { 0x1.ffcef371ea4d7p-1, 0x1.07fdb4dafea5fp-9 }, + { 0x1.ffd0f92cb6ba7p-1, 0x1.fb99b8b8279e1p-10 }, + { 0x1.ffd2eae369a07p-1, 0x1.e7f232d9e2630p-10 }, + { 0x1.ffd4c94d29fdbp-1, 0x1.d4fed7195d7e8p-10 }, + { 0x1.ffd6951b33686p-1, 0x1.c2b9cf7f893bfp-10 }, + { 0x1.ffd84ef9009eep-1, 0x1.b11d702b3deb1p-10 }, + { 0x1.ffd9f78c7524ap-1, 0x1.a024365f771bdp-10 }, + { 0x1.ffdb8f7605ee7p-1, 0x1.8fc8c794b03b5p-10 }, + { 0x1.ffdd1750e1220p-1, 0x1.8005f08d6f1efp-10 }, + { 0x1.ffde8fb314ebfp-1, 0x1.70d6a46e07ddap-10 }, + { 0x1.ffdff92db56e5p-1, 0x1.6235fbd7a4345p-10 }, + { 0x1.ffe1544d01ccbp-1, 0x1.541f340697987p-10 }, + { 0x1.ffe2a1988857cp-1, 0x1.468dadf4080abp-10 }, + { 0x1.ffe3e19349dc7p-1, 0x1.397ced7af2b15p-10 }, + { 0x1.ffe514bbdc197p-1, 0x1.2ce898809244ep-10 }, + { 0x1.ffe63b8c8b5f7p-1, 0x1.20cc76202c5fap-10 }, + { 0x1.ffe7567b7b5e1p-1, 0x1.15246dda49d47p-10 }, + { 0x1.ffe865fac722bp-1, 0x1.09ec86c75d497p-10 }, + { 0x1.ffe96a78a04a9p-1, 0x1.fe41cd9bb4eeep-11 }, + { 0x1.ffea645f6d6dap-1, 0x1.e97ba3b77f306p-11 }, + { 0x1.ffeb5415e7c44p-1, 0x1.d57f524723822p-11 }, + { 0x1.ffec39ff380b9p-1, 0x1.c245d4b998479p-11 }, + { 0x1.ffed167b12ac2p-1, 0x1.afc85e0f82e12p-11 }, + { 0x1.ffede9e5d3262p-1, 0x1.9e005769dbc1dp-11 }, + { 0x1.ffeeb49896c6dp-1, 0x1.8ce75e9f6f8a0p-11 }, + { 0x1.ffef76e956a9fp-1, 0x1.7c7744d9378f7p-11 }, + { 0x1.fff0312b010b5p-1, 0x1.6caa0d3582fe9p-11 }, + { 0x1.fff0e3ad91ec2p-1, 0x1.5d79eb71e893bp-11 }, + { 0x1.fff18ebe2b0e1p-1, 0x1.4ee1429bf7cc0p-11 }, + 
{ 0x1.fff232a72b48ep-1, 0x1.40daa3c89f5b6p-11 }, + { 0x1.fff2cfb0453d9p-1, 0x1.3360ccd23db3ap-11 }, + { 0x1.fff3661e9569dp-1, 0x1.266ea71d4f71ap-11 }, + { 0x1.fff3f634b79f9p-1, 0x1.19ff4663ae9dfp-11 }, + { 0x1.fff48032dbe40p-1, 0x1.0e0de78654d1ep-11 }, + { 0x1.fff50456dab8cp-1, 0x1.0295ef6591848p-11 }, + { 0x1.fff582dc48d30p-1, 0x1.ef25d37f49fe1p-12 }, + { 0x1.fff5fbfc8a439p-1, 0x1.da01102b5f851p-12 }, + { 0x1.fff66feee5129p-1, 0x1.c5b5412dcafadp-12 }, + { 0x1.fff6dee89352ep-1, 0x1.b23a5a23e4210p-12 }, + { 0x1.fff7491cd4af6p-1, 0x1.9f8893d8fd1c1p-12 }, + { 0x1.fff7aebcff755p-1, 0x1.8d986a4187285p-12 }, + { 0x1.fff80ff8911fdp-1, 0x1.7c629a822bc9ep-12 }, + { 0x1.fff86cfd3e657p-1, 0x1.6be02102b3520p-12 }, + { 0x1.fff8c5f702ccfp-1, 0x1.5c0a378c90bcap-12 }, + { 0x1.fff91b102fca8p-1, 0x1.4cda5374ea275p-12 }, + { 0x1.fff96c717b695p-1, 0x1.3e4a23d1f4702p-12 }, + { 0x1.fff9ba420e834p-1, 0x1.30538fbb77ecdp-12 }, + { 0x1.fffa04a7928b1p-1, 0x1.22f0b496539bdp-12 }, + { 0x1.fffa4bc63ee9ap-1, 0x1.161be46ad3b50p-12 }, + { 0x1.fffa8fc0e5f33p-1, 0x1.09cfa445b00ffp-12 }, + { 0x1.fffad0b901755p-1, 0x1.fc0d55470cf51p-13 }, + { 0x1.fffb0ecebee1bp-1, 0x1.e577bbcd49935p-13 }, + { 0x1.fffb4a210b172p-1, 0x1.cfd4a5adec5bfp-13 }, + { 0x1.fffb82cd9dcbfp-1, 0x1.bb1a9657ce465p-13 }, + { 0x1.fffbb8f1049c6p-1, 0x1.a740684026555p-13 }, + { 0x1.fffbeca6adbe9p-1, 0x1.943d4a1d1ed39p-13 }, + { 0x1.fffc1e08f25f5p-1, 0x1.8208bc334a6a5p-13 }, + { 0x1.fffc4d3120aa1p-1, 0x1.709a8db59f25cp-13 }, + { 0x1.fffc7a37857d2p-1, 0x1.5feada379d8b7p-13 }, + { 0x1.fffca53375ce3p-1, 0x1.4ff207314a102p-13 }, + { 0x1.fffcce3b57bffp-1, 0x1.40a8c1949f75ep-13 }, + { 0x1.fffcf564ab6b7p-1, 0x1.3207fb7420eb9p-13 }, + { 0x1.fffd1ac4135f9p-1, 0x1.2408e9ba3327fp-13 }, + { 0x1.fffd3e6d5cd87p-1, 0x1.16a501f0e42cap-13 }, + { 0x1.fffd607387b07p-1, 0x1.09d5f819c9e29p-13 }, + { 0x1.fffd80e8ce0dap-1, 0x1.fb2b792b40a22p-14 }, + { 0x1.fffd9fdeabccep-1, 0x1.e3bcf436a1a95p-14 }, + { 0x1.fffdbd65e5ad0p-1, 0x1.cd55277c18d05p-14 }, + { 0x1.fffdd98e903b2p-1, 0x1.b7e94604479dcp-14 }, + { 0x1.fffdf46816833p-1, 0x1.a36eec00926ddp-14 }, + { 0x1.fffe0e0140857p-1, 0x1.8fdc1b2dcf7b9p-14 }, + { 0x1.fffe26683972ap-1, 0x1.7d2737527c3f9p-14 }, + { 0x1.fffe3daa95b18p-1, 0x1.6b4702d7d5849p-14 }, + { 0x1.fffe53d558ae9p-1, 0x1.5a329b7d30748p-14 }, + { 0x1.fffe68f4fa777p-1, 0x1.49e17724f4d41p-14 }, + { 0x1.fffe7d156d244p-1, 0x1.3a4b60ba9aa4dp-14 }, + { 0x1.fffe904222101p-1, 0x1.2b6875310f785p-14 }, + { 0x1.fffea2860ee1ep-1, 0x1.1d312098e9dbap-14 }, + { 0x1.fffeb3ebb267bp-1, 0x1.0f9e1b4dd36dfp-14 }, + { 0x1.fffec47d19457p-1, 0x1.02a8673a94691p-14 }, + { 0x1.fffed443e2787p-1, 0x1.ec929a665b449p-15 }, + { 0x1.fffee34943b15p-1, 0x1.d4f4b4c8e09edp-15 }, + { 0x1.fffef1960d85dp-1, 0x1.be6abbb10a5aap-15 }, + { 0x1.fffeff32af7afp-1, 0x1.a8e8cc1fadef6p-15 }, + { 0x1.ffff0c273bea2p-1, 0x1.94637d5bacfdbp-15 }, + { 0x1.ffff187b6bc0ep-1, 0x1.80cfdc72220cfp-15 }, + { 0x1.ffff2436a21dcp-1, 0x1.6e2367dc27f95p-15 }, + { 0x1.ffff2f5fefcaap-1, 0x1.5c540b4936fd2p-15 }, + { 0x1.ffff39fe16963p-1, 0x1.4b581b8d170fcp-15 }, + { 0x1.ffff44178c8d2p-1, 0x1.3b2652b06c2b2p-15 }, + { 0x1.ffff4db27f146p-1, 0x1.2bb5cc22e5db6p-15 }, + { 0x1.ffff56d4d5e5ep-1, 0x1.1cfe010e2052dp-15 }, + { 0x1.ffff5f8435efcp-1, 0x1.0ef6c4c84a0fep-15 }, + { 0x1.ffff67c604180p-1, 0x1.01984165a5f36p-15 }, + { 0x1.ffff6f9f67e55p-1, 0x1.e9b5e8d00ce76p-16 }, + { 0x1.ffff77154e0d6p-1, 0x1.d16f5716c6c1ap-16 }, + { 0x1.ffff7e2c6aea2p-1, 0x1.ba4f035d60e02p-16 }, + { 0x1.ffff84e93cd75p-1, 0x1.a447b7b03f045p-16 }, + { 0x1.ffff8b500e77cp-1, 
0x1.8f4ccca7fc90dp-16 }, + { 0x1.ffff9164f8e46p-1, 0x1.7b5223dac7336p-16 }, + { 0x1.ffff972be5c59p-1, 0x1.684c227fcacefp-16 }, + { 0x1.ffff9ca891572p-1, 0x1.562fac4329b48p-16 }, + { 0x1.ffffa1de8c582p-1, 0x1.44f21e49054f2p-16 }, + { 0x1.ffffa6d13de73p-1, 0x1.34894a5e24657p-16 }, + { 0x1.ffffab83e54b8p-1, 0x1.24eb7254ccf83p-16 }, + { 0x1.ffffaff99bac4p-1, 0x1.160f438c70913p-16 }, + { 0x1.ffffb43555b5fp-1, 0x1.07ebd2a2d2844p-16 }, + { 0x1.ffffb839e52f3p-1, 0x1.f4f12e9ab070ap-17 }, + { 0x1.ffffbc09fa7cdp-1, 0x1.db5ad0b27805cp-17 }, + { 0x1.ffffbfa82616bp-1, 0x1.c304efa2c6f4ep-17 }, + { 0x1.ffffc316d9ed0p-1, 0x1.abe09e9144b5ep-17 }, + { 0x1.ffffc6586abf6p-1, 0x1.95df988e76644p-17 }, + { 0x1.ffffc96f1165ep-1, 0x1.80f439b4ee04bp-17 }, + { 0x1.ffffcc5cec0c1p-1, 0x1.6d11788a69c64p-17 }, + { 0x1.ffffcf23ff5fcp-1, 0x1.5a2adfa0b4bc4p-17 }, + { 0x1.ffffd1c637b2bp-1, 0x1.4834877429b8fp-17 }, + { 0x1.ffffd4456a10dp-1, 0x1.37231085c7d9ap-17 }, + { 0x1.ffffd6a3554a1p-1, 0x1.26eb9daed6f7ep-17 }, + { 0x1.ffffd8e1a2f22p-1, 0x1.1783ceac28910p-17 }, + { 0x1.ffffdb01e8546p-1, 0x1.08e1badf0fcedp-17 }, + { 0x1.ffffdd05a75eap-1, 0x1.f5f7d88472604p-18 }, + { 0x1.ffffdeee4f810p-1, 0x1.db92b5212fb8dp-18 }, + { 0x1.ffffe0bd3e852p-1, 0x1.c282cd3957edap-18 }, + { 0x1.ffffe273c15b7p-1, 0x1.aab7abace48dcp-18 }, + { 0x1.ffffe41314e06p-1, 0x1.94219bfcb4928p-18 }, + { 0x1.ffffe59c6698bp-1, 0x1.7eb1a2075864dp-18 }, + { 0x1.ffffe710d565ep-1, 0x1.6a597219a93d9p-18 }, + { 0x1.ffffe8717232dp-1, 0x1.570b69502f313p-18 }, + { 0x1.ffffe9bf4098cp-1, 0x1.44ba864670882p-18 }, + { 0x1.ffffeafb377d5p-1, 0x1.335a62115bce2p-18 }, + { 0x1.ffffec2641a9ep-1, 0x1.22df298214423p-18 }, + { 0x1.ffffed413e5b7p-1, 0x1.133d96ae7e0ddp-18 }, + { 0x1.ffffee4d01cd6p-1, 0x1.046aeabcfcdecp-18 }, + { 0x1.ffffef4a55bd4p-1, 0x1.ecb9cfe1d8642p-19 }, + { 0x1.fffff039f9e8fp-1, 0x1.d21397ead99cbp-19 }, + { 0x1.fffff11ca4876p-1, 0x1.b8d094c86d374p-19 }, + { 0x1.fffff1f302bc1p-1, 0x1.a0df0f0c626dcp-19 }, + { 0x1.fffff2bdb904dp-1, 0x1.8a2e269750a39p-19 }, + { 0x1.fffff37d63a36p-1, 0x1.74adc8f4064d3p-19 }, + { 0x1.fffff43297019p-1, 0x1.604ea819f007cp-19 }, + { 0x1.fffff4dde0118p-1, 0x1.4d0231928c6f9p-19 }, + { 0x1.fffff57fc4a95p-1, 0x1.3aba85fe22e1fp-19 }, + { 0x1.fffff618c3da6p-1, 0x1.296a70f414053p-19 }, + { 0x1.fffff6a956450p-1, 0x1.1905613b3abf2p-19 }, + { 0x1.fffff731ee681p-1, 0x1.097f6156f32c5p-19 }, + { 0x1.fffff7b2f8ed6p-1, 0x1.f59a20caf6695p-20 }, + { 0x1.fffff82cdcf1bp-1, 0x1.d9c73698fb1dcp-20 }, + { 0x1.fffff89ffc4aap-1, 0x1.bf716c6168baep-20 }, + { 0x1.fffff90cb3c81p-1, 0x1.a6852c6b58392p-20 }, + { 0x1.fffff9735b73bp-1, 0x1.8eefd70594a88p-20 }, + { 0x1.fffff9d446cccp-1, 0x1.789fb715aae95p-20 }, + { 0x1.fffffa2fc5015p-1, 0x1.6383f726a8e04p-20 }, + { 0x1.fffffa8621251p-1, 0x1.4f8c96f26a26ap-20 }, + { 0x1.fffffad7a2652p-1, 0x1.3caa61607f920p-20 }, + { 0x1.fffffb248c39dp-1, 0x1.2acee2f5ecdb8p-20 }, + { 0x1.fffffb6d1e95dp-1, 0x1.19ec60b1242edp-20 }, + { 0x1.fffffbb196132p-1, 0x1.09f5cf4dd2877p-20 }, + { 0x1.fffffbf22c1e2p-1, 0x1.f5bd95d8730d8p-21 }, + { 0x1.fffffc2f171e3p-1, 0x1.d9371e2ff7c35p-21 }, + { 0x1.fffffc688a9cfp-1, 0x1.be41de54d155ap-21 }, + { 0x1.fffffc9eb76acp-1, 0x1.a4c89e08ef4f3p-21 }, + { 0x1.fffffcd1cbc28p-1, 0x1.8cb738399b12cp-21 }, + { 0x1.fffffd01f36afp-1, 0x1.75fa8dbc84becp-21 }, + { 0x1.fffffd2f57d68p-1, 0x1.608078a70dcbcp-21 }, + { 0x1.fffffd5a2041fp-1, 0x1.4c37c0394d094p-21 }, + { 0x1.fffffd8271d12p-1, 0x1.39100d5687bfep-21 }, + { 0x1.fffffda86faa9p-1, 0x1.26f9df8519bd6p-21 }, + { 0x1.fffffdcc3b117p-1, 0x1.15e6827001f18p-21 }, + { 
0x1.fffffdedf37edp-1, 0x1.05c803e4831c1p-21 }, + { 0x1.fffffe0db6b91p-1, 0x1.ed22548cffd35p-22 }, + { 0x1.fffffe2ba0ea5p-1, 0x1.d06ad6ecdf971p-22 }, + { 0x1.fffffe47ccb60p-1, 0x1.b551c847fbc96p-22 }, + { 0x1.fffffe62534d4p-1, 0x1.9bc09f112b494p-22 }, + { 0x1.fffffe7b4c81ep-1, 0x1.83a1ff0aa239dp-22 }, + { 0x1.fffffe92ced93p-1, 0x1.6ce1aa3fd7bddp-22 }, + { 0x1.fffffea8ef9cfp-1, 0x1.576c72b514859p-22 }, + { 0x1.fffffebdc2ec6p-1, 0x1.43302cc4a0da8p-22 }, + { 0x1.fffffed15bcbap-1, 0x1.301ba221dc9bbp-22 }, + { 0x1.fffffee3cc32cp-1, 0x1.1e1e857adc568p-22 }, + { 0x1.fffffef5251c2p-1, 0x1.0d2966b1746f7p-22 }, + { 0x1.ffffff0576917p-1, 0x1.fa5b4f49cc6b2p-23 }, + { 0x1.ffffff14cfb92p-1, 0x1.dc3ae30b55c16p-23 }, + { 0x1.ffffff233ee1dp-1, 0x1.bfd7555a3bd68p-23 }, + { 0x1.ffffff30d18e8p-1, 0x1.a517d9e61628ap-23 }, + { 0x1.ffffff3d9480fp-1, 0x1.8be4f8f6c951fp-23 }, + { 0x1.ffffff4993c46p-1, 0x1.74287ded49339p-23 }, + { 0x1.ffffff54dab72p-1, 0x1.5dcd669f2cd34p-23 }, + { 0x1.ffffff5f74141p-1, 0x1.48bfd38302870p-23 }, + { 0x1.ffffff6969fb8p-1, 0x1.34ecf8a3c124ap-23 }, + { 0x1.ffffff72c5fb6p-1, 0x1.22430f521cbcfp-23 }, + { 0x1.ffffff7b91176p-1, 0x1.10b1488aeb235p-23 }, + { 0x1.ffffff83d3d07p-1, 0x1.0027c00a263a6p-23 }, + { 0x1.ffffff8b962bep-1, 0x1.e12ee004efc37p-24 }, + { 0x1.ffffff92dfba2p-1, 0x1.c3e44ae32b16bp-24 }, + { 0x1.ffffff99b79d2p-1, 0x1.a854ea14102a8p-24 }, + { 0x1.ffffffa0248e8p-1, 0x1.8e6761569f45dp-24 }, + { 0x1.ffffffa62ce54p-1, 0x1.7603bac345f65p-24 }, + { 0x1.ffffffabd69b4p-1, 0x1.5f1353cdad001p-24 }, + { 0x1.ffffffb127525p-1, 0x1.4980cb3c80949p-24 }, + { 0x1.ffffffb624592p-1, 0x1.3537f00b6ad4dp-24 }, + { 0x1.ffffffbad2affp-1, 0x1.2225b12bffc68p-24 }, + { 0x1.ffffffbf370cdp-1, 0x1.10380e1adb7e9p-24 }, + { 0x1.ffffffc355dfdp-1, 0x1.febc107d5efaap-25 }, + { 0x1.ffffffc733572p-1, 0x1.df0f2a0ee6946p-25 }, + { 0x1.ffffffcad3626p-1, 0x1.c14b2188bcee4p-25 }, + { 0x1.ffffffce39b67p-1, 0x1.a553644f7f07dp-25 }, + { 0x1.ffffffd169d0cp-1, 0x1.8b0cfce0579dfp-25 }, + { 0x1.ffffffd466fa5p-1, 0x1.725e7c5dd20f7p-25 }, + { 0x1.ffffffd7344aap-1, 0x1.5b2fe547a1340p-25 }, + { 0x1.ffffffd9d4aabp-1, 0x1.456a974e92e93p-25 }, + { 0x1.ffffffdc4ad7ap-1, 0x1.30f93c3699078p-25 }, + { 0x1.ffffffde9964ep-1, 0x1.1dc7b5b978cf8p-25 }, + { 0x1.ffffffe0c2bf0p-1, 0x1.0bc30c5d52f15p-25 }, + { 0x1.ffffffe2c92dbp-1, 0x1.f5b2be65a0c7fp-26 }, + { 0x1.ffffffe4aed5ep-1, 0x1.d5f3a8dea7357p-26 }, + { 0x1.ffffffe675bbdp-1, 0x1.b82915b03515bp-26 }, + { 0x1.ffffffe81fc4ep-1, 0x1.9c3517e789488p-26 }, + { 0x1.ffffffe9aeb97p-1, 0x1.81fb7df06136ep-26 }, + { 0x1.ffffffeb24467p-1, 0x1.6961b8d641d06p-26 }, + { 0x1.ffffffec81ff2p-1, 0x1.524ec4d916caep-26 }, + { 0x1.ffffffedc95e7p-1, 0x1.3cab1343d18d1p-26 }, + { 0x1.ffffffeefbc85p-1, 0x1.2860757487a01p-26 }, + { 0x1.fffffff01a8b6p-1, 0x1.155a09065d4f7p-26 }, + { 0x1.fffffff126e1ep-1, 0x1.0384250e4c9fcp-26 }, + { 0x1.fffffff221f30p-1, 0x1.e59890b926c78p-27 }, + { 0x1.fffffff30cd3fp-1, 0x1.c642116a8a9e3p-27 }, + { 0x1.fffffff3e8892p-1, 0x1.a8e405e651ab6p-27 }, + { 0x1.fffffff4b606fp-1, 0x1.8d5f98114f872p-27 }, + { 0x1.fffffff57632dp-1, 0x1.7397c5a66e307p-27 }, + { 0x1.fffffff629e44p-1, 0x1.5b71456c5a4c4p-27 }, + { 0x1.fffffff6d1e56p-1, 0x1.44d26de513197p-27 }, + { 0x1.fffffff76ef3fp-1, 0x1.2fa31d6371537p-27 }, + { 0x1.fffffff801c1fp-1, 0x1.1bcca373b7b43p-27 }, + { 0x1.fffffff88af67p-1, 0x1.0939ab853339fp-27 }, + { 0x1.fffffff90b2e3p-1, 0x1.efac5187b2863p-28 }, + { 0x1.fffffff982fc1p-1, 0x1.cf1e86235d0e6p-28 }, + { 0x1.fffffff9f2e9fp-1, 0x1.b0a68a2128babp-28 }, + { 0x1.fffffffa5b790p-1, 
0x1.9423165bc4444p-28 }, + { 0x1.fffffffabd229p-1, 0x1.7974e743dea3cp-28 }, + { 0x1.fffffffb18582p-1, 0x1.607e9eacd1050p-28 }, + { 0x1.fffffffb6d844p-1, 0x1.4924a74dec728p-28 }, + { 0x1.fffffffbbd0aap-1, 0x1.334d19e0c2160p-28 }, + { 0x1.fffffffc0748fp-1, 0x1.1edfa3c5f5ccap-28 }, + { 0x1.fffffffc4c96cp-1, 0x1.0bc56f1b54701p-28 }, + { 0x1.fffffffc8d462p-1, 0x1.f3d2185e047d9p-29 }, + { 0x1.fffffffcc9a41p-1, 0x1.d26cb87945e87p-29 }, + { 0x1.fffffffd01f89p-1, 0x1.b334fac4b9f99p-29 }, + { 0x1.fffffffd36871p-1, 0x1.96076f7918d1cp-29 }, + { 0x1.fffffffd678edp-1, 0x1.7ac2d72fc2c63p-29 }, + { 0x1.fffffffd954aep-1, 0x1.614801550319ep-29 }, + { 0x1.fffffffdbff2ap-1, 0x1.4979ac8b28926p-29 }, + { 0x1.fffffffde7ba0p-1, 0x1.333c68e2d0548p-29 }, + { 0x1.fffffffe0cd16p-1, 0x1.1e767bce37dd7p-29 }, + { 0x1.fffffffe2f664p-1, 0x1.0b0fc5b6d05a0p-29 }, + { 0x1.fffffffe4fa30p-1, 0x1.f1e3523b41d7dp-30 }, + { 0x1.fffffffe6daf7p-1, 0x1.d00de6608effep-30 }, + { 0x1.fffffffe89b0cp-1, 0x1.b0778b7b3301ap-30 }, + { 0x1.fffffffea3c9ap-1, 0x1.92fb04ec0f6cfp-30 }, + { 0x1.fffffffebc1a9p-1, 0x1.77756ec9f78fap-30 }, + { 0x1.fffffffed2c21p-1, 0x1.5dc61922d5a06p-30 }, + { 0x1.fffffffee7dc8p-1, 0x1.45ce65699ff6dp-30 }, + { 0x1.fffffffefb847p-1, 0x1.2f71a5f159970p-30 }, + { 0x1.ffffffff0dd2bp-1, 0x1.1a94ff571654fp-30 }, + { 0x1.ffffffff1ede9p-1, 0x1.071f4bbea09ecp-30 }, + { 0x1.ffffffff2ebdap-1, 0x1.e9f1ff8ddd774p-31 }, + { 0x1.ffffffff3d843p-1, 0x1.c818223a202c7p-31 }, + { 0x1.ffffffff4b453p-1, 0x1.a887bd2b4404dp-31 }, + { 0x1.ffffffff58126p-1, 0x1.8b1a336c5eb6bp-31 }, + { 0x1.ffffffff63fc3p-1, 0x1.6fab63324088ap-31 }, + { 0x1.ffffffff6f121p-1, 0x1.56197e30205bap-31 }, + { 0x1.ffffffff79626p-1, 0x1.3e44e45301b92p-31 }, + { 0x1.ffffffff82fabp-1, 0x1.281000bfe4c3fp-31 }, + { 0x1.ffffffff8be77p-1, 0x1.135f28f2d50b4p-31 }, + { 0x1.ffffffff94346p-1, 0x1.00187dded5975p-31 }, + { 0x1.ffffffff9bec8p-1, 0x1.dc479de0ef001p-32 }, + { 0x1.ffffffffa319fp-1, 0x1.bad4fdad3caa1p-32 }, + { 0x1.ffffffffa9c63p-1, 0x1.9baed3ed27ab8p-32 }, + { 0x1.ffffffffaffa4p-1, 0x1.7ead9ce4285bbp-32 }, + { 0x1.ffffffffb5be5p-1, 0x1.63ac6b4edc88ep-32 }, + { 0x1.ffffffffbb1a2p-1, 0x1.4a88be2a6390cp-32 }, + { 0x1.ffffffffc014ep-1, 0x1.332259185f1a0p-32 }, + { 0x1.ffffffffc4b56p-1, 0x1.1d5b1f3793044p-32 }, + { 0x1.ffffffffc901cp-1, 0x1.0916f04b6e18bp-32 }, + { 0x1.ffffffffccfffp-1, 0x1.ec77101de6926p-33 }, + { 0x1.ffffffffd0b56p-1, 0x1.c960bf23153e0p-33 }, + { 0x1.ffffffffd4271p-1, 0x1.a8bd20fc65ef7p-33 }, + { 0x1.ffffffffd759dp-1, 0x1.8a61745ec7d1dp-33 }, + { 0x1.ffffffffda520p-1, 0x1.6e25d0e756261p-33 }, + { 0x1.ffffffffdd13cp-1, 0x1.53e4f7d1666cbp-33 }, + { 0x1.ffffffffdfa2dp-1, 0x1.3b7c27a7ddb0ep-33 }, + { 0x1.ffffffffe202dp-1, 0x1.24caf2c32af14p-33 }, + { 0x1.ffffffffe4371p-1, 0x1.0fb3186804d0fp-33 }, + { 0x1.ffffffffe642ap-1, 0x1.f830c0bb41fd7p-34 }, + { 0x1.ffffffffe8286p-1, 0x1.d3c0f1a91c846p-34 }, + { 0x1.ffffffffe9eb0p-1, 0x1.b1e5acf351d87p-34 }, + { 0x1.ffffffffeb8d0p-1, 0x1.92712d259ce66p-34 }, + { 0x1.ffffffffed10ap-1, 0x1.7538c60a04476p-34 }, + { 0x1.ffffffffee782p-1, 0x1.5a14b04b47879p-34 }, + { 0x1.ffffffffefc57p-1, 0x1.40dfd87456f4cp-34 }, + { 0x1.fffffffff0fa7p-1, 0x1.2977b1172b9d5p-34 }, + { 0x1.fffffffff218fp-1, 0x1.13bc07e891491p-34 }, + { 0x1.fffffffff3227p-1, 0x1.ff1dbb4300811p-35 }, + { 0x1.fffffffff4188p-1, 0x1.d9a880f306bd8p-35 }, + { 0x1.fffffffff4fc9p-1, 0x1.b6e45220b55e0p-35 }, + { 0x1.fffffffff5cfdp-1, 0x1.96a0b33f2c4dap-35 }, + { 0x1.fffffffff6939p-1, 0x1.78b07e9e924acp-35 }, + { 0x1.fffffffff748ep-1, 0x1.5ce9ab1670dd2p-35 }, + { 
0x1.fffffffff7f0dp-1, 0x1.4325167006bb0p-35 }, + { 0x1.fffffffff88c5p-1, 0x1.2b3e53538ff3fp-35 }, + { 0x1.fffffffff91c6p-1, 0x1.15137a7f44864p-35 }, + { 0x1.fffffffff9a1bp-1, 0x1.0084ff125639dp-35 }, + { 0x1.fffffffffa1d2p-1, 0x1.daeb0b7311ec7p-36 }, + { 0x1.fffffffffa8f6p-1, 0x1.b7937d1c40c52p-36 }, + { 0x1.fffffffffaf92p-1, 0x1.96d082f59ab06p-36 }, + { 0x1.fffffffffb5b0p-1, 0x1.7872d9fa10aadp-36 }, + { 0x1.fffffffffbb58p-1, 0x1.5c4e8e37bc7d0p-36 }, + { 0x1.fffffffffc095p-1, 0x1.423ac0df49a40p-36 }, + { 0x1.fffffffffc56dp-1, 0x1.2a117230ad284p-36 }, + { 0x1.fffffffffc9e8p-1, 0x1.13af4f04f9998p-36 }, + { 0x1.fffffffffce0dp-1, 0x1.fde703724e560p-37 }, + { 0x1.fffffffffd1e1p-1, 0x1.d77f0c82e7641p-37 }, + { 0x1.fffffffffd56cp-1, 0x1.b3ee02611d7ddp-37 }, + { 0x1.fffffffffd8b3p-1, 0x1.92ff33023d5bdp-37 }, + { 0x1.fffffffffdbbap-1, 0x1.7481a9e69f53fp-37 }, + { 0x1.fffffffffde86p-1, 0x1.5847eda620959p-37 }, + { 0x1.fffffffffe11dp-1, 0x1.3e27c1fcc74bdp-37 }, + { 0x1.fffffffffe380p-1, 0x1.25f9ee0b923dcp-37 }, + { 0x1.fffffffffe5b6p-1, 0x1.0f9a0686531ffp-37 }, + { 0x1.fffffffffe7c0p-1, 0x1.f5cc7718082afp-38 }, + { 0x1.fffffffffe9a2p-1, 0x1.cf7e53d6a2ca5p-38 }, + { 0x1.fffffffffeb60p-1, 0x1.ac0f5f3229372p-38 }, + { 0x1.fffffffffecfbp-1, 0x1.8b498644847eap-38 }, + { 0x1.fffffffffee77p-1, 0x1.6cfa9bcca59dcp-38 }, + { 0x1.fffffffffefd6p-1, 0x1.50f411d4fd2cdp-38 }, + { 0x1.ffffffffff11ap-1, 0x1.370ab8327af5ep-38 }, + { 0x1.ffffffffff245p-1, 0x1.1f167f88c6b6ep-38 }, + { 0x1.ffffffffff359p-1, 0x1.08f24085d4597p-38 }, + { 0x1.ffffffffff457p-1, 0x1.e8f70e181d619p-39 }, + { 0x1.ffffffffff542p-1, 0x1.c324c20e337dcp-39 }, + { 0x1.ffffffffff61bp-1, 0x1.a03261574b54ep-39 }, + { 0x1.ffffffffff6e3p-1, 0x1.7fe903cdf5855p-39 }, + { 0x1.ffffffffff79bp-1, 0x1.6215c58da3450p-39 }, + { 0x1.ffffffffff845p-1, 0x1.46897d4b69fc6p-39 }, + { 0x1.ffffffffff8e2p-1, 0x1.2d1877d731b7bp-39 }, + { 0x1.ffffffffff973p-1, 0x1.159a386b11517p-39 }, + { 0x1.ffffffffff9f8p-1, 0x1.ffd27ae9393cep-40 }, + { 0x1.ffffffffffa73p-1, 0x1.d7c593130dd0bp-40 }, + { 0x1.ffffffffffae4p-1, 0x1.b2cd607c79bcfp-40 }, + { 0x1.ffffffffffb4cp-1, 0x1.90ae4d3405651p-40 }, + { 0x1.ffffffffffbadp-1, 0x1.71312dd1759e2p-40 }, + { 0x1.ffffffffffc05p-1, 0x1.5422ef5d8949dp-40 }, + { 0x1.ffffffffffc57p-1, 0x1.39544b0ecc957p-40 }, + { 0x1.ffffffffffca2p-1, 0x1.20997f73e73ddp-40 }, + { 0x1.ffffffffffce7p-1, 0x1.09ca0eaacd277p-40 }, + { 0x1.ffffffffffd27p-1, 0x1.e9810295890ecp-41 }, + { 0x1.ffffffffffd62p-1, 0x1.c2b45b5aa4a1dp-41 }, + { 0x1.ffffffffffd98p-1, 0x1.9eee068fa7596p-41 }, + { 0x1.ffffffffffdcap-1, 0x1.7df2b399c10a8p-41 }, + { 0x1.ffffffffffdf8p-1, 0x1.5f8b87a31bd85p-41 }, + { 0x1.ffffffffffe22p-1, 0x1.4385c96e9a2d9p-41 }, + { 0x1.ffffffffffe49p-1, 0x1.29b2933ef4cbcp-41 }, + { 0x1.ffffffffffe6cp-1, 0x1.11e68a6378f8ap-41 }, + { 0x1.ffffffffffe8dp-1, 0x1.f7f338086a86bp-42 }, + { 0x1.ffffffffffeabp-1, 0x1.cf8d7d9ce040ap-42 }, + { 0x1.ffffffffffec7p-1, 0x1.aa577251ae484p-42 }, + { 0x1.ffffffffffee1p-1, 0x1.8811d739efb5ep-42 }, + { 0x1.ffffffffffef8p-1, 0x1.68823e52970bep-42 }, + { 0x1.fffffffffff0ep-1, 0x1.4b72ae68e8b4cp-42 }, + { 0x1.fffffffffff22p-1, 0x1.30b14dbe876bcp-42 }, + { 0x1.fffffffffff34p-1, 0x1.181012ef86610p-42 }, + { 0x1.fffffffffff45p-1, 0x1.01647ba798744p-42 }, + { 0x1.fffffffffff54p-1, 0x1.d90e917701675p-43 }, + { 0x1.fffffffffff62p-1, 0x1.b2a87e86d0c8ap-43 }, + { 0x1.fffffffffff6fp-1, 0x1.8f53dcb377293p-43 }, + { 0x1.fffffffffff7bp-1, 0x1.6ed2f2515e933p-43 }, + { 0x1.fffffffffff86p-1, 0x1.50ecc9ed47f19p-43 }, + { 0x1.fffffffffff90p-1, 
0x1.356cd5ce7799ep-43 }, + { 0x1.fffffffffff9ap-1, 0x1.1c229a587ab78p-43 }, + { 0x1.fffffffffffa2p-1, 0x1.04e15ecc7f3f6p-43 }, + { 0x1.fffffffffffaap-1, 0x1.deffc7e6a6017p-44 }, + { 0x1.fffffffffffb1p-1, 0x1.b7b040832f310p-44 }, + { 0x1.fffffffffffb8p-1, 0x1.938e021f36d76p-44 }, + { 0x1.fffffffffffbep-1, 0x1.7258610b3b233p-44 }, + { 0x1.fffffffffffc3p-1, 0x1.53d3bfc82a909p-44 }, + { 0x1.fffffffffffc8p-1, 0x1.37c92babdc2fdp-44 }, + { 0x1.fffffffffffcdp-1, 0x1.1e06010120f6ap-44 }, + { 0x1.fffffffffffd1p-1, 0x1.065b9616170d4p-44 }, + { 0x1.fffffffffffd5p-1, 0x1.e13dd96b3753ap-45 }, + { 0x1.fffffffffffd9p-1, 0x1.b950d32467392p-45 }, + { 0x1.fffffffffffdcp-1, 0x1.94a72263259a5p-45 }, + { 0x1.fffffffffffdfp-1, 0x1.72fd93e036cdcp-45 }, + { 0x1.fffffffffffe2p-1, 0x1.54164576929abp-45 }, + { 0x1.fffffffffffe4p-1, 0x1.37b83c521fe96p-45 }, + { 0x1.fffffffffffe7p-1, 0x1.1daf033182e96p-45 }, + { 0x1.fffffffffffe9p-1, 0x1.05ca50205d26ap-45 }, + { 0x1.fffffffffffebp-1, 0x1.dfbb6235639fap-46 }, + { 0x1.fffffffffffedp-1, 0x1.b7807e294781fp-46 }, + { 0x1.fffffffffffeep-1, 0x1.9298add70a734p-46 }, + { 0x1.ffffffffffff0p-1, 0x1.70beaf9c7ffb6p-46 }, + { 0x1.ffffffffffff1p-1, 0x1.51b2cd6709222p-46 }, + { 0x1.ffffffffffff3p-1, 0x1.353a6cf7f7fffp-46 }, + { 0x1.ffffffffffff4p-1, 0x1.1b1fa8cbe84a7p-46 }, + { 0x1.ffffffffffff5p-1, 0x1.0330f0fd69921p-46 }, + { 0x1.ffffffffffff6p-1, 0x1.da81670f96f9bp-47 }, + { 0x1.ffffffffffff7p-1, 0x1.b24a16b4d09aap-47 }, + { 0x1.ffffffffffff7p-1, 0x1.8d6eeb6efdbd6p-47 }, + { 0x1.ffffffffffff8p-1, 0x1.6ba91ac734785p-47 }, + { 0x1.ffffffffffff9p-1, 0x1.4cb7966770ab5p-47 }, + { 0x1.ffffffffffff9p-1, 0x1.305e9721d0981p-47 }, + { 0x1.ffffffffffffap-1, 0x1.1667311fff70ap-47 }, + { 0x1.ffffffffffffbp-1, 0x1.fd3de10d62855p-48 }, + { 0x1.ffffffffffffbp-1, 0x1.d1aefbcd48d0cp-48 }, + { 0x1.ffffffffffffbp-1, 0x1.a9cc93c25aca9p-48 }, + { 0x1.ffffffffffffcp-1, 0x1.85487ee3ea735p-48 }, + { 0x1.ffffffffffffcp-1, 0x1.63daf8b4b1e0cp-48 }, + { 0x1.ffffffffffffdp-1, 0x1.45421e69a6ca1p-48 }, + { 0x1.ffffffffffffdp-1, 0x1.294175802d99ap-48 }, + { 0x1.ffffffffffffdp-1, 0x1.0fa17bf41068fp-48 }, + { 0x1.ffffffffffffdp-1, 0x1.f05e82aae2bb9p-49 }, + { 0x1.ffffffffffffep-1, 0x1.c578101b29058p-49 }, + { 0x1.ffffffffffffep-1, 0x1.9e39dc5dd2f7cp-49 }, + { 0x1.ffffffffffffep-1, 0x1.7a553a728bbf2p-49 }, + { 0x1.ffffffffffffep-1, 0x1.5982008db1304p-49 }, + { 0x1.ffffffffffffep-1, 0x1.3b7e00422e51bp-49 }, + { 0x1.ffffffffffffep-1, 0x1.200c898d9ee3ep-49 }, + { 0x1.fffffffffffffp-1, 0x1.06f5f7eb65a56p-49 }, + { 0x1.fffffffffffffp-1, 0x1.e00e9148a1d25p-50 }, + { 0x1.fffffffffffffp-1, 0x1.b623734024e92p-50 }, + { 0x1.fffffffffffffp-1, 0x1.8fd4e01891bf8p-50 }, + { 0x1.fffffffffffffp-1, 0x1.6cd44c7470d89p-50 }, + { 0x1.fffffffffffffp-1, 0x1.4cd9c04158cd7p-50 }, + { 0x1.fffffffffffffp-1, 0x1.2fa34bf5c8344p-50 }, + { 0x1.fffffffffffffp-1, 0x1.14f4890ff2461p-50 }, + { 0x1.fffffffffffffp-1, 0x1.f92c49dfa4df5p-51 }, + { 0x1.fffffffffffffp-1, 0x1.ccaaea71ab0dfp-51 }, + { 0x1.fffffffffffffp-1, 0x1.a40829f001197p-51 }, + { 0x1.0000000000000p+0, 0x1.7eef13b59e96cp-51 }, + { 0x1.0000000000000p+0, 0x1.5d11e1a252bf5p-51 }, + { 0x1.0000000000000p+0, 0x1.3e296303b2297p-51 }, + { 0x1.0000000000000p+0, 0x1.21f47009f43cep-51 }, + { 0x1.0000000000000p+0, 0x1.083768c5e4541p-51 }, + { 0x1.0000000000000p+0, 0x1.e1777d831265ep-52 }, + { 0x1.0000000000000p+0, 0x1.b69f10b0191b5p-52 }, + { 0x1.0000000000000p+0, 0x1.8f8a3a05b5b52p-52 }, + { 0x1.0000000000000p+0, 0x1.6be573c40c8e7p-52 }, + { 0x1.0000000000000p+0, 0x1.4b645ba991fdbp-52 }, + { 
0x1.0000000000000p+0, 0x1.2dc119095729fp-52 }, + }, +}; diff --git a/math/aarch64/v_erfc_data.c b/math/aarch64/v_erfc_data.c new file mode 100644 index 000000000000..6acd96f74be5 --- /dev/null +++ b/math/aarch64/v_erfc_data.c @@ -0,0 +1,3507 @@ +/* + * Data used in double-precision erfc(x) function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Lookup table used in vector erfc. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = ~27.0 (3488 values): + - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r), + - the second entry __v_erfc_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore + they are scaled by a large enough value 2^128 (fits in 8 bits). */ +const struct v_erfc_data __v_erfc_data = { + .tab = { { 0x1p128, 0x1.20dd750429b6dp128 }, + { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 }, + { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 }, + { 0x1.f27640f9853d9p127, 0x1.20b4d8bac36c1p128 }, + { 0x1.edf3a9ba22dadp127, 0x1.209546ad13ccfp128 }, + { 0x1.e971a2c4436aep127, 0x1.206cb4897b148p128 }, + { 0x1.e4f05010eca8cp127, 0x1.203b261cd0053p128 }, + { 0x1.e06fd58842c7ep127, 0x1.2000a00ae3804p128 }, + { 0x1.dbf056fe2df35p127, 0x1.1fbd27cdc72d3p128 }, + { 0x1.d771f82f02f4ep127, 0x1.1f70c3b4f2cc8p128 }, + { 0x1.d2f4dcbc2f894p127, 0x1.1f1b7ae44867fp128 }, + { 0x1.ce792828eae5cp127, 0x1.1ebd5552f795bp128 }, + { 0x1.c9fefdd6eaf19p127, 0x1.1e565bca400d4p128 }, + { 0x1.c58681031eb6ap127, 0x1.1de697e413d29p128 }, + { 0x1.c10fd4c26e896p127, 0x1.1d6e14099944ap128 }, + { 0x1.bc9b1bfe82687p127, 0x1.1cecdb718d61cp128 }, + { 0x1.b82879728f11ep127, 0x1.1c62fa1e869b6p128 }, + { 0x1.b3b80fa82a4bbp127, 0x1.1bd07cdd189acp128 }, + { 0x1.af4a00f426daap127, 0x1.1b357141d95d5p128 }, + { 0x1.aade6f7378a0ep127, 0x1.1a91e5a748165p128 }, + { 0x1.a6757d08215d8p127, 0x1.19e5e92b964abp128 }, + { 0x1.a20f4b5626818p127, 0x1.19318bae53a04p128 }, + { 0x1.9dabfbc090901p127, 0x1.1874ddcdfce24p128 }, + { 0x1.994baf66747adp127, 0x1.17aff0e56ec1p128 }, + { 0x1.94ee8720076b6p127, 0x1.16e2d7093cd8cp128 }, + { 0x1.9094a37bbd66ep127, 0x1.160da304ed92fp128 }, + { 0x1.8c3e24bb73372p127, 0x1.153068581b781p128 }, + { 0x1.87eb2ad1a4032p127, 0x1.144b3b337c90cp128 }, + { 0x1.839bd55eaafc8p127, 0x1.135e3075d076bp128 }, + { 0x1.7f5043ae11862p127, 0x1.12695da8b5bdep128 }, + { 0x1.7b0894b3ea35cp127, 0x1.116cd8fd67618p128 }, + { 0x1.76c4e70a390e7p127, 0x1.1068b94962e5ep128 }, + { 0x1.728558ee694fcp127, 0x1.0f5d1602f7e41p128 }, + { 0x1.6e4a083ed132fp127, 0x1.0e4a073dc1b91p128 }, + { 0x1.6a13127843ec1p127, 0x1.0d2fa5a70c168p128 }, + { 0x1.65e094b3b2413p127, 0x1.0c0e0a8223359p128 }, + { 0x1.61b2aba3da093p127, 0x1.0ae54fa490723p128 }, + { 0x1.5d89739304dcfp127, 0x1.09b58f724416bp128 }, + { 0x1.59650860d6469p127, 0x1.087ee4d9ad247p128 }, + { 0x1.5545858029b39p127, 0x1.07416b4fbfe7cp128 }, + { 0x1.512b05f5006e1p127, 0x1.05fd3ecbec298p128 }, + { 0x1.4d15a4527fdc7p127, 0x1.04b27bc403d3p128 }, + { 0x1.49057ab900447p127, 0x1.03613f2812dafp128 }, + { 0x1.44faa2d42c4ap127, 0x1.0209a65e29545p128 }, + { 0x1.40f535d93160ep127, 0x1.00abcf3e187a9p128 }, + { 0x1.3cf54c850162p127, 0x1.fe8fb01a47307p127 }, + { 0x1.38faff1aa574ap127, 0x1.fbbbbef34b4b2p127 }, + { 0x1.35066561a275dp127, 0x1.f8dc092d58ff8p127 }, + { 0x1.311796a46f064p127, 0x1.f5f0cdaf15313p127 }, + { 0x1.2d2ea9aefb636p127, 0x1.f2fa4c16c0019p127 }, + { 0x1.294bb4cd4b2bdp127,
0x1.eff8c4b1375dbp127 }, + { 0x1.256ecdca212ccp127, 0x1.ecec7870ebca8p127 }, + { 0x1.219809edbd524p127, 0x1.e9d5a8e4c934ep127 }, + { 0x1.1dc77dfcacd02p127, 0x1.e6b4982f158b9p127 }, + { 0x1.19fd3e36ac96ap127, 0x1.e38988fc46e72p127 }, + { 0x1.16395e559e218p127, 0x1.e054be79d3042p127 }, + { 0x1.127bf18c8eadcp127, 0x1.dd167c4cf9d2ap127 }, + { 0x1.0ec50a86d0dd4p127, 0x1.d9cf06898cdafp127 }, + { 0x1.0b14bb6728cd8p127, 0x1.d67ea1a8b5368p127 }, + { 0x1.076b15c70aa28p127, 0x1.d325927fb9d89p127 }, + { 0x1.03c82ab5eb831p127, 0x1.cfc41e36c7df9p127 }, + { 0x1.002c0ab8a5018p127, 0x1.cc5a8a3fbea4p127 }, + { 0x1.f92d8b91d5cc7p126, 0x1.c8e91c4d01368p127 }, + { 0x1.f210d6a9a6a31p126, 0x1.c5701a484ef9dp127 }, + { 0x1.eb02147ce245cp126, 0x1.c1efca49a5011p127 }, + { 0x1.e40161b701275p126, 0x1.be68728e29d5ep127 }, + { 0x1.dd0ed9ea4bdd6p126, 0x1.bada596f25436p127 }, + { 0x1.d62a978f7c957p126, 0x1.b745c55905bf8p127 }, + { 0x1.cf54b4058455fp126, 0x1.b3aafcc27502ep127 }, + { 0x1.c88d479173ccep126, 0x1.b00a46237d5bep127 }, + { 0x1.c1d4695e87644p126, 0x1.ac63e7ecc1411p127 }, + { 0x1.bb2a2f7e5652p126, 0x1.a8b8287ec6a09p127 }, + { 0x1.b48eaee924501p126, 0x1.a5074e215762p127 }, + { 0x1.ae01fb7e55a66p126, 0x1.a1519efaf889ep127 }, + { 0x1.a78428050527ep126, 0x1.9d97610879642p127 }, + { 0x1.a115462cbbc17p126, 0x1.99d8da149c13fp127 }, + { 0x1.9ab5668e4930ap126, 0x1.96164fafd8de3p127 }, + { 0x1.946498acbd766p126, 0x1.925007283d7aap127 }, + { 0x1.8e22eaf68291ep126, 0x1.8e86458169af8p127 }, + { 0x1.87f06ac6960c4p126, 0x1.8ab94f6caa71dp127 }, + { 0x1.81cd2465e1d96p126, 0x1.86e9694134b9ep127 }, + { 0x1.7bb9230cb40b4p126, 0x1.8316d6f48133dp127 }, + { 0x1.75b470e454d35p126, 0x1.7f41dc12c9e89p127 }, + { 0x1.6fbf1708ba47cp126, 0x1.7b6abbb7aaf19p127 }, + { 0x1.69d91d8a595dap126, 0x1.7791b886e7403p127 }, + { 0x1.64028b7013867p126, 0x1.73b714a552763p127 }, + { 0x1.5e3b66b9405a9p126, 0x1.6fdb11b1e0c34p127 }, + { 0x1.5883b45fd2b63p126, 0x1.6bfdf0beddaf5p127 }, + { 0x1.52db785a98acap126, 0x1.681ff24b4ab04p127 }, + { 0x1.4d42b59f95afap126, 0x1.6441563c665d4p127 }, + { 0x1.47b96e267647ap126, 0x1.60625bd75d07bp127 }, + { 0x1.423fa2eb1cb59p126, 0x1.5c8341bb23767p127 }, + { 0x1.3cd553f045d45p126, 0x1.58a445da7c74cp127 }, + { 0x1.377a8042458d1p126, 0x1.54c5a57629dbp127 }, + { 0x1.322f25f9da2fdp126, 0x1.50e79d1749ac9p127 }, + { 0x1.2cf3423f15fdfp126, 0x1.4d0a6889dfd9fp127 }, + { 0x1.27c6d14c5e341p126, 0x1.492e42d78d2c5p127 }, + { 0x1.22a9ce717edcbp126, 0x1.4553664273d24p127 }, + { 0x1.1d9c3416d2b4bp126, 0x1.417a0c4049fdp127 }, + { 0x1.189dfbc07e69p126, 0x1.3da26d759aef5p127 }, + { 0x1.13af1e11be721p126, 0x1.39ccc1b136d5ap127 }, + { 0x1.0ecf92d046d22p126, 0x1.35f93fe7d1b3dp127 }, + { 0x1.09ff50e7b3f93p126, 0x1.32281e2fd1a92p127 }, + { 0x1.053e4e6d0c10bp126, 0x1.2e5991bd4cbfcp127 }, + { 0x1.008c80a24ff1p126, 0x1.2a8dcede3673bp127 }, + { 0x1.f7d3b7f436013p125, 0x1.26c508f6bd0ffp127 }, + { 0x1.eeaca836a27ccp125, 0x1.22ff727dd6f7bp127 }, + { 0x1.e5a3b7c9b56dap125, 0x1.1f3d3cf9ffe5ap127 }, + { 0x1.dcb8cae2d747fp125, 0x1.1b7e98fe26217p127 }, + { 0x1.d3ebc436b0f26p125, 0x1.17c3b626c7a12p127 }, + { 0x1.cb3c8500ea349p125, 0x1.140cc3173f007p127 }, + { 0x1.c2aaed0bfcfeep125, 0x1.1059ed7740313p127 }, + { 0x1.ba36dab91c0e9p125, 0x1.0cab61f084b93p127 }, + { 0x1.b1e02b082b72p125, 0x1.09014c2ca74dap127 }, + { 0x1.a9a6b99fc973bp125, 0x1.055bd6d32e8d7p127 }, + { 0x1.a18a60d56673ep125, 0x1.01bb2b87c6968p127 }, + { 0x1.998af9b56a3aep125, 0x1.fc3ee5d1524bp126 }, + { 0x1.91a85c0b65519p125, 0x1.f511a91a67d2ap126 }, + { 0x1.89e25e6a4cef9p125, 0x1.edeeee0959518p126 }, + 
{ 0x1.8238d634c0127p125, 0x1.e6d6ffaa65a25p126 }, + { 0x1.7aab97a554544p125, 0x1.dfca26f5bbf88p126 }, + { 0x1.733a75d6e91b8p125, 0x1.d8c8aace11e63p126 }, + { 0x1.6be542ccffc2fp125, 0x1.d1d2cfff91594p126 }, + { 0x1.64abcf7c175b4p125, 0x1.cae8d93f1d7b7p126 }, + { 0x1.5d8debd20aacep125, 0x1.c40b0729ed548p126 }, + { 0x1.568b66be6f268p125, 0x1.bd3998457afdbp126 }, + { 0x1.4fa40e3af3674p125, 0x1.b674c8ffc6283p126 }, + { 0x1.48d7af53bc19fp125, 0x1.afbcd3afe8ab6p126 }, + { 0x1.4226162fbddd5p125, 0x1.a911f096fbc26p126 }, + { 0x1.3b8f0e1912f7p125, 0x1.a27455e14c93cp126 }, + { 0x1.351261854b991p125, 0x1.9be437a7de946p126 }, + { 0x1.2eafda1db784ap125, 0x1.9561c7f23a47bp126 }, + { 0x1.286740c7a7dabp125, 0x1.8eed36b886d93p126 }, + { 0x1.22385daca7f47p125, 0x1.8886b1e5ecfd1p126 }, + { 0x1.1c22f842ac1f2p125, 0x1.822e655b417e7p126 }, + { 0x1.1626d7543522p125, 0x1.7be47af1f5d89p126 }, + { 0x1.1043c1086777dp125, 0x1.75a91a7f4d2edp126 }, + { 0x1.0a797aeb152f2p125, 0x1.6f7c69d7d3ef8p126 }, + { 0x1.04c7c9f4b969p125, 0x1.695e8cd31867ep126 }, + { 0x1.fe5ce524c8ee5p124, 0x1.634fa54fa285fp126 }, + { 0x1.f35a715b2f3e1p124, 0x1.5d4fd33729015p126 }, + { 0x1.e887bf681f218p124, 0x1.575f3483021c3p126 }, + { 0x1.dde4553ef94dep124, 0x1.517de540ce2a3p126 }, + { 0x1.d36fb7fa50177p124, 0x1.4babff975a04cp126 }, + { 0x1.c9296beb09cf1p124, 0x1.45e99bcbb7915p126 }, + { 0x1.bf10f4a759889p124, 0x1.4036d0468a7a2p126 }, + { 0x1.b525d5198cb1cp124, 0x1.3a93b1998736cp126 }, + { 0x1.ab678f8eabedbp124, 0x1.35005285227f1p126 }, + { 0x1.a1d5a5c4edb96p124, 0x1.2f7cc3fe6f423p126 }, + { 0x1.986f98f9f96c8p124, 0x1.2a09153529381p126 }, + { 0x1.8f34e9f8f93a6p124, 0x1.24a55399ea239p126 }, + { 0x1.8625192879e39p124, 0x1.1f518ae487dc8p126 }, + { 0x1.7d3fa69816db5p124, 0x1.1a0dc51a9934dp126 }, + { 0x1.7484120df1b01p124, 0x1.14da0a961fd14p126 }, + { 0x1.6bf1db13f3983p124, 0x1.0fb6620c550afp126 }, + { 0x1.63888104d811ap124, 0x1.0aa2d09497f2bp126 }, + { 0x1.5b478318ff939p124, 0x1.059f59af7a906p126 }, + { 0x1.532e6073095f2p124, 0x1.00abff4dec7a3p126 }, + { 0x1.4b3c982c338c7p124, 0x1.f79183b101c5bp125 }, + { 0x1.4371a960807f8p124, 0x1.edeb406d9c825p125 }, + { 0x1.3bcd133aa0ffcp124, 0x1.e4652fadcb6b2p125 }, + { 0x1.344e54ffa23b9p124, 0x1.daff4969c0b04p125 }, + { 0x1.2cf4ee1a5f0fcp124, 0x1.d1b982c50137p125 }, + { 0x1.25c05e26b3f99p124, 0x1.c893ce1dcbef7p125 }, + { 0x1.1eb024fc75285p124, 0x1.bf8e1b1ca2279p125 }, + { 0x1.17c3c2ba26319p124, 0x1.b6a856c3ed54fp125 }, + { 0x1.10fab7cf72f94p124, 0x1.ade26b7fbed95p125 }, + { 0x1.0a548507696cp124, 0x1.a53c4135a6526p125 }, + { 0x1.03d0ab9273b94p124, 0x1.9cb5bd549b111p125 }, + { 0x1.fadd5a20258d3p123, 0x1.944ec2e4f563p125 }, + { 0x1.ee5c1730b147cp123, 0x1.8c07329874652p125 }, + { 0x1.e21c938a45a83p123, 0x1.83deeada4d25ap125 }, + { 0x1.d61dd57628999p123, 0x1.7bd5c7df3fe9cp125 }, + { 0x1.ca5ee4649e31fp123, 0x1.73eba3b5b07b7p125 }, + { 0x1.bedec8fddb34p123, 0x1.6c205655be72p125 }, + { 0x1.b39c8d3276d8ap123, 0x1.6473b5b15a7a1p125 }, + { 0x1.a8973c4b5c03ep123, 0x1.5ce595c455b0ap125 }, + { 0x1.9dcde2f93a207p123, 0x1.5575c8a468362p125 }, + { 0x1.933f8f6375f2cp123, 0x1.4e241e912c305p125 }, + { 0x1.88eb51369acb9p123, 0x1.46f066040a832p125 }, + { 0x1.7ed039b24c96bp123, 0x1.3fda6bc016994p125 }, + { 0x1.74ed5bb6bb581p123, 0x1.38e1fae1d6a9dp125 }, + { 0x1.6b41cbd198bc8p123, 0x1.3206dceef5f87p125 }, + { 0x1.61cca04a90795p123, 0x1.2b48d9e5dea1cp125 }, + { 0x1.588cf12f4446bp123, 0x1.24a7b84d38971p125 }, + { 0x1.4f81d85ecc55bp123, 0x1.1e233d434b813p125 }, + { 0x1.46aa7194bd324p123, 0x1.17bb2c8d41535p125 }, + { 0x1.3e05da73b4159p123, 
0x1.116f48a6476ccp125 }, + { 0x1.3593328f6abbep123, 0x1.0b3f52ce8c383p125 }, + { 0x1.2d519b7653e1ep123, 0x1.052b0b1a174eap125 }, + { 0x1.254038bac19d6p123, 0x1.fe6460fef468p124 }, + { 0x1.1d5e2ffb96d4p123, 0x1.f2a901ccafb37p124 }, + { 0x1.15aaa8ec85205p123, 0x1.e723726b824a9p124 }, + { 0x1.0e24cd5dd8846p123, 0x1.dbd32ac4c99bp124 }, + { 0x1.06cbc943d255ap123, 0x1.d0b7a0f921e7cp124 }, + { 0x1.ff3d957b29b39p122, 0x1.c5d0497c09e74p124 }, + { 0x1.f13a043742333p122, 0x1.bb1c972f23e5p124 }, + { 0x1.e38b43cbd0f0fp122, 0x1.b09bfb7d11a84p124 }, + { 0x1.d62fbdc2e756bp122, 0x1.a64de673e8837p124 }, + { 0x1.c925e02b41668p122, 0x1.9c31c6df3b1b8p124 }, + { 0x1.bc6c1da1f3121p122, 0x1.92470a61b6965p124 }, + { 0x1.b000ed5b4a626p122, 0x1.888d1d8e510a3p124 }, + { 0x1.a3e2cb2ae9edbp122, 0x1.7f036c0107294p124 }, + { 0x1.9810378b1f299p122, 0x1.75a96077274bap124 }, + { 0x1.8c87b7a37834fp122, 0x1.6c7e64e7281cbp124 }, + { 0x1.8147d54e9cc33p122, 0x1.6381e2980956bp124 }, + { 0x1.764f1f1f6ddeap122, 0x1.5ab342383d178p124 }, + { 0x1.6b9c28657041ap122, 0x1.5211ebf41880bp124 }, + { 0x1.612d893085125p122, 0x1.499d478bca735p124 }, + { 0x1.5701de53f4d2ep122, 0x1.4154bc68d75c3p124 }, + { 0x1.4d17c968d062bp122, 0x1.3937b1b31925ap124 }, + { 0x1.436df0cfabf1dp122, 0x1.31458e6542847p124 }, + { 0x1.3a02ffb1b7ceep122, 0x1.297db960e4f63p124 }, + { 0x1.30d5a6013afc5p122, 0x1.21df9981f8e53p124 }, + { 0x1.27e49879737d3p122, 0x1.1a6a95b1e786fp124 }, + { 0x1.1f2e909de04d2p122, 0x1.131e14fa1625dp124 }, + { 0x1.16b24cb8f8f92p122, 0x1.0bf97e95f2a64p124 }, + { 0x1.0e6e8fda56cf7p122, 0x1.04fc3a0481321p124 }, + { 0x1.066221d4539d8p122, 0x1.fc4b5e32d6259p123 }, + { 0x1.fd179e7243e3cp121, 0x1.eeea8c1b1db94p123 }, + { 0x1.edd4d2aec5adbp121, 0x1.e1d4cf1e2450ap123 }, + { 0x1.def98c6c79efap121, 0x1.d508f9a1ea64fp123 }, + { 0x1.d0838121f2418p121, 0x1.c885df3451a07p123 }, + { 0x1.c2706fa45005ep121, 0x1.bc4a54a84e834p123 }, + { 0x1.b4be201caa4b4p121, 0x1.b055303221015p123 }, + { 0x1.a76a63fc95c79p121, 0x1.a4a549829587ep123 }, + { 0x1.9a7315f1d6a55p121, 0x1.993979e14fffep123 }, + { 0x1.8dd619d943ca1p121, 0x1.8e109c4622913p123 }, + { 0x1.81915cb0e3323p121, 0x1.83298d717210ep123 }, + { 0x1.75a2d48946eb1p121, 0x1.78832c03aa2b1p123 }, + { 0x1.6a08807632262p121, 0x1.6e1c5893c380bp123 }, + { 0x1.5ec0687e8dcb2p121, 0x1.63f3f5c4de13bp123 }, + { 0x1.53c89d8bb3ddbp121, 0x1.5a08e85af27ep123 }, + { 0x1.491f395818f54p121, 0x1.505a174e9c929p123 }, + { 0x1.3ec25e5d5af12p121, 0x1.46e66be00224p123 }, + { 0x1.34b037c1bbfc5p121, 0x1.3dacd1a8d8ccep123 }, + { 0x1.2ae6f94510dd8p121, 0x1.34ac36ad8dafep123 }, + { 0x1.2164df2d29765p121, 0x1.2be38b6d92415p123 }, + { 0x1.18282e31ba3e8p121, 0x1.2351c2f2d1449p123 }, + { 0x1.0f2f3367cd6aap121, 0x1.1af5d2e04f3f6p123 }, + { 0x1.0678442cc256fp121, 0x1.12ceb37ff9bc3p123 }, + { 0x1.fc037c21c3622p120, 0x1.0adb5fcfa8c75p123 }, + { 0x1.eb940d8319831p120, 0x1.031ad58d56279p123 }, + { 0x1.db9f17e61c31p120, 0x1.f7182a851bca2p122 }, + { 0x1.cc218694238a2p120, 0x1.e85c449e377f3p122 }, + { 0x1.bd18548996419p120, 0x1.da0005e5f28dfp122 }, + { 0x1.ae808c479c371p120, 0x1.cc0180af00a8bp122 }, + { 0x1.a05747a543aa7p120, 0x1.be5ecd2fcb5f9p122 }, + { 0x1.9299afa0246a6p120, 0x1.b1160991ff737p122 }, + { 0x1.8544fc2c8c1dap120, 0x1.a4255a00b9f03p122 }, + { 0x1.785674053e8b9p120, 0x1.978ae8b55ce1bp122 }, + { 0x1.6bcb6c7ad4854p120, 0x1.8b44e6031383ep122 }, + { 0x1.5fa14942c3d54p120, 0x1.7f5188610ddc8p122 }, + { 0x1.53d57c461a5a7p120, 0x1.73af0c737bb45p122 }, + { 0x1.4865856ff632ap120, 0x1.685bb5134ef13p122 }, + { 0x1.3d4ef27bc49a6p120, 0x1.5d55cb54cd53ap122 }, 
+ { 0x1.328f5ec350e67p120, 0x1.529b9e8cf9a1ep122 }, + { 0x1.2824730cacbb4p120, 0x1.482b8455dc491p122 }, + { 0x1.1e0be557fa673p120, 0x1.3e03d891b37dep122 }, + { 0x1.144378ad22027p120, 0x1.3422fd6d12e2bp122 }, + { 0x1.0ac8fce979b96p120, 0x1.2a875b5ffab56p122 }, + { 0x1.019a4e8d69649p120, 0x1.212f612dee7fbp122 }, + { 0x1.f16aad1422a55p119, 0x1.181983e5133ddp122 }, + { 0x1.e030141df7d25p119, 0x1.0f443edc5ce49p122 }, + { 0x1.cf80d4afc3019p119, 0x1.06ae13b0d3255p122 }, + { 0x1.bf5908f50b4ap119, 0x1.fcab1483ea7fcp121 }, + { 0x1.afb4e269693dfp119, 0x1.ec72615a894c4p121 }, + { 0x1.a090a974cfebep119, 0x1.dcaf3691fc448p121 }, + { 0x1.91e8bd0830a74p119, 0x1.cd5ec93c12432p121 }, + { 0x1.83b9923a85f7bp119, 0x1.be7e5ac24963bp121 }, + { 0x1.75ffb3e6519ap119, 0x1.b00b38d6b3575p121 }, + { 0x1.68b7c2479902dp119, 0x1.a202bd6372dcep121 }, + { 0x1.5bde729a6b60fp119, 0x1.94624e78e0fafp121 }, + { 0x1.4f708eb9fba63p119, 0x1.87275e3a6869ep121 }, + { 0x1.436af4c058acbp119, 0x1.7a4f6aca256cbp121 }, + { 0x1.37ca96a6cd1d4p119, 0x1.6dd7fe335823p121 }, + { 0x1.2c8c79e6f04a3p119, 0x1.61beae53b72b7p121 }, + { 0x1.21adb71c70c75p119, 0x1.56011cc3b036dp121 }, + { 0x1.172b79a7a1181p119, 0x1.4a9cf6bda3f4cp121 }, + { 0x1.0d02ff50ce651p119, 0x1.3f8ff5042a88ep121 }, + { 0x1.033197ec68c0ep119, 0x1.34d7dbc76d7e5p121 }, + { 0x1.f3694a0008381p118, 0x1.2a727a89a3f14p121 }, + { 0x1.e11332d0714c5p118, 0x1.205dac02bd6b9p121 }, + { 0x1.cf5bf1fed1e7p118, 0x1.1697560347b26p121 }, + { 0x1.be3eb08ae7c2p118, 0x1.0d1d69569b82dp121 }, + { 0x1.adb6b810af9e2p118, 0x1.03ede1a45bfeep121 }, + { 0x1.9dbf721b98dfap118, 0x1.f60d8aa2a88f2p120 }, + { 0x1.8e54677bb0151p118, 0x1.e4cc4abf7d065p120 }, + { 0x1.7f713f9cc9784p118, 0x1.d4143a9dfe965p120 }, + { 0x1.7111bfdfb3cep118, 0x1.c3e1a5f5c077cp120 }, + { 0x1.6331caf57b5dbp118, 0x1.b430ecf4a83a8p120 }, + { 0x1.55cd603cc415p118, 0x1.a4fe83fb9db25p120 }, + { 0x1.48e09b21414bfp118, 0x1.9646f35a76624p120 }, + { 0x1.3c67b27d50fe7p118, 0x1.8806d70b2fc36p120 }, + { 0x1.305ef7fdbfb95p118, 0x1.7a3ade6c8b3e5p120 }, + { 0x1.24c2d787b9e37p118, 0x1.6cdfcbfc1e263p120 }, + { 0x1.198fd6a0ee7bdp118, 0x1.5ff2750fe782p120 }, + { 0x1.0ec293d9e6d85p118, 0x1.536fc18f7ce5cp120 }, + { 0x1.0457c63a9669p118, 0x1.4754abacdf1dcp120 }, + { 0x1.f49879624a021p117, 0x1.3b9e3f9d06e3fp120 }, + { 0x1.e139bb05eb49ep117, 0x1.30499b503957fp120 }, + { 0x1.ce8d4b7fd6c7p117, 0x1.2553ee2a336bfp120 }, + { 0x1.bc8d516fda8bap117, 0x1.1aba78ba3af89p120 }, + { 0x1.ab341ee553e25p117, 0x1.107a8c7323a6ep120 }, + { 0x1.9a7c305336484p117, 0x1.06918b6355624p120 }, + { 0x1.8a602b88919cp117, 0x1.f9f9cfd9c3035p119 }, + { 0x1.7adadead962edp117, 0x1.e77448fb66bb9p119 }, + { 0x1.6be73f45149fbp117, 0x1.d58da68fd117p119 }, + { 0x1.5d80693276a6dp117, 0x1.c4412bf4b8f0bp119 }, + { 0x1.4fa19dc42d409p117, 0x1.b38a3af2e55b4p119 }, + { 0x1.424642c28ff75p117, 0x1.a3645330550ffp119 }, + { 0x1.3569e18328604p117, 0x1.93cb11a30d765p119 }, + { 0x1.29082600643fdp117, 0x1.84ba3004a50dp119 }, + { 0x1.1d1cddf5a82dep117, 0x1.762d84469c18fp119 }, + { 0x1.11a3f7ffbbfeap117, 0x1.6821000795a03p119 }, + { 0x1.069982c189a9ep117, 0x1.5a90b00981d93p119 }, + { 0x1.f7f3581a4dc2cp116, 0x1.4d78bba8ca5fdp119 }, + { 0x1.e381802242163p116, 0x1.40d564548fad7p119 }, + { 0x1.cfd6511405b2dp116, 0x1.34a305080681fp119 }, + { 0x1.bcead7f01492fp116, 0x1.28de11c5031ebp119 }, + { 0x1.aab859b20ac9ep116, 0x1.1d83170fbf6fbp119 }, + { 0x1.993851cc9779ap116, 0x1.128eb96be8798p119 }, + { 0x1.886470ad946a7p116, 0x1.07fdb4dafea5fp119 }, + { 0x1.78369a4a2cbd6p116, 0x1.fb99b8b8279e1p118 }, + { 0x1.68a8e4b2fc8c2p116, 
0x1.e7f232d9e263p118 }, + { 0x1.59b596b012aaap116, 0x1.d4fed7195d7e8p118 }, + { 0x1.4b572664bd2dcp116, 0x1.c2b9cf7f893bfp118 }, + { 0x1.3d8837fb08d1dp116, 0x1.b11d702b3deb2p118 }, + { 0x1.30439c56dadf6p116, 0x1.a024365f771bdp118 }, + { 0x1.23844fd08cb93p116, 0x1.8fc8c794b03b5p118 }, + { 0x1.174578f6efd5dp116, 0x1.8005f08d6f1efp118 }, + { 0x1.0b826758a086bp116, 0x1.70d6a46e07ddap118 }, + { 0x1.003692548d98bp116, 0x1.6235fbd7a4345p118 }, + { 0x1.eabb2fe335196p115, 0x1.541f340697987p118 }, + { 0x1.d5e6777a83c2ap115, 0x1.468dadf4080abp118 }, + { 0x1.c1e6cb6239574p115, 0x1.397ced7af2b15p118 }, + { 0x1.aeb4423e690e7p115, 0x1.2ce898809244ep118 }, + { 0x1.9c47374a0974ep115, 0x1.20cc76202c5fbp118 }, + { 0x1.8a98484a1e8d3p115, 0x1.15246dda49d47p118 }, + { 0x1.79a0538dd4fc7p115, 0x1.09ec86c75d497p118 }, + { 0x1.695875fb574ap115, 0x1.fe41cd9bb4eeep117 }, + { 0x1.59ba0929261c5p115, 0x1.e97ba3b77f306p117 }, + { 0x1.4abea183bc47p115, 0x1.d57f524723822p117 }, + { 0x1.3c600c7f477c5p115, 0x1.c245d4b99847ap117 }, + { 0x1.2e984ed53e777p115, 0x1.afc85e0f82e12p117 }, + { 0x1.2161a2cd9d894p115, 0x1.9e005769dbc1dp117 }, + { 0x1.14b67693928cfp115, 0x1.8ce75e9f6f8ap117 }, + { 0x1.08916a956172p115, 0x1.7c7744d9378f7p117 }, + { 0x1.f9da9fde95755p114, 0x1.6caa0d3582fe9p117 }, + { 0x1.e38a4dc27b11bp114, 0x1.5d79eb71e893bp117 }, + { 0x1.ce283a9e3e33p114, 0x1.4ee1429bf7ccp117 }, + { 0x1.b9ab1a96e3b3ep114, 0x1.40daa3c89f5b6p117 }, + { 0x1.a609f7584d32bp114, 0x1.3360ccd23db3ap117 }, + { 0x1.933c2d52c56c9p114, 0x1.266ea71d4f71ap117 }, + { 0x1.8139690c0d187p114, 0x1.19ff4663ae9dfp117 }, + { 0x1.6ff9a4837fa43p114, 0x1.0e0de78654d1ep117 }, + { 0x1.5f7524a8e81a2p114, 0x1.0295ef6591848p117 }, + { 0x1.4fa476e59f668p114, 0x1.ef25d37f49fe1p116 }, + { 0x1.40806eb78e353p114, 0x1.da01102b5f851p116 }, + { 0x1.3202235dada5p114, 0x1.c5b5412dcafadp116 }, + { 0x1.2422ed95a3235p114, 0x1.b23a5a23e421p116 }, + { 0x1.16dc656a14df6p114, 0x1.9f8893d8fd1c1p116 }, + { 0x1.0a2860115569cp114, 0x1.8d986a4187285p116 }, + { 0x1.fc01dbb80c841p113, 0x1.7c629a822bc9ep116 }, + { 0x1.e4c0b066a497p113, 0x1.6be02102b352p116 }, + { 0x1.ce823f4cc4badp113, 0x1.5c0a378c90bcap116 }, + { 0x1.b93bf40d5eccbp113, 0x1.4cda5374ea275p116 }, + { 0x1.a4e3a125adc76p113, 0x1.3e4a23d1f4703p116 }, + { 0x1.916f7c5f2f764p113, 0x1.30538fbb77ecdp116 }, + { 0x1.7ed61b5d3db0ap113, 0x1.22f0b496539bep116 }, + { 0x1.6d0e7045988cbp113, 0x1.161be46ad3b5p116 }, + { 0x1.5c0fc68335b0cp113, 0x1.09cfa445b00ffp116 }, + { 0x1.4bd1bfa2aba3dp113, 0x1.fc0d55470cf51p115 }, + { 0x1.3c4c504792bf8p113, 0x1.e577bbcd49935p115 }, + { 0x1.2d77bd3a382bcp113, 0x1.cfd4a5adec5cp115 }, + { 0x1.1f4c988d02149p113, 0x1.bb1a9657ce465p115 }, + { 0x1.11c3bed8e716ap113, 0x1.a740684026555p115 }, + { 0x1.04d654905dadp113, 0x1.943d4a1d1ed39p115 }, + { 0x1.f0fb86d056745p112, 0x1.8208bc334a6a5p115 }, + { 0x1.d9676faafa27fp112, 0x1.709a8db59f25cp115 }, + { 0x1.c2e43d417197bp112, 0x1.5feada379d8b7p115 }, + { 0x1.ad664518e771bp112, 0x1.4ff207314a102p115 }, + { 0x1.98e25420092dap112, 0x1.40a8c1949f75ep115 }, + { 0x1.854daa4a49b0fp112, 0x1.3207fb7420eb9p115 }, + { 0x1.729df6503422ap112, 0x1.2408e9ba3327fp115 }, + { 0x1.60c95193c542dp112, 0x1.16a501f0e42cap115 }, + { 0x1.4fc63c27c71aep112, 0x1.09d5f819c9e29p115 }, + { 0x1.3f8b98f93052ap112, 0x1.fb2b792b40a22p114 }, + { 0x1.3010aa198de78p112, 0x1.e3bcf436a1a95p114 }, + { 0x1.214d0d298365p112, 0x1.cd55277c18d05p114 }, + { 0x1.1338b7e273194p112, 0x1.b7e94604479dcp114 }, + { 0x1.05cbf4be650abp112, 0x1.a36eec00926ddp114 }, + { 0x1.f1febf7a916aap111, 0x1.8fdc1b2dcf7b9p114 }, + { 
0x1.d997c68d65936p111, 0x1.7d2737527c3f9p114 }, + { 0x1.c2556a4e7a90fp111, 0x1.6b4702d7d5849p114 }, + { 0x1.ac2aa7516ade4p111, 0x1.5a329b7d30748p114 }, + { 0x1.970b05888fda2p111, 0x1.49e17724f4d41p114 }, + { 0x1.82ea92dbc1a27p111, 0x1.3a4b60ba9aa4ep114 }, + { 0x1.6fbdddeff308fp111, 0x1.2b6875310f785p114 }, + { 0x1.5d79f11e27f6bp111, 0x1.1d312098e9dbap114 }, + { 0x1.4c144d984e1b8p111, 0x1.0f9e1b4dd36dfp114 }, + { 0x1.3b82e6ba892a4p111, 0x1.02a8673a94692p114 }, + { 0x1.2bbc1d878d272p111, 0x1.ec929a665b449p113 }, + { 0x1.1cb6bc4eaa678p111, 0x1.d4f4b4c8e09edp113 }, + { 0x1.0e69f27a37df3p111, 0x1.be6abbb10a5aap113 }, + { 0x1.00cd508511266p111, 0x1.a8e8cc1fadef6p113 }, + { 0x1.e7b1882bccac5p110, 0x1.94637d5bacfdbp113 }, + { 0x1.cf09287e48bb9p110, 0x1.80cfdc72220cfp113 }, + { 0x1.b792bbc489b04p110, 0x1.6e2367dc27f95p113 }, + { 0x1.a140206ab945p110, 0x1.5c540b4936fd2p113 }, + { 0x1.8c03d2d39119bp110, 0x1.4b581b8d170fcp113 }, + { 0x1.77d0e6e5bed21p110, 0x1.3b2652b06c2b2p113 }, + { 0x1.649b01d73110ap110, 0x1.2bb5cc22e5db6p113 }, + { 0x1.525654343aad2p110, 0x1.1cfe010e2052dp113 }, + { 0x1.40f79420887c7p110, 0x1.0ef6c4c84a0fep113 }, + { 0x1.3073f7cff4a85p110, 0x1.01984165a5f36p113 }, + { 0x1.20c1303550f0ep110, 0x1.e9b5e8d00ce77p112 }, + { 0x1.11d563e54f40ep110, 0x1.d16f5716c6c1ap112 }, + { 0x1.03a72a2bbdc06p110, 0x1.ba4f035d60e03p112 }, + { 0x1.ec5b0ca2b20f5p109, 0x1.a447b7b03f045p112 }, + { 0x1.d2bfc6210880ap109, 0x1.8f4ccca7fc90dp112 }, + { 0x1.ba6c1c6e87c4p109, 0x1.7b5223dac7336p112 }, + { 0x1.a35068e9c89cfp109, 0x1.684c227fcacefp112 }, + { 0x1.8d5dbaa383b98p109, 0x1.562fac4329b48p112 }, + { 0x1.7885ce9f67cdbp109, 0x1.44f21e49054f2p112 }, + { 0x1.64bb0863504ddp109, 0x1.34894a5e24657p112 }, + { 0x1.51f06ad20e4c3p109, 0x1.24eb7254ccf83p112 }, + { 0x1.4019914f0b53ap109, 0x1.160f438c70913p112 }, + { 0x1.2f2aa92823e8p109, 0x1.07ebd2a2d2844p112 }, + { 0x1.1f186b432c98bp109, 0x1.f4f12e9ab070ap111 }, + { 0x1.0fd8160ca94ap109, 0x1.db5ad0b27805cp111 }, + { 0x1.015f67a552924p109, 0x1.c304efa2c6f4ep111 }, + { 0x1.e749309831666p108, 0x1.abe09e9144b5ep111 }, + { 0x1.cd3caa04cdd1bp108, 0x1.95df988e76644p111 }, + { 0x1.b48774d0f8e45p108, 0x1.80f439b4ee04bp111 }, + { 0x1.9d189f9f85cbfp108, 0x1.6d11788a69c64p111 }, + { 0x1.86e0050236315p108, 0x1.5a2adfa0b4bc4p111 }, + { 0x1.71ce426a561d3p108, 0x1.4834877429b8fp111 }, + { 0x1.5dd4af79906a9p108, 0x1.37231085c7d9ap111 }, + { 0x1.4ae555af52cdfp108, 0x1.26eb9daed6f7ep111 }, + { 0x1.38f2e86f38216p108, 0x1.1783ceac2891p111 }, + { 0x1.27f0bd5d0e6b1p108, 0x1.08e1badf0fcedp111 }, + { 0x1.17d2c50b2bfafp108, 0x1.f5f7d88472604p110 }, + { 0x1.088d83f7e4069p108, 0x1.db92b5212fb8dp110 }, + { 0x1.f42c17ae0ebf6p107, 0x1.c282cd3957edap110 }, + { 0x1.d8c3ea48f2889p107, 0x1.aab7abace48dcp110 }, + { 0x1.beceb1f9f5b3dp107, 0x1.94219bfcb4928p110 }, + { 0x1.a6399674d366bp107, 0x1.7eb1a2075864ep110 }, + { 0x1.8ef2a9a18d857p107, 0x1.6a597219a93dap110 }, + { 0x1.78e8dcd2e6bfdp107, 0x1.570b69502f313p110 }, + { 0x1.640bf6745325ep107, 0x1.44ba864670882p110 }, + { 0x1.504c882a97424p107, 0x1.335a62115bce2p110 }, + { 0x1.3d9be56279ee9p107, 0x1.22df298214423p110 }, + { 0x1.2bec1a4917edbp107, 0x1.133d96ae7e0ddp110 }, + { 0x1.1b2fe32991d5cp107, 0x1.046aeabcfcdecp110 }, + { 0x1.0b5aa42bf5054p107, 0x1.ecb9cfe1d8642p109 }, + { 0x1.f8c0c2e2ce8dep106, 0x1.d21397ead99cbp109 }, + { 0x1.dc6b6f1384e18p106, 0x1.b8d094c86d374p109 }, + { 0x1.c19fa87de37fbp106, 0x1.a0df0f0c626dcp109 }, + { 0x1.a848df650bea7p106, 0x1.8a2e269750a39p109 }, + { 0x1.90538b942ea7cp106, 0x1.74adc8f4064d3p109 }, + { 0x1.79ad1fce5b3d8p106, 
0x1.604ea819f007cp109 }, + { 0x1.6443fdcf0c327p106, 0x1.4d0231928c6f9p109 }, + { 0x1.50076ad55cc39p106, 0x1.3aba85fe22e2p109 }, + { 0x1.3ce784b411931p106, 0x1.296a70f414053p109 }, + { 0x1.2ad53760d7287p106, 0x1.1905613b3abf2p109 }, + { 0x1.19c232fd50b88p106, 0x1.097f6156f32c5p109 }, + { 0x1.09a0e254c75ep106, 0x1.f59a20caf6695p108 }, + { 0x1.f4c8c392fb944p105, 0x1.d9c73698fb1dcp108 }, + { 0x1.d800ed59bd026p105, 0x1.bf716c6168baep108 }, + { 0x1.bcd30dfbd611bp105, 0x1.a6852c6b58392p108 }, + { 0x1.a32923130213fp105, 0x1.8eefd70594a89p108 }, + { 0x1.8aee4cd06ec1bp105, 0x1.789fb715aae95p108 }, + { 0x1.740ebfab80eb4p105, 0x1.6383f726a8e04p108 }, + { 0x1.5e77b6bbd2127p105, 0x1.4f8c96f26a26ap108 }, + { 0x1.4a1766b6e5e8ap105, 0x1.3caa61607f92p108 }, + { 0x1.36dcf18a6465cp105, 0x1.2acee2f5ecdb8p108 }, + { 0x1.24b85a8bf0124p105, 0x1.19ec60b1242edp108 }, + { 0x1.139a7b37f8475p105, 0x1.09f5cf4dd2877p108 }, + { 0x1.0374f8792ca97p105, 0x1.f5bd95d8730d8p107 }, + { 0x1.e87470e4f4246p104, 0x1.d9371e2ff7c35p107 }, + { 0x1.cbbab18b73217p104, 0x1.be41de54d155ap107 }, + { 0x1.b0a44aa2f067ep104, 0x1.a4c89e08ef4f3p107 }, + { 0x1.971a1ec0f40c7p104, 0x1.8cb738399b12cp107 }, + { 0x1.7f064a8ba8323p104, 0x1.75fa8dbc84becp107 }, + { 0x1.685414c16188ep104, 0x1.608078a70dcbcp107 }, + { 0x1.52efdf060cd2p104, 0x1.4c37c0394d094p107 }, + { 0x1.3ec7176d784b5p104, 0x1.39100d5687bfep107 }, + { 0x1.2bc82ab9d2302p104, 0x1.26f9df8519bd7p107 }, + { 0x1.19e277461404p104, 0x1.15e6827001f18p107 }, + { 0x1.090640946d2d5p104, 0x1.05c803e4831c1p107 }, + { 0x1.f24946f22d5aep103, 0x1.ed22548cffd35p106 }, + { 0x1.d45f15b49b35ep103, 0x1.d06ad6ecdf971p106 }, + { 0x1.b83349fd05191p103, 0x1.b551c847fbc96p106 }, + { 0x1.9dacb2c432ef4p103, 0x1.9bc09f112b494p106 }, + { 0x1.84b37e1cbf8ebp103, 0x1.83a1ff0aa239dp106 }, + { 0x1.6d3126d74b6ccp103, 0x1.6ce1aa3fd7bddp106 }, + { 0x1.5710631158bffp103, 0x1.576c72b514859p106 }, + { 0x1.423d13a3b73e1p103, 0x1.43302cc4a0da8p106 }, + { 0x1.2ea43465e3995p103, 0x1.301ba221dc9bbp106 }, + { 0x1.1c33cd3c37addp103, 0x1.1e1e857adc568p106 }, + { 0x1.0adae3e73c2b5p103, 0x1.0d2966b1746f7p106 }, + { 0x1.f512dd15b73b7p102, 0x1.fa5b4f49cc6b2p105 }, + { 0x1.d6608dc942687p102, 0x1.dc3ae30b55c16p105 }, + { 0x1.b9823c51276e1p102, 0x1.bfd7555a3bd68p105 }, + { 0x1.9e5ce2f93dd76p102, 0x1.a517d9e61628ap105 }, + { 0x1.84d6fe15b6b93p102, 0x1.8be4f8f6c951fp105 }, + { 0x1.6cd87746bc76bp102, 0x1.74287ded49339p105 }, + { 0x1.564a91cd221fp102, 0x1.5dcd669f2cd34p105 }, + { 0x1.4117d7e2c667dp102, 0x1.48bfd38302871p105 }, + { 0x1.2d2c0909ebeb9p102, 0x1.34ecf8a3c124ap105 }, + { 0x1.1a7409475f2f9p102, 0x1.22430f521cbcfp105 }, + { 0x1.08ddd13bd35e7p102, 0x1.10b1488aeb235p105 }, + { 0x1.f0b0be22d18e8p101, 0x1.0027c00a263a6p105 }, + { 0x1.d1a75065a8c74p101, 0x1.e12ee004efc37p104 }, + { 0x1.b48117843c1c7p101, 0x1.c3e44ae32b16bp104 }, + { 0x1.99218b8ac7f8ep101, 0x1.a854ea14102a8p104 }, + { 0x1.7f6dc6010b4adp101, 0x1.8e6761569f45dp104 }, + { 0x1.674c6ae60d852p101, 0x1.7603bac345f65p104 }, + { 0x1.50a592e3c968ep101, 0x1.5f1353cdad001p104 }, + { 0x1.3b62b6aafb0c8p101, 0x1.4980cb3c80949p104 }, + { 0x1.276e9b681072fp101, 0x1.3537f00b6ad4dp104 }, + { 0x1.14b54042f445bp101, 0x1.2225b12bffc68p104 }, + { 0x1.0323ccdc1a3dcp101, 0x1.10380e1adb7e9p104 }, + { 0x1.e5510173b9a5p100, 0x1.febc107d5efaap103 }, + { 0x1.c6654733b86adp100, 0x1.df0f2a0ee6947p103 }, + { 0x1.a964ed354f984p100, 0x1.c14b2188bcee4p103 }, + { 0x1.8e324c651b064p100, 0x1.a553644f7f07dp103 }, + { 0x1.74b179d1eba81p100, 0x1.8b0cfce0579ep103 }, + { 0x1.5cc82d9070d95p100, 0x1.725e7c5dd20f7p103 }, + 
{ 0x1.465daafca8b1dp100, 0x1.5b2fe547a134p103 }, + { 0x1.315aaa46df48ep100, 0x1.456a974e92e93p103 }, + { 0x1.1da9433aebbcfp100, 0x1.30f93c3699078p103 }, + { 0x1.0b34d93135fcp100, 0x1.1dc7b5b978cf8p103 }, + { 0x1.f3d41033c44ccp99, 0x1.0bc30c5d52f15p103 }, + { 0x1.d36d25268cd2bp99, 0x1.f5b2be65a0c7fp102 }, + { 0x1.b512a1fb1d8fcp99, 0x1.d5f3a8dea7357p102 }, + { 0x1.98a442fc4fc15p99, 0x1.b82915b03515bp102 }, + { 0x1.7e03b1cc6d738p99, 0x1.9c3517e789488p102 }, + { 0x1.651468e010b8ap99, 0x1.81fb7df06136ep102 }, + { 0x1.4dbb989001d84p99, 0x1.6961b8d641d06p102 }, + { 0x1.37e00dac4e8b5p99, 0x1.524ec4d916caep102 }, + { 0x1.236a197bf0b9ap99, 0x1.3cab1343d18d1p102 }, + { 0x1.10437b1569d7ep99, 0x1.2860757487a01p102 }, + { 0x1.fcae93fb7323cp98, 0x1.155a09065d4f7p102 }, + { 0x1.db23c3f816f92p98, 0x1.0384250e4c9fcp102 }, + { 0x1.bbc1a022c14d4p98, 0x1.e59890b926c78p101 }, + { 0x1.9e658108af2ep98, 0x1.c642116a8a9e3p101 }, + { 0x1.82eedbe410407p98, 0x1.a8e405e651ab6p101 }, + { 0x1.693f22ab61ce9p98, 0x1.8d5f98114f872p101 }, + { 0x1.5139a5f3661fbp98, 0x1.7397c5a66e307p101 }, + { 0x1.3ac3788a1b429p98, 0x1.5b71456c5a4c4p101 }, + { 0x1.25c354b26cb4ep98, 0x1.44d26de513197p101 }, + { 0x1.122182e9a270fp98, 0x1.2fa31d6371537p101 }, + { 0x1.ff8f84418d51p97, 0x1.1bcca373b7b43p101 }, + { 0x1.dd4262aac53e8p97, 0x1.0939ab853339fp101 }, + { 0x1.bd3474ec16ca5p97, 0x1.efac5187b2863p100 }, + { 0x1.9f40fd0082b72p97, 0x1.cf1e86235d0e7p100 }, + { 0x1.8345858c4438dp97, 0x1.b0a68a2128babp100 }, + { 0x1.6921be96b86b1p97, 0x1.9423165bc4444p100 }, + { 0x1.50b75c536f927p97, 0x1.7974e743dea3dp100 }, + { 0x1.39e9f7dcbe479p97, 0x1.607e9eacd105p100 }, + { 0x1.249ef1c3be817p97, 0x1.4924a74dec729p100 }, + { 0x1.10bd565b35393p97, 0x1.334d19e0c216p100 }, + { 0x1.fc5b8748842b2p96, 0x1.1edfa3c5f5ccap100 }, + { 0x1.d9b4a18a38642p96, 0x1.0bc56f1b54701p100 }, + { 0x1.b95cede6d524bp96, 0x1.f3d2185e047d9p99 }, + { 0x1.9b2df77a02225p96, 0x1.d26cb87945e87p99 }, + { 0x1.7f03b935e8e3ap96, 0x1.b334fac4b9f99p99 }, + { 0x1.64bc777824f0ep96, 0x1.96076f7918d1cp99 }, + { 0x1.4c389be9acb83p96, 0x1.7ac2d72fc2c63p99 }, + { 0x1.355a9387de78cp96, 0x1.614801550319ep99 }, + { 0x1.2006aeb6bc768p96, 0x1.4979ac8b28927p99 }, + { 0x1.0c23033e2a376p96, 0x1.333c68e2d0548p99 }, + { 0x1.f32ea02b55d23p95, 0x1.1e767bce37dd7p99 }, + { 0x1.d099c5c770f5ap95, 0x1.0b0fc5b6d05ap99 }, + { 0x1.b05cfe2e99435p95, 0x1.f1e3523b41d7dp98 }, + { 0x1.92508d0743fc9p95, 0x1.d00de6608effep98 }, + { 0x1.764f46cf19f9cp95, 0x1.b0778b7b3301bp98 }, + { 0x1.5c36679625a01p95, 0x1.92fb04ec0f6cfp98 }, + { 0x1.43e56c3e340a7p95, 0x1.77756ec9f78fap98 }, + { 0x1.2d3dee1869201p95, 0x1.5dc61922d5a06p98 }, + { 0x1.182380bd2f494p95, 0x1.45ce65699ff6dp98 }, + { 0x1.047b91fcb6491p95, 0x1.2f71a5f15997p98 }, + { 0x1.e45a9790460c1p94, 0x1.1a94ff571654fp98 }, + { 0x1.c242efeaca76p94, 0x1.071f4bbea09ecp98 }, + { 0x1.a284cb82c31cep94, 0x1.e9f1ff8ddd774p97 }, + { 0x1.84f7a1eb7f7f3p94, 0x1.c818223a202c7p97 }, + { 0x1.697595326d7dcp94, 0x1.a887bd2b4404dp97 }, + { 0x1.4fdb462549af1p94, 0x1.8b1a336c5eb6bp97 }, + { 0x1.3807ab51436a8p94, 0x1.6fab63324088ap97 }, + { 0x1.21dbea9108398p94, 0x1.56197e30205bap97 }, + { 0x1.0d3b35021d695p94, 0x1.3e44e45301b92p97 }, + { 0x1.f4154a787cc1bp93, 0x1.281000bfe4c3fp97 }, + { 0x1.d0623f4f4a28fp93, 0x1.135f28f2d50b4p97 }, + { 0x1.af2e69a26261p93, 0x1.00187dded5975p97 }, + { 0x1.904e0b3aa82a3p93, 0x1.dc479de0ef001p96 }, + { 0x1.73985278fa30ep93, 0x1.bad4fdad3caa1p96 }, + { 0x1.58e7298af87d9p93, 0x1.9baed3ed27ab8p96 }, + { 0x1.401708b7e64c6p93, 0x1.7ead9ce4285bbp96 }, + { 0x1.2906cb94eb40dp93, 
0x1.63ac6b4edc88ep96 }, + { 0x1.139788f2dd663p93, 0x1.4a88be2a6390cp96 }, + { 0x1.ff58dab4f2a79p92, 0x1.332259185f1ap96 }, + { 0x1.da552fdd03043p92, 0x1.1d5b1f3793044p96 }, + { 0x1.b7f1f31b571b6p92, 0x1.0916f04b6e18bp96 }, + { 0x1.98006c2117e39p92, 0x1.ec77101de6926p95 }, + { 0x1.7a550f03b145bp92, 0x1.c960bf23153ep95 }, + { 0x1.5ec74662c5961p92, 0x1.a8bd20fc65ef7p95 }, + { 0x1.453141082302ap92, 0x1.8a61745ec7d1dp95 }, + { 0x1.2d6fc2c9e8bcp92, 0x1.6e25d0e756261p95 }, + { 0x1.1761f87a6dc3dp92, 0x1.53e4f7d1666cbp95 }, + { 0x1.02e94eb4ac8a5p92, 0x1.3b7c27a7ddb0ep95 }, + { 0x1.dfd296adef82ap91, 0x1.24caf2c32af14p95 }, + { 0x1.bc8ed301215ebp91, 0x1.0fb3186804d0fp95 }, + { 0x1.9bd5efd2c0f15p91, 0x1.f830c0bb41fd7p94 }, + { 0x1.7d79f2db2d4a5p91, 0x1.d3c0f1a91c846p94 }, + { 0x1.61500f5293f06p91, 0x1.b1e5acf351d87p94 }, + { 0x1.47306f04df3d6p91, 0x1.92712d259ce66p94 }, + { 0x1.2ef5ff0323b28p91, 0x1.7538c60a04476p94 }, + { 0x1.187e3fb74914dp91, 0x1.5a14b04b47879p94 }, + { 0x1.03a918225a966p91, 0x1.40dfd87456f4cp94 }, + { 0x1.e0b15822be4ep90, 0x1.2977b1172b9d5p94 }, + { 0x1.bce26a2fb7176p90, 0x1.13bc07e891491p94 }, + { 0x1.9bb1bc445c3c6p90, 0x1.ff1dbb4300811p93 }, + { 0x1.7cef42e9a617dp90, 0x1.d9a880f306bd8p93 }, + { 0x1.606e51e0a4963p90, 0x1.b6e45220b55ep93 }, + { 0x1.460560e841d79p90, 0x1.96a0b33f2c4dap93 }, + { 0x1.2d8dd47a40ad8p90, 0x1.78b07e9e924acp93 }, + { 0x1.16e3ca3d4393fp90, 0x1.5ce9ab1670dd2p93 }, + { 0x1.01e5e8edda47bp90, 0x1.4325167006bbp93 }, + { 0x1.dcea670907819p89, 0x1.2b3e53538ff3fp93 }, + { 0x1.b8e9bec48816dp89, 0x1.15137a7f44864p93 }, + { 0x1.97945aa1c9c35p89, 0x1.0084ff125639dp93 }, + { 0x1.78b88a4e7107bp89, 0x1.daeb0b7311ec7p92 }, + { 0x1.5c2827c986b62p89, 0x1.b7937d1c40c53p92 }, + { 0x1.41b858361b0fep89, 0x1.96d082f59ab06p92 }, + { 0x1.294150fb19119p89, 0x1.7872d9fa10aadp92 }, + { 0x1.129e20e732adcp89, 0x1.5c4e8e37bc7dp92 }, + { 0x1.fb58fa290d436p88, 0x1.423ac0df49a4p92 }, + { 0x1.d499229819bc6p88, 0x1.2a117230ad284p92 }, + { 0x1.b0c1a759f7739p88, 0x1.13af4f04f9998p92 }, + { 0x1.8f9bb6c075486p88, 0x1.fde703724e56p91 }, + { 0x1.70f4744735c2bp88, 0x1.d77f0c82e7641p91 }, + { 0x1.549cb0f7ef8e2p88, 0x1.b3ee02611d7ddp91 }, + { 0x1.3a68a8c1234e1p88, 0x1.92ff33023d5bdp91 }, + { 0x1.222fc469e8b8cp88, 0x1.7481a9e69f53fp91 }, + { 0x1.0bcc5fd30f1ddp88, 0x1.5847eda620959p91 }, + { 0x1.ee3728761897bp87, 0x1.3e27c1fcc74bdp91 }, + { 0x1.c7fa0c7e3bac7p87, 0x1.25f9ee0b923dcp91 }, + { 0x1.a4a56eb132a54p87, 0x1.0f9a0686532p91 }, + { 0x1.8401b5336a8ap87, 0x1.f5cc7718082bp90 }, + { 0x1.65db58e2358c1p87, 0x1.cf7e53d6a2ca5p90 }, + { 0x1.4a029a7ea7cd1p87, 0x1.ac0f5f3229372p90 }, + { 0x1.304b3d1961171p87, 0x1.8b498644847eap90 }, + { 0x1.188c45630dc53p87, 0x1.6cfa9bcca59dcp90 }, + { 0x1.029fbd8b92835p87, 0x1.50f411d4fd2cdp90 }, + { 0x1.dcc4fabf32f1cp86, 0x1.370ab8327af5ep90 }, + { 0x1.b767ecb334a7ep86, 0x1.1f167f88c6b6ep90 }, + { 0x1.94ec06c0ff29fp86, 0x1.08f24085d4597p90 }, + { 0x1.751977e5803d3p86, 0x1.e8f70e181d61ap89 }, + { 0x1.57bc950253825p86, 0x1.c324c20e337dcp89 }, + { 0x1.3ca58b816a87fp86, 0x1.a03261574b54ep89 }, + { 0x1.23a8197d2607ep86, 0x1.7fe903cdf5855p89 }, + { 0x1.0c9b4b0a6a16fp86, 0x1.6215c58da345p89 }, + { 0x1.eeb27891d2bb3p85, 0x1.46897d4b69fc6p89 }, + { 0x1.c77dbfc848866p85, 0x1.2d1877d731b7bp89 }, + { 0x1.a357936adf17bp85, 0x1.159a386b11517p89 }, + { 0x1.8203fa7992554p85, 0x1.ffd27ae9393cep88 }, + { 0x1.634b7f56b0a5cp85, 0x1.d7c593130dd0bp88 }, + { 0x1.46fada7e6a5fep85, 0x1.b2cd607c79bcfp88 }, + { 0x1.2ce2a3690576bp85, 0x1.90ae4d3405651p88 }, + { 0x1.14d707280e6cfp85, 
0x1.71312dd1759e2p88 }, + { 0x1.fd5f08ad2b29ap84, 0x1.5422ef5d8949dp88 }, + { 0x1.d48d57f7718b7p84, 0x1.39544b0ecc957p88 }, + { 0x1.aef3ce0add578p84, 0x1.20997f73e73ddp88 }, + { 0x1.8c52800f939c8p84, 0x1.09ca0eaacd277p88 }, + { 0x1.6c6e61e57bf9bp84, 0x1.e9810295890ecp87 }, + { 0x1.4f10e8ebc44a9p84, 0x1.c2b45b5aa4a1dp87 }, + { 0x1.3407b59d72a5bp84, 0x1.9eee068fa7596p87 }, + { 0x1.1b2443858c0a1p84, 0x1.7df2b399c10a8p87 }, + { 0x1.043b9f1621ff3p84, 0x1.5f8b87a31bd85p87 }, + { 0x1.de4c41eb96b45p83, 0x1.4385c96e9a2d9p87 }, + { 0x1.b77e5cbd5d147p83, 0x1.29b2933ef4cbcp87 }, + { 0x1.93c9fc62bfb11p83, 0x1.11e68a6378f8ap87 }, + { 0x1.72f0c4c8e9bffp83, 0x1.f7f338086a86bp86 }, + { 0x1.54b92affb11afp83, 0x1.cf8d7d9ce040ap86 }, + { 0x1.38ee17b150182p83, 0x1.aa577251ae485p86 }, + { 0x1.1f5e908f70e0cp83, 0x1.8811d739efb5fp86 }, + { 0x1.07dd6833bb38p83, 0x1.68823e52970bep86 }, + { 0x1.e481e7f6ac4bcp82, 0x1.4b72ae68e8b4cp86 }, + { 0x1.bcc58edad5559p82, 0x1.30b14dbe876bcp86 }, + { 0x1.983ee9896d582p82, 0x1.181012ef8661p86 }, + { 0x1.76aca47764427p82, 0x1.01647ba798745p86 }, + { 0x1.57d287836bd3dp82, 0x1.d90e917701675p85 }, + { 0x1.3b79118c097a1p82, 0x1.b2a87e86d0c8ap85 }, + { 0x1.216d1b97279a9p82, 0x1.8f53dcb377293p85 }, + { 0x1.097f82fc04025p82, 0x1.6ed2f2515e933p85 }, + { 0x1.e709b415656dp81, 0x1.50ecc9ed47f19p85 }, + { 0x1.beaa3d6c15504p81, 0x1.356cd5ce7799ep85 }, + { 0x1.9996ed9b83967p81, 0x1.1c229a587ab78p85 }, + { 0x1.778be2bd9795bp81, 0x1.04e15ecc7f3f6p85 }, + { 0x1.584a99af8a842p81, 0x1.deffc7e6a6017p84 }, + { 0x1.3b99832cbefddp81, 0x1.b7b040832f31p84 }, + { 0x1.2143a112d0466p81, 0x1.938e021f36d76p84 }, + { 0x1.09182b326b229p81, 0x1.7258610b3b233p84 }, + { 0x1.e5d47637f5db5p80, 0x1.53d3bfc82a909p84 }, + { 0x1.bd20fcc3b76d7p80, 0x1.37c92babdc2fdp84 }, + { 0x1.97c9dda748fc7p80, 0x1.1e06010120f6ap84 }, + { 0x1.7589207e91ad1p80, 0x1.065b9616170d4p84 }, + { 0x1.561e669aa7fdbp80, 0x1.e13dd96b3753bp83 }, + { 0x1.394e7a2ac9fc7p80, 0x1.b950d32467392p83 }, + { 0x1.1ee2e61eccc99p80, 0x1.94a72263259a5p83 }, + { 0x1.06a996198f06fp80, 0x1.72fd93e036cdcp83 }, + { 0x1.e0e8fbad2703ep79, 0x1.54164576929abp83 }, + { 0x1.b8328ee330ae9p79, 0x1.37b83c521fe96p83 }, + { 0x1.92e21013a767p79, 0x1.1daf033182e96p83 }, + { 0x1.70aff489136ebp79, 0x1.05ca50205d26ap83 }, + { 0x1.515a7c77fab48p79, 0x1.dfbb6235639fap82 }, + { 0x1.34a53ce0bbb6fp79, 0x1.b7807e294781fp82 }, + { 0x1.1a58b2b09fdcbp79, 0x1.9298add70a734p82 }, + { 0x1.0241de6c31e5bp79, 0x1.70beaf9c7ffb6p82 }, + { 0x1.d863cf753825cp78, 0x1.51b2cd6709222p82 }, + { 0x1.affb906d0ae09p78, 0x1.353a6cf7f7fffp82 }, + { 0x1.8afbf9e9520c2p78, 0x1.1b1fa8cbe84a7p82 }, + { 0x1.691c7c768becep78, 0x1.0330f0fd69921p82 }, + { 0x1.4a1a79df39cdep78, 0x1.da81670f96f9bp81 }, + { 0x1.2db8ca9009091p78, 0x1.b24a16b4d09aap81 }, + { 0x1.13bf4cb384e4ap78, 0x1.8d6eeb6efdbd6p81 }, + { 0x1.f7f4f88751db4p77, 0x1.6ba91ac734786p81 }, + { 0x1.cc7626bced452p77, 0x1.4cb7966770ab5p81 }, + { 0x1.a4ab6470c1c5cp77, 0x1.305e9721d0981p81 }, + { 0x1.80451c2811052p77, 0x1.1667311fff70ap81 }, + { 0x1.5efa4d64f59f6p77, 0x1.fd3de10d62855p80 }, + { 0x1.40880373ed74p77, 0x1.d1aefbcd48d0cp80 }, + { 0x1.24b0d7368076ep77, 0x1.a9cc93c25aca9p80 }, + { 0x1.0b3c7b0d960fp77, 0x1.85487ee3ea735p80 }, + { 0x1.e7eea02e4ed88p76, 0x1.63daf8b4b1e0cp80 }, + { 0x1.bd6408059b696p76, 0x1.45421e69a6ca1p80 }, + { 0x1.96826d9e90341p76, 0x1.294175802d99ap80 }, + { 0x1.72fa4fa12d516p76, 0x1.0fa17bf41068fp80 }, + { 0x1.5282d2d5803fep76, 0x1.f05e82aae2bb9p79 }, + { 0x1.34d935f1be064p76, 0x1.c578101b29058p79 }, + { 0x1.19c050c56d0d7p76, 
0x1.9e39dc5dd2f7cp79 }, + { 0x1.01001dd9c7ccep76, 0x1.7a553a728bbf2p79 }, + { 0x1.d4ca9b634ecbap75, 0x1.5982008db1304p79 }, + { 0x1.ab81c5c80cf39p75, 0x1.3b7e00422e51bp79 }, + { 0x1.85cfacb7477f2p75, 0x1.200c898d9ee3ep79 }, + { 0x1.6365862923eb9p75, 0x1.06f5f7eb65a56p79 }, + { 0x1.43fb317b5dc37p75, 0x1.e00e9148a1d25p78 }, + { 0x1.274ea96044bd7p75, 0x1.b623734024e92p78 }, + { 0x1.0d23817479c67p75, 0x1.8fd4e01891bf8p78 }, + { 0x1.ea84dd159259p74, 0x1.6cd44c7470d89p78 }, + { 0x1.bef1b1a12823ep74, 0x1.4cd9c04158cd7p78 }, + { 0x1.9730edfda64acp74, 0x1.2fa34bf5c8344p78 }, + { 0x1.72ede3b7eaa25p74, 0x1.14f4890ff2461p78 }, + { 0x1.51db1ec3a3087p74, 0x1.f92c49dfa4df5p77 }, + { 0x1.33b1c9d1576ecp74, 0x1.ccaaea71ab0dfp77 }, + { 0x1.18311f8a03acap74, 0x1.a40829f001197p77 }, + { 0x1.fe3bcf4629feap73, 0x1.7eef13b59e96cp77 }, + { 0x1.d083fda665164p73, 0x1.5d11e1a252bf5p77 }, + { 0x1.a6d7d18831888p73, 0x1.3e296303b2297p77 }, + { 0x1.80dcd6603df1bp73, 0x1.21f47009f43cep77 }, + { 0x1.5e4062d5b6a4ep73, 0x1.083768c5e4542p77 }, + { 0x1.3eb6ef47c2758p73, 0x1.e1777d831265fp76 }, + { 0x1.21fb7a81c5444p73, 0x1.b69f10b0191b5p76 }, + { 0x1.07cefb734d68bp73, 0x1.8f8a3a05b5b53p76 }, + { 0x1.dfefbdb19ac7ep72, 0x1.6be573c40c8e7p76 }, + { 0x1.b4831fb12344p72, 0x1.4b645ba991fdbp76 }, + { 0x1.8cf81557d20b6p72, 0x1.2dc119095729fp76 }, + { 0x1.68f6f0feb4755p72, 0x1.12bbcfa4d62dep76 }, + { 0x1.482fa78c40635p72, 0x1.f4343c7d504b9p75 }, + { 0x1.2a59289a484fbp72, 0x1.c74d4fe1e0e8bp75 }, + { 0x1.0f30c4d0be5cp72, 0x1.9e614ecbf4af6p75 }, + { 0x1.ecf3428c48d4fp71, 0x1.791716475420cp75 }, + { 0x1.bff86d9ec8499p71, 0x1.571d34563050ap75 }, + { 0x1.970bb87f4ae14p71, 0x1.3829407a207d8p75 }, + { 0x1.71d0b55b79b86p71, 0x1.1bf74244aed5ap75 }, + { 0x1.4ff315d036fbdp71, 0x1.024924c7520d1p75 }, + { 0x1.3125f6a3d257p71, 0x1.d5cc6ba567f29p74 }, + { 0x1.15233ae8815f2p71, 0x1.ab3560167ccaap74 }, + { 0x1.f755ea760487dp70, 0x1.846e9dda7a163p74 }, + { 0x1.c905bbd9ab5a6p70, 0x1.6121d7db32bddp74 }, + { 0x1.9eebaa0589b4ep70, 0x1.410047ead6894p74 }, + { 0x1.78a6de0f41b89p70, 0x1.23c2090cdde78p74 }, + { 0x1.55df1790f2f61p70, 0x1.09257fca001cp74 }, + { 0x1.3643ec463a3cfp70, 0x1.e1dd9ec677783p73 }, + { 0x1.198c18435598dp70, 0x1.b5ceb5a13221bp73 }, + { 0x1.fee9bab9f4e14p69, 0x1.8dbaa11de2037p73 }, + { 0x1.cf82e0eb6196bp69, 0x1.694680a9a3ee6p73 }, + { 0x1.a474e7029a919p69, 0x1.481f73b3778e8p73 }, + { 0x1.7d5af6513e2bep69, 0x1.29f9e7d8fd094p73 }, + { 0x1.59d93e1d8f57dp69, 0x1.0e90f64b5b103p73 }, + { 0x1.399c279e4699ap69, 0x1.eb4b9e47b58c9p72 }, + { 0x1.1c579bbca6885p69, 0x1.bdfe62f60dd7p72 }, + { 0x1.01c659160612dp69, 0x1.94d1de5c4576fp72 }, + { 0x1.d352b1ae2694p68, 0x1.6f66f6ab90c3cp72 }, + { 0x1.a78e8252c204dp68, 0x1.4d67050b31c2ap72 }, + { 0x1.7fd7c80f3410ep68, 0x1.2e8318008cf89p72 }, + { 0x1.5bcf92cc55d86p68, 0x1.1273463a1589bp72 }, + { 0x1.3b1f876b10da7p68, 0x1.f1ec20afad0e2p71 }, + { 0x1.1d791bb1324a1p68, 0x1.c39fa0d4a5a2bp71 }, + { 0x1.0294e37abcee8p68, 0x1.99946bf7e02a1p71 }, + { 0x1.d463db5fa3c13p67, 0x1.73679b24aeb9bp71 }, + { 0x1.a82a5f4047a5bp67, 0x1.50bf2558ab78fp71 }, + { 0x1.8011fb05fe09p67, 0x1.314916abfa1eap71 }, + { 0x1.5bb91decf8a58p67, 0x1.14bad9006f53bp71 }, + { 0x1.3ac71ce35c1d3p67, 0x1.f5a1196b5bb2ep70 }, + { 0x1.1ceb656955c59p67, 0x1.c698e001f6d3p70 }, + { 0x1.01dcc2acf7755p67, 0x1.9beca74b0f147p70 }, + { 0x1.d2b166911c178p66, 0x1.753637caac6d9p70 }, + { 0x1.a6459c5b11342p66, 0x1.5218993857afcp70 }, + { 0x1.7e086accc805dp66, 0x1.323f3f19cff3ep70 }, + { 0x1.59962aef547b3p66, 0x1.155d47fdb9c94p70 }, + { 0x1.3894608650edep66, 
0x1.f6599b70323cap69 }, + { 0x1.1ab0e4d284f44p66, 0x1.c6dc8a4bb3ba6p69 }, + { 0x1.ff4248ebb8299p65, 0x1.9bcfd83a431e9p69 }, + { 0x1.ce42dd8e4fa23p65, 0x1.74ca889bbacd5p69 }, + { 0x1.a1e8aa1400997p65, 0x1.516d33e26c04p69 }, + { 0x1.79c430435a7fcp65, 0x1.31612a7ef535fp69 }, + { 0x1.557046eb39249p65, 0x1.1457ab75c2489p69 }, + { 0x1.349127b59b217p65, 0x1.f41259c9550cp68 }, + { 0x1.16d392dff5104p65, 0x1.c46969ca99a2ep68 }, + { 0x1.f7d80dc993f2fp64, 0x1.993e82b76e726p68 }, + { 0x1.c72c149cb214bp64, 0x1.72267ac1b25ap68 }, + { 0x1.9b270c24cc8fap64, 0x1.4ec0062aeeb78p68 }, + { 0x1.73585df7b6643p64, 0x1.2eb2d18a2081bp68 }, + { 0x1.4f59f9910367ep64, 0x1.11aeb0b11d1a1p68 }, + { 0x1.2ecf5b7f6abe3p64, 0x1.eed5c0bbf1061p67 }, + { 0x1.1164ab45aa235p64, 0x1.bf4ab21b4f3fp67 }, + { 0x1.ed9bdbc6f1b0ap63, 0x1.944462d4d5991p67 }, + { 0x1.bd8c96533b39bp63, 0x1.6d561de54f6a1p67 }, + { 0x1.921ec84d5860ep63, 0x1.4a1d472804fc8p67 }, + { 0x1.6ae172414cebap63, 0x1.2a406e25fcb44p67 }, + { 0x1.476e3b661be8cp63, 0x1.0d6e7662dda9dp67 }, + { 0x1.276873924f0b4p63, 0x1.e6bba6770e22dp66 }, + { 0x1.0a7c2c9322f59p63, 0x1.b797ab2ba22d2p66 }, + { 0x1.e0bad18c4e37dp62, 0x1.8cf813910fdcdp66 }, + { 0x1.b18eba0be4d24p62, 0x1.666f488db6e0ap66 }, + { 0x1.86f7884e1caadp62, 0x1.4399f7770045fp66 }, + { 0x1.608484d592328p62, 0x1.241e1ebbbf4ecp66 }, + { 0x1.3dcfaee52a8f5p62, 0x1.07aa30ce6a5ap66 }, + { 0x1.1e7cbac093f27p62, 0x1.dbe8969a24c6fp65 }, + { 0x1.023827dc88ed9p62, 0x1.ad7301258d788p65 }, + { 0x1.d16cd999791c3p61, 0x1.837a640fa9d3dp65 }, + { 0x1.a3666de0788bp61, 0x1.5d90f358d61f6p65 }, + { 0x1.79e17816df1e8p61, 0x1.3b5342f7be9cp65 }, + { 0x1.546e385224d1p61, 0x1.1c674ecd152d3p65 }, + { 0x1.32a7a483e977bp61, 0x1.007b997a0b531p65 }, + { 0x1.1432649c86c4dp61, 0x1.ce8cc007a6432p64 }, + { 0x1.f177ce0bd5836p60, 0x1.a109c0bccbc39p64 }, + { 0x1.bff3166bc36eep60, 0x1.77f5624913c3ap64 }, + { 0x1.934fc0975fb3p60, 0x1.52e251d5d3b1fp64 }, + { 0x1.6b13ebb9a5ad4p60, 0x1.316da780bc4d9p64 }, + { 0x1.46d17a80cc174p60, 0x1.133deb1d3526p64 }, + { 0x1.2624f3a0a887p60, 0x1.f00460b24acf8p63 }, + { 0x1.08b47d7733cb6p60, 0x1.bee2903d584f9p63 }, + { 0x1.dc5de496b181p59, 0x1.92920a7c80e26p63 }, + { 0x1.ac9615b3c9fd7p59, 0x1.6a9b25345c773p63 }, + { 0x1.818d3a356669ep59, 0x1.4691b26b9c82fp63 }, + { 0x1.5acbdab2ed713p59, 0x1.2613e9610f6d1p63 }, + { 0x1.37e61fd4c0fep59, 0x1.08c969adf0beap63 }, + { 0x1.187ab3d71db11p59, 0x1.dcc4ac4f59be5p62 }, + { 0x1.f8637ea4e52acp58, 0x1.ad2d0a9a18288p62 }, + { 0x1.c577fd709b099p58, 0x1.82498a7cc94b9p62 }, + { 0x1.97a3dc62119c8p58, 0x1.5ba462dee8a02p62 }, + { 0x1.6e66137bb7ccap58, 0x1.38d330d8806ap62 }, + { 0x1.494a3f6a9a70ep58, 0x1.1975e0627306cp62 }, + { 0x1.27e767bb79ea2p58, 0x1.fa6b5ee8f3088p61 }, + { 0x1.09dee32687729p58, 0x1.c78892308bd9p61 }, + { 0x1.ddb6ae2f39381p57, 0x1.99b5ec6741cb3p61 }, + { 0x1.ad1f9fba4b2abp57, 0x1.7073c400e10dcp61 }, + { 0x1.816dde4c11ca3p57, 0x1.4b4ee0b3a84d6p61 }, + { 0x1.5a245d5e5289cp57, 0x1.29df4862ac231p61 }, + { 0x1.36d26a686daafp57, 0x1.0bc7294e0cbafp61 }, + { 0x1.171277cbbce9cp57, 0x1.e163bd8df864p60 }, + { 0x1.f5120b45c00e6p56, 0x1.b0a61bce91993p60 }, + { 0x1.c1c74b30d0bbp56, 0x1.84cbb00f925fp60 }, + { 0x1.93b02e5cf0324p56, 0x1.5d5841ce6cb73p60 }, + { 0x1.6a46f43f3118cp56, 0x1.39dbcd485dd07p60 }, + { 0x1.45132973bb79bp56, 0x1.19f153b38a108p60 }, + { 0x1.23a85891dc72bp56, 0x1.fa7b9159fc471p59 }, + { 0x1.05a4dba466c4ep56, 0x1.c6de3429e31fap59 }, + { 0x1.d561964307dc4p55, 0x1.98769faac8a1bp59 }, + { 0x1.a4fa0f13737e8p55, 0x1.6ebf82977acfp59 }, + { 0x1.7984b636ad1bep55, 0x1.4940bc89fa5aap59 }, 
+ { 0x1.5281628cb373ap55, 0x1.278e135bcf0a4p59 }, + { 0x1.2f7cc38bc628dp55, 0x1.0946088b6f8edp59 }, + { 0x1.100f1aef8eaf5p55, 0x1.dc21972b9e9f4p58 }, + { 0x1.e7b62ce66acdep54, 0x1.ab3e8cfada51ap58 }, + { 0x1.b5198cf325114p54, 0x1.7f5483f729c27p58 }, + { 0x1.87b15da6677afp54, 0x1.57e33e2b1c6dap58 }, + { 0x1.5ef5de2e68985p54, 0x1.3477480d89e25p58 }, + { 0x1.3a6d00852a688p54, 0x1.14a8b54629fb2p58 }, + { 0x1.19a90b14f53afp54, 0x1.f033fa073d52p57 }, + { 0x1.f88eba04114cbp53, 0x1.bcede5acc0d4p57 }, + { 0x1.c3dea36b87937p53, 0x1.8ee7b29d0b081p57 }, + { 0x1.94a28136fa731p53, 0x1.659917bbb6632p57 }, + { 0x1.6a4b2c9663fa1p53, 0x1.40877b79cd868p57 }, + { 0x1.44580945b8452p53, 0x1.1f44979177348p57 }, + { 0x1.22558f1aa9f03p53, 0x1.016d3f035816p57 }, + { 0x1.03dbf8db89298p53, 0x1.cd508600d0ba8p56 }, + { 0x1.d11c2965639f6p52, 0x1.9d4ae77a21604p56 }, + { 0x1.a03065db54a4bp52, 0x1.723974e9529d8p56 }, + { 0x1.745e6013d8cf3p52, 0x1.4b9a944f57915p56 }, + { 0x1.4d1f2eb8531p52, 0x1.28f9c9b769ee3p56 }, + { 0x1.29f9b7c4f56dfp52, 0x1.09ee66b6e99e9p56 }, + { 0x1.0a814a1dfc5edp52, 0x1.dc34b6999ff72p55 }, + { 0x1.dca8b63e38fa9p51, 0x1.aa5249b4cca57p55 }, + { 0x1.aa36c9242f8bcp51, 0x1.7d9db080918bap55 }, + { 0x1.7d0fbfa6c3c19p51, 0x1.558e88e8945efp55 }, + { 0x1.54a6b679dd96fp51, 0x1.31aa564e92066p55 }, + { 0x1.307d4e71272d7p51, 0x1.11831a9c3763dp55 }, + { 0x1.1022313b11381p51, 0x1.e96c265c21fbfp54 }, + { 0x1.e65f78e13edcdp50, 0x1.b5d52c19374fep54 }, + { 0x1.b2959e487c93fp50, 0x1.87a2188252d5fp54 }, + { 0x1.84436cf62b6f8p50, 0x1.5e440cc8caaf9p54 }, + { 0x1.5ad66c67f3f63p50, 0x1.393ad199301dep54 }, + { 0x1.35cb549c616ebp50, 0x1.18135a0647102p54 }, + { 0x1.14ac7e9322a1ap50, 0x1.f4ccd98eab06bp53 }, + { 0x1.ee20fae75a2c5p49, 0x1.bfaedff2748c1p53 }, + { 0x1.b931b883c77f2p49, 0x1.9026a7e3c9538p53 }, + { 0x1.89e1f8e1d4be6p49, 0x1.659f3419269eep53 }, + { 0x1.5f9a24050e89fp49, 0x1.3f92e9472ca4cp53 }, + { 0x1.39d2746cbe57fp49, 0x1.1d89fb6602df9p53 }, + { 0x1.18115431b6c4ap49, 0x1.fe32077e095c4p52 }, + { 0x1.f3d3ca19edf64p48, 0x1.c7bf775863df5p52 }, + { 0x1.bdf55dd9bdcep48, 0x1.970fb0b5580dcp52 }, + { 0x1.8dd8e25d2255dp48, 0x1.6b88087e4af9fp52 }, + { 0x1.62e225ebca19p48, 0x1.449de67f2c6b2p52 }, + { 0x1.3c855ef212badp48, 0x1.21d51dc348d4dp52 }, + { 0x1.1a4576cd5cddcp48, 0x1.02be7023a443ep52 }, + { 0x1.f765035c713d8p47, 0x1.cdec7155697e1p51 }, + { 0x1.c0d0bdeb46ae2p47, 0x1.9c4671c1a6e3cp51 }, + { 0x1.901afbd3819bep47, 0x1.6feb0af26f865p51 }, + { 0x1.64a386137b955p47, 0x1.484b1e63b3be4p51 }, + { 0x1.3ddb15521ce49p47, 0x1.24e68a1458bd7p51 }, + { 0x1.1b418ba2217c6p47, 0x1.054a9a7c2f05ap51 }, + { 0x1.f8c8bad8e2a2p46, 0x1.d2214ad33ca5ep50 }, + { 0x1.c1ba4950b8f4fp46, 0x1.9fb9933adac68p50 }, + { 0x1.90a0b40dd690cp46, 0x1.72b99eccc462ep50 }, + { 0x1.64d860502b279p46, 0x1.4a8e4dbe3539cp50 }, + { 0x1.3dcf1aadc099dp46, 0x1.26b4018ef81f7p50 }, + { 0x1.1b02414a73357p46, 0x1.06b4fe82cc6aep50 }, + { 0x1.f7fa3e4bec2aep45, 0x1.d44feffb34893p49 }, + { 0x1.c0aee6d6b1406p45, 0x1.a15d86bb23572p49 }, + { 0x1.8f684065398bfp45, 0x1.73ea5ac0d71a9p49 }, + { 0x1.637ff9397e989p45, 0x1.4b5fdd0f567fap49 }, + { 0x1.3c618d3c706ebp45, 0x1.2737769828878p49 }, + { 0x1.1988625955723p45, 0x1.06f8da87263cep49 }, + { 0x1.f4fc2f6d50e41p44, 0x1.d4710a9e149edp48 }, + { 0x1.bdb204ff1cda3p44, 0x1.a12cc7b1bf616p48 }, + { 0x1.8c75a6fa17116p44, 0x1.73793d6253bd7p48 }, + { 0x1.609ec277b8703p44, 0x1.4abd0af44c7f8p48 }, + { 0x1.399725d96eb63p44, 0x1.266f2e981ccfbp48 }, + { 0x1.16d8d1241b86bp44, 0x1.06154a07d21a2p48 }, + { 0x1.efd875a51d28dp43, 0x1.d2842b40e25fp47 }, + { 
0x1.b8cd873c4de72p43, 0x1.9f27fa465d061p47 }, + { 0x1.87d2a89e5ac65p43, 0x1.7167c3937ded9p47 }, + { 0x1.5c3e42539c769p43, 0x1.48a7fb96552cap47 }, + { 0x1.35791e04cd29fp43, 0x1.245dcbaa25b1bp47 }, + { 0x1.12fc6cdafd10dp43, 0x1.040d4ab2de626p47 }, + { 0x1.e8a0077a1ed47p42, 0x1.ce8fcb8dadc2cp46 }, + { 0x1.b2118f75a4eb7p42, 0x1.9b55e7c11d9e6p46 }, + { 0x1.818e8b1c2616fp42, 0x1.6dbce02ec5c77p46 }, + { 0x1.566cdf4525ebp42, 0x1.4527acab6dfebp46 }, + { 0x1.3014fd204bc71p42, 0x1.210a3ddcb4706p46 }, + { 0x1.0dffe0bfc0c74p42, 0x1.00e7aba6527c9p46 }, + { 0x1.df6a8d5e14f11p41, 0x1.c8a12a152d814p45 }, + { 0x1.a9942579915cdp41, 0x1.95c35893651c9p45 }, + { 0x1.79bdc576e403ap41, 0x1.6884d52cc9914p45 }, + { 0x1.4f3d9114d799bp41, 0x1.4047ce663f641p45 }, + { 0x1.297c4e6eb62fcp41, 0x1.1c7f9c74f3e7cp45 }, + { 0x1.07f35ef1a4fcp41, 0x1.f95dcee779f74p44 }, + { 0x1.d455e0a3b0d94p40, 0x1.c0cc007cc808ep44 }, + { 0x1.9f70bf04a77cep40, 0x1.8e82cd2a6133cp44 }, + { 0x1.707990a8defefp40, 0x1.61d0ef76712e4p44 }, + { 0x1.46c779ebb14aep40, 0x1.3a1882865d26ep44 }, + { 0x1.21c4420bc9879p40, 0x1.16cce86450b2p44 }, + { 0x1.00ea48df1e7fbp40, 0x1.eee1d41e1e516p43 }, + { 0x1.c7856a7693627p39, 0x1.b72a1658393d4p43 }, + { 0x1.93c7abef59a2cp39, 0x1.85ac17b553c4fp43 }, + { 0x1.65df602b1e0ffp39, 0x1.59b72775450f3p43 }, + { 0x1.3d256a5ee461dp39, 0x1.32ae03812fcp43 }, + { 0x1.19053bac5f645p39, 0x1.1004b9cd4bae6p43 }, + { 0x1.f1f58fe66e142p38, 0x1.e27d88d5289bfp42 }, + { 0x1.b9216793da422p38, 0x1.abdab3fb224cep42 }, + { 0x1.86bd6adace04ep38, 0x1.7b5bd9f52a89ep42 }, + { 0x1.5a104640aeb74p38, 0x1.5051a941eb13p42 }, + { 0x1.32755417b50ddp38, 0x1.2a20366f6a0dep42 }, + { 0x1.0f5a5274f5c45p38, 0x1.083cdb1163405p42 }, + { 0x1.e07ab300dc4b9p37, 0x1.d458a013d18b4p41 }, + { 0x1.a956163a49613p37, 0x1.9f01f97b2e043p41 }, + { 0x1.7879eb52380edp37, 0x1.6fb2eaf7d8102p41 }, + { 0x1.4d30488394e18p37, 0x1.45be480207b14p41 }, + { 0x1.26d7af2869fc5p37, 0x1.208a2b041836ep41 }, + { 0x1.04e0c593552f5p37, 0x1.ff1ba8cbc9c8dp40 }, + { 0x1.cd98a274acae3p36, 0x1.c49f8a8ec4aebp40 }, + { 0x1.9852d44d7528bp36, 0x1.90c81ede57558p40 }, + { 0x1.6927c2c3e497p36, 0x1.62d5a948b6358p40 }, + { 0x1.3f65a98c177c9p36, 0x1.3a1de0952fd2bp40 }, + { 0x1.1a6ed66936eeap36, 0x1.16098d4b94692p40 }, + { 0x1.f36ed3084aa81p35, 0x1.ec24d6a8bc072p39 }, + { 0x1.b986ab7ebdd54p35, 0x1.b3828ebcc128bp39 }, + { 0x1.864933f3c0573p35, 0x1.8158a3038115ep39 }, + { 0x1.58f359f0c4e8fp35, 0x1.54eb3e9a3e72bp39 }, + { 0x1.30d82cb8a968cp35, 0x1.2d93b0174f61ap39 }, + { 0x1.0d5e5f59de7c1p35, 0x1.0abe0d45fd5c2p39 }, + { 0x1.dbfc240ab5f81p34, 0x1.d7ce33a39bd89p38 }, + { 0x1.a47db588b15cfp34, 0x1.a134d30d655e4p38 }, + { 0x1.736c0d0a31187p34, 0x1.70e16f315ef4p38 }, + { 0x1.480a1879e8f57p34, 0x1.461cda38e2783p38 }, + { 0x1.21b0591ce1cfdp34, 0x1.2044a2faebb7bp38 }, + { 0x1.ff94e3fca1752p33, 0x1.fd91813f8cc8cp37 }, + { 0x1.c3a9f9558ffap33, 0x1.c2530177987fep37 }, + { 0x1.8eb738c76b2f2p33, 0x1.8deb61106f334p37 }, + { 0x1.5fee91a43fef1p33, 0x1.5f91f55e86346p37 }, + { 0x1.3699940a6a811p33, 0x1.3694e7b13691bp37 }, + { 0x1.1216c07263dep33, 0x1.1256a18de488bp37 }, + { 0x1.e3ae49fef5535p32, 0x1.e49705a5ebd5fp36 }, + { 0x1.aab87fb8e4441p32, 0x1.abefb3186e784p36 }, + { 0x1.786c3dca158c4p32, 0x1.79dc285401b7dp36 }, + { 0x1.4c036b7451223p32, 0x1.4d9a4f359ba1ep36 }, + { 0x1.24cec8453db03p32, 0x1.267e46fd85893p36 }, + { 0x1.02334e92993b9p32, 0x1.03efdea0a0506p36 }, + { 0x1.c74fc41217dfbp31, 0x1.cad0afbb569b1p35 }, + { 0x1.9166837399532p31, 0x1.94e0d5e7a8744p35 }, + { 0x1.61d46c11dd916p31, 0x1.653d077d9eefp35 }, + { 
0x1.37dbe7711fcd4p31, 0x1.3b2a639494566p35 }, + { 0x1.12d55c1e73c65p31, 0x1.16038b4af0a0ep35 }, + { 0x1.e4594b115943bp30, 0x1.ea6c598920c48p34 }, + { 0x1.aabdabdb93484p30, 0x1.b081aaf25ade1p34 }, + { 0x1.77f073eb945dfp30, 0x1.7d62079a4e4a6p34 }, + { 0x1.4b252d0bc8bebp30, 0x1.5042e1a8664edp34 }, + { 0x1.23a7345c57ccap30, 0x1.287117d29a9e6p34 }, + { 0x1.00d6f8a57f06ep30, 0x1.054e44f8ee735p34 }, + { 0x1.c44f136cf3bd8p29, 0x1.cc9cbc5fe04a8p33 }, + { 0x1.8e38df2790b7ap29, 0x1.95eb2cb828067p33 }, + { 0x1.5e8f828661e21p29, 0x1.65acfefcd0029p33 }, + { 0x1.3490e7e2bc31cp29, 0x1.3b20c56ad84f5p33 }, + { 0x1.0f91b7ff9bb2ap29, 0x1.159b917beb87ap33 }, + { 0x1.ddf56913a541ep28, 0x1.e90cb5cac7057p32 }, + { 0x1.a48cc1b8a7bc7p28, 0x1.aeb7659e5f7efp32 }, + { 0x1.71fde01e2ca8cp28, 0x1.7b4b752e86e5fp32 }, + { 0x1.4578e0b906b32p28, 0x1.4df8ace15322ep32 }, + { 0x1.1e4659a2a2156p28, 0x1.26072a17961ap32 }, + { 0x1.f788fc218597bp27, 0x1.02d48c75e7d9bp32 }, + { 0x1.bac92daac0b9dp27, 0x1.c7a2ecd5f05ap31 }, + { 0x1.85518c3484796p27, 0x1.90feaede7f2aep31 }, + { 0x1.56441b55bfff1p27, 0x1.60dcef1cedc3ap31 }, + { 0x1.2cdd203ab43a1p27, 0x1.36787980e7387p31 }, + { 0x1.08700c199ad4fp27, 0x1.112346e13dd7ep31 }, + { 0x1.d0c9857c390f3p26, 0x1.e087915129a98p30 }, + { 0x1.986a650394095p26, 0x1.a6a5096da5b7dp30 }, + { 0x1.66d6688315ad6p26, 0x1.73aff07c7874ep30 }, + { 0x1.3b3d55ebd8547p26, 0x1.46d572e10e216p30 }, + { 0x1.14e7b714e7093p26, 0x1.1f5ba17e5a90bp30 }, + { 0x1.e667d9a8bcd9ep25, 0x1.f93d0d186fbcdp29 }, + { 0x1.ab2733e383ad8p25, 0x1.bc1b22cec72bp29 }, + { 0x1.7712b76c8c7f6p25, 0x1.86529e9df069cp29 }, + { 0x1.494d8e1d4fc61p25, 0x1.5702d052bf73ap29 }, + { 0x1.2115447c6627dp25, 0x1.2d65aee08874cp29 }, + { 0x1.fb7d503fc65c8p24, 0x1.08ccb49580d43p29 }, + { 0x1.bd660913b938cp24, 0x1.d13c32a98512bp28 }, + { 0x1.86db66e158524p24, 0x1.98a4bfd5a5fadp28 }, + { 0x1.56f3ed5aa4222p24, 0x1.66e459a7794f4p28 }, + { 0x1.2ce2265a96befp24, 0x1.3b28bbce3c1c6p28 }, + { 0x1.07f14a8d0c116p24, 0x1.14b8b6b67144ep28 }, + { 0x1.cf049ebedf60dp23, 0x1.e5e26dbef0e28p27 }, + { 0x1.96129ca292f7ep23, 0x1.aa854b5c4f131p27 }, + { 0x1.6416763f6b3bcp23, 0x1.765d329106241p27 }, + { 0x1.3837bf030f4a8p23, 0x1.488b9479ee1c4p27 }, + { 0x1.11b82880134f9p23, 0x1.204c8d940530bp27 }, + { 0x1.dfe0c1b8af1f3p22, 0x1.f9e77238e0031p26 }, + { 0x1.a49aa1651cfcap22, 0x1.bbd2c8fd7e193p26 }, + { 0x1.709b5a3a79128p22, 0x1.85502f16a0f8dp26 }, + { 0x1.42ffa7e9ace3fp22, 0x1.5574ceffe3945p26 }, + { 0x1.1affd2eccd616p22, 0x1.2b72182c97af5p26 }, + { 0x1.efd8be43ac9a9p21, 0x1.06925da53a0fcp26 }, + { 0x1.b2564005de7e5p21, 0x1.cc6bb6d71090dp25 }, + { 0x1.7c694cd2b4ffdp21, 0x1.93a02d0c97221p25 }, + { 0x1.4d23fa69bd814p21, 0x1.61cb1a027e057p25 }, + { 0x1.23b556e6e918ep21, 0x1.361358dd1f243p25 }, + { 0x1.fecbcf04dca9p20, 0x1.0fba0d2660d89p25 }, + { 0x1.bf29264dcdc82p20, 0x1.dc2ef387bd0ep24 }, + { 0x1.8767d7fc43eb6p20, 0x1.a130711aadcdap24 }, + { 0x1.568f9937abc79p20, 0x1.6d758e1ac9659p24 }, + { 0x1.2bc67d8c20136p20, 0x1.401abca024479p24 }, + { 0x1.064d4616b0094p20, 0x1.185819a7f8c6ap24 }, + { 0x1.caf8458ad2a12p19, 0x1.eafc2b00a99b1p23 }, + { 0x1.917faff93e54p19, 0x1.ade505ba61e89p23 }, + { 0x1.5f2e79283b1cap19, 0x1.785c00b5cb27ep23 }, + { 0x1.33220b1da4f59p19, 0x1.4973634932c1ap23 }, + { 0x1.0c93ac678b0ccp19, 0x1.205a7d78be568p23 }, + { 0x1.d5aa313452daep18, 0x1.f8b4440d68221p22 }, + { 0x1.9a9b05368c88bp18, 0x1.b9a31a7b9868cp22 }, + { 0x1.66ede7f0c2d55p18, 0x1.826756e1a42e2p22 }, + { 0x1.39b7fc18e5891p18, 0x1.5209676e4b424p22 }, + { 0x1.122b662569616p18, 0x1.27b019965e362p22 }, + { 
0x1.df2779ceabfc8p17, 0x1.029ce648133fdp22 }, + { 0x1.a2a5d2945d2b7p17, 0x1.c45161cd95fe8p21 }, + { 0x1.6dbccf848794ap17, 0x1.8b81d680cdfc5p21 }, + { 0x1.3f79bf21caa96p17, 0x1.59ca24a7521ddp21 }, + { 0x1.17080ae674896p17, 0x1.2e48f266999cfp21 }, + { 0x1.e75b024885f54p16, 0x1.0838b13324d03p21 }, + { 0x1.a98e26924c6c8p16, 0x1.cdd86b83e679dp20 }, + { 0x1.738bf4bc8d296p16, 0x1.93977456406ddp20 }, + { 0x1.445a6a9a273c6p16, 0x1.60a47aca18e96p20 }, + { 0x1.1b1eabeffc3a5p16, 0x1.341669953fe1cp20 }, + { 0x1.ee324e1fde417p15, 0x1.0d210b765b3d6p20 }, + { 0x1.af4465e9c5668p15, 0x1.d622fa53c02cep19 }, + { 0x1.784e3008fb46bp15, 0x1.9a961d6383ef7p19 }, + { 0x1.484eecd2f1383p15, 0x1.66890cd0bf55fp19 }, + { 0x1.1e65fd1ef2701p15, 0x1.390b73f2a4fbp19 }, + { 0x1.f39dc6baaccd7p14, 0x1.114ae59581395p19 }, + { 0x1.b3bb863d26278p14, 0x1.dd1e5296953a3p18 }, + { 0x1.7bf89f052b591p14, 0x1.a06dfa21b6c59p18 }, + { 0x1.4b4e35dbe0cddp14, 0x1.6b6a7a27c9005p18 }, + { 0x1.20d6781986167p14, 0x1.3d1cca3d4f6d8p18 }, + { 0x1.f790f6877f51ep13, 0x1.14acc164c64fep18 }, + { 0x1.b6e93fa7299b3p13, 0x1.e2ba80b9c3a1bp17 }, + { 0x1.7e82cde922833p13, 0x1.a511aa3827999p17 }, + { 0x1.4d515a14a6132p13, 0x1.6f3d9139319edp17 }, + { 0x1.226a790f97768p13, 0x1.404113d7d18e6p17 }, + { 0x1.fa02b8ac73416p12, 0x1.173ed60fcd6fap17 }, + { 0x1.b8c634233722p12, 0x1.e6ea95e92c624p16 }, + { 0x1.7fe6d7fbcef2cp12, 0x1.a8767775dd309p16 }, + { 0x1.4e53acc7531b1p12, 0x1.71f97a2983044p16 }, + { 0x1.231e547065724p12, 0x1.42710a88aab19p16 }, + { 0x1.faed5c4559717p11, 0x1.18fb2ded8ebb1p16 }, + { 0x1.b94e0bfb59934p11, 0x1.e9a4d9b21386ep15 }, + { 0x1.80217e57d8a3fp11, 0x1.aa947efe69879p15 }, + { 0x1.4e52d23cf50bp11, 0x1.7397d8e2bd385p15 }, + { 0x1.22f0652094ae6p11, 0x1.43a79684f6ef6p15 }, + { 0x1.fa4eba730bf6p10, 0x1.19ddbd8138a9p15 }, + { 0x1.b87f86a26fad7p10, 0x1.eae2ef93df996p14 }, + { 0x1.7f323487ff94ap10, 0x1.ab66cfccafb75p14 }, + { 0x1.4d4ec8ea8ee67p10, 0x1.7414e5b5ca43cp14 }, + { 0x1.21e112e39bf18p10, 0x1.43e1e22ebfdb4p14 }, + { 0x1.f8283ec45f117p9, 0x1.19e4732be2ffp14 }, + { 0x1.b65c7f9f1fbedp9, 0x1.eaa1efb3b003ep13 }, + { 0x1.7d1b22b6810f6p9, 0x1.aaeb7de6855e2p13 }, + { 0x1.4b49e984886ep9, 0x1.736f7c0d13f06p13 }, + { 0x1.1ff2d0d5a2649p9, 0x1.431f651be2ff4p13 }, + { 0x1.f47ee1cab73ddp8, 0x1.190f3f39e9af4p13 }, + { 0x1.b2e9e76c8d9f9p8, 0x1.e8e2722ca46cfp12 }, + { 0x1.79e11d635b9a7p8, 0x1.a923a9d8d5019p12 }, + { 0x1.4848ddf7dfffep8, 0x1.71a91ee04e82cp12 }, + { 0x1.1d2a13fdd2709p8, 0x1.4161e6298ed3ap12 }, + { 0x1.ef5b15f73200ap7, 0x1.176014201ab17p12 }, + { 0x1.ae2fb07705cc3p7, 0x1.e5a88cbf394e4p11 }, + { 0x1.758b92cdfdc64p7, 0x1.a6137c537bf6dp11 }, + { 0x1.44528f79b1b51p7, 0x1.6ec5f2d1367f4p11 }, + { 0x1.198d422be3f8cp7, 0x1.3ead7491061afp11 }, + { 0x1.e8c8a7276c93p6, 0x1.14dadee76975ap11 }, + { 0x1.a838b09afcf62p6, 0x1.e0fbc2ec572b9p10 }, + { 0x1.70246e766d2f3p6, 0x1.a1c215fcd0beap10 }, + { 0x1.3f700c0d99876p6, 0x1.6accae115453ep10 }, + { 0x1.1524997d01ap6, 0x1.3b08582357e32p10 }, + { 0x1.e0d68d9047f7ap5, 0x1.118577f06b2f2p10 }, + { 0x1.a11277ca2bd3fp5, 0x1.dae6e8d292a1ep9 }, + { 0x1.69b7f34ec048ep5, 0x1.9c3973d4c9b08p9 }, + { 0x1.39ac6410ceb63p5, 0x1.65c67e684d1e6p9 }, + { 0x1.0ffa110b113fp5, 0x1.367af901b137p9 }, + { 0x1.d796b4f7aaf7fp4, 0x1.0d678c614f535p9 }, + { 0x1.98cd1cb38dccp4, 0x1.d377f96b9fd62p8 }, + { 0x1.62548d6675835p4, 0x1.958648bd6035p8 }, + { 0x1.331480815e7cdp4, 0x1.5fbee5e7590f4p8 }, + { 0x1.0a19336cc73a1p4, 0x1.310fbf558eca2p8 }, + { 0x1.cd1db96a6c6efp3, 0x1.088a80b837328p8 }, + { 0x1.8f7b007e1de49p3, 0x1.cabfe10b3371ap7 }, + { 
0x1.5a0a9c047e3c7p3, 0x1.8db7ccf7600f4p7 }, + { 0x1.2bb6f2dd8e254p3, 0x1.58c38f07b7c3bp7 }, + { 0x1.038ef3cbdc1c7p3, 0x1.2ad2ebb6268bdp7 }, + { 0x1.c1829acfb62b3p2, 0x1.02f94d1fb1ba4p7 }, + { 0x1.85308ad209551p2, 0x1.c0d23d3daadadp6 }, + { 0x1.50ec3549a202dp2, 0x1.84df8496cc3aep6 }, + { 0x1.23a3bf963c1ebp2, 0x1.50e4191e1b76cp6 }, + { 0x1.f8d2fce0ebb41p1, 0x1.23d2690dc7344p6 }, + { 0x1.b4de68e608347p1, 0x1.f980a88588961p5 }, + { 0x1.7a03df8f9f479p1, 0x1.b5c5135a44acbp5 }, + { 0x1.470ce4924af72p1, 0x1.7b10fe1f0aeaap5 }, + { 0x1.1aec242758b4fp1, 0x1.4831de32e25bdp5 }, + { 0x1.e9700b697ec96p0, 0x1.1c1d98f1b1f71p5 }, + { 0x1.a74be9568f922p0, 0x1.ebda6af103d07p4 }, + { 0x1.6e0c8fadbb05p0, 0x1.a9b07f491a273p4 }, + { 0x1.3c8164e42f29cp0, 0x1.70618a9c019dap4 }, + { 0x1.11a259faba91ep0, 0x1.3ebfb36da371bp4 }, + { 0x1.d91518c2acaf6p-1, 0x1.13c51b7852ecp4 }, + { 0x1.98e739a118b5ep-1, 0x1.dd1d36683753bp3 }, + { 0x1.616346ca3be0ep-1, 0x1.9cae5c1f5de61p3 }, + { 0x1.315f58c13df9cp-1, 0x1.64e7f0a95542fp3 }, + { 0x1.07d957435b8c4p-1, 0x1.34a1a5595e9cbp3 }, + { 0x1.c7e35cf4db634p-2, 0x1.0ada93ac2688ep3 }, + { 0x1.89cd6ead31b71p-2, 0x1.cd680d6a376d2p2 }, + { 0x1.542176fe1c2b2p-2, 0x1.8ed9e84be9bacp2 }, + { 0x1.25bd00bd97eddp-2, 0x1.58bc1beb8e117p2 }, + { 0x1.fb491e02b7c15p-3, 0x1.29ecb15514182p2 }, + { 0x1.b5fcd30c7e1f6p-3, 0x1.017069c4b54cfp2 }, + { 0x1.7a1c33cc1922bp-3, 0x1.bcdb33f7b88f9p1 }, + { 0x1.46610483f2395p-3, 0x1.804f671a7a35cp1 }, + { 0x1.19b0f23241b88p-3, 0x1.4bf6ca87a4707p1 }, + { 0x1.e62f62b4555dcp-4, 0x1.1eb67d8a75351p1 }, + { 0x1.a383ca9f98a0fp-4, 0x1.ef3318a5788dep0 }, + { 0x1.69f16aeb3677p-4, 0x1.ab97c2106c4d2p0 }, + { 0x1.383bf2b37a037p-4, 0x1.712bc1550fb6ap0 }, + { 0x1.0d51cf5a16254p-4, 0x1.3eb13a24821e2p0 }, + { 0x1.d08cdac87dce6p-5, 0x1.131510c1da6adp0 }, + { 0x1.909a7c3ac6f99p-5, 0x1.dad26311e9efp-1 }, + { 0x1.596acfa0bcc8fp-5, 0x1.99bf36c7ef068p-1 }, + { 0x1.29cc13bfd53ap-5, 0x1.618c26c1169a6p-1 }, + { 0x1.00b60212cf113p-5, 0x1.3104d5f799552p-1 }, + { 0x1.ba886ae6e40ep-6, 0x1.071e8b6003b16p-1 }, + { 0x1.7d62a282a4851p-6, 0x1.c5e5338097f6bp-2 }, + { 0x1.48a59e9cb1eb1p-6, 0x1.87730de08c821p-2 }, + { 0x1.1b2abc895a771p-6, 0x1.518db221cf8bap-2 }, + { 0x1.e7e6f4c33ededp-7, 0x1.230ae74a714aap-2 }, + { 0x1.a4480db60fe17p-7, 0x1.f5d1c58fdc6acp-3 }, + { 0x1.69fd19aacb90ap-7, 0x1.b091a88a72f08p-3 }, + { 0x1.37be42e1159e7p-7, 0x1.74d459ba38afep-3 }, + { 0x1.0c707db025298p-7, 0x1.414d114bdcde1p-3 }, + { 0x1.ce3ee3757dbe5p-8, 0x1.14dc49cbc0c3p-3 }, + { 0x1.8df06bfb34f6dp-8, 0x1.dd13408401cdcp-4 }, + { 0x1.568986affafc5p-8, 0x1.9afd0eca1593dp-4 }, + { 0x1.26d009f5af049p-8, 0x1.6203633a6814ap-4 }, + { 0x1.fb69c5d6b524ep-9, 0x1.30e632b0008c9p-4 }, + { 0x1.b49c67cd1611fp-9, 0x1.069124dc6eaefp-4 }, + { 0x1.77a47ec4e9fa1p-9, 0x1.c42b48d5cfe42p-5 }, + { 0x1.43260788f0a1fp-9, 0x1.854b792c33d4ap-5 }, + { 0x1.15f4e018a09eep-9, 0x1.4f1f511f7b2d7p-5 }, + { 0x1.de1c72f739a49p-10, 0x1.2073f996519cp-5 }, + { 0x1.9b25dc6d6642ep-10, 0x1.f08155c194aadp-6 }, + { 0x1.61853cc8eddacp-10, 0x1.ab41e011814e5p-6 }, + { 0x1.2feeed430b87bp-10, 0x1.6f9f62ec4193ap-6 }, + { 0x1.05451535e8102p-10, 0x1.3c45d7f9e2fbp-6 }, + { 0x1.c122bcbda7f8ep-11, 0x1.100ffa10ff0f3p-6 }, + { 0x1.81ff0b26f3b6ap-11, 0x1.d401bee3a7787p-7 }, + { 0x1.4bb153d2d0728p-11, 0x1.927ce5fbbe352p-7 }, + { 0x1.1cfe80beb05a4p-11, 0x1.5a195c6e2a08ep-7 }, + { 0x1.e9ae566e02486p-12, 0x1.2992f3c7d2ce7p-7 }, + { 0x1.a4a3297375461p-12, 0x1.ffa47aef63bd2p-8 }, + { 0x1.6948e77b6c537p-12, 0x1.b7ccca35ce88ep-8 }, + { 0x1.3644eed5b1126p-12, 0x1.79ffc3cd6bc92p-8 }, + { 
0x1.0a6cd27d913d7p-12, 0x1.44d7c3dca9cc8p-8 }, + { 0x1.c97f5c053e775p-13, 0x1.1720abf01aa9bp-8 }, + { 0x1.88c0c973b68fcp-13, 0x1.dfa22008cf2c8p-9 }, + { 0x1.512157ee1d8bep-13, 0x1.9c08a63df00dcp-9 }, + { 0x1.215988e86b086p-13, 0x1.61eb258af5a93p-9 }, + { 0x1.f09f2b684fb31p-14, 0x1.2ff68a28f7dc4p-9 }, + { 0x1.aa222a98ba953p-14, 0x1.0506e21782262p-9 }, + { 0x1.6d9b06046eb66p-14, 0x1.c041afe3a1ad2p-10 }, + { 0x1.39a30e3030664p-14, 0x1.80d8271e40929p-10 }, + { 0x1.0d05cd2b64652p-14, 0x1.4a5cc1e67b046p-10 }, + { 0x1.cd740d2318d4dp-15, 0x1.1b8f04bdfa1bfp-10 }, + { 0x1.8bb7603d9828p-15, 0x1.e6b65816f0ff1p-11 }, + { 0x1.534d810db5377p-15, 0x1.a1a7ec86c94fbp-11 }, + { 0x1.22e56de90dc1ap-15, 0x1.665a9398034f1p-11 }, + { 0x1.f2bb06a7069e2p-16, 0x1.336f30c8d3345p-11 }, + { 0x1.ab79b6edb04e1p-16, 0x1.07b7cbf13abf4p-11 }, + { 0x1.6e5b33b150249p-16, 0x1.c461717dacbd8p-12 }, + { 0x1.39f005226a7dbp-16, 0x1.83f56253c12f1p-12 }, + { 0x1.0cfc8192e69bdp-16, 0x1.4cab82baddd6cp-12 }, + { 0x1.cce310b024fd4p-17, 0x1.1d39d04e50424p-12 }, + { 0x1.8acc81455f971p-17, 0x1.e9094beff3587p-13 }, + { 0x1.522570529739fp-17, 0x1.a3308036822dbp-13 }, + { 0x1.219685023e1bep-17, 0x1.67464f8a36affp-13 }, + { 0x1.eff1f945e7f7bp-18, 0x1.33e2c9c277148p-13 }, + { 0x1.a89fa515a2b44p-18, 0x1.07d0b7bb52fc7p-13 }, + { 0x1.6b83bb4ee4348p-18, 0x1.c40cfbd11fd1p-14 }, + { 0x1.372982e2fde1dp-18, 0x1.833ffa698fa8bp-14 }, + { 0x1.0a51297b20ab7p-18, 0x1.4bb29dadf3acp-14 }, + { 0x1.c7d093fb7e463p-19, 0x1.1c147957723bdp-14 }, + { 0x1.8607006600009p-19, 0x1.e6896f5762306p-15 }, + { 0x1.4db1c7b733812p-19, 0x1.a096cc3260668p-15 }, + { 0x1.1d76959a6b622p-19, 0x1.64a7647d3f88ap-15 }, + { 0x1.e858d8b3acc8p-20, 0x1.314deba7bab37p-15 }, + { 0x1.a1a94b14e3d7fp-20, 0x1.0550e92636252p-15 }, + { 0x1.6529df3d1cf1cp-20, 0x1.bf46cd0f972c3p-16 }, + { 0x1.316449a955429p-20, 0x1.7ebd49fbb30eep-16 }, + { 0x1.0517b9e1f89dep-20, 0x1.47796af08285bp-16 }, + { 0x1.be627dddb55d7p-21, 0x1.1827a73755ec7p-16 }, + { 0x1.7d8a7f2a8a2dp-21, 0x1.df49a10ccc568p-17 }, + { 0x1.4613bf000c71dp-21, 0x1.99ee7037b652bp-17 }, + { 0x1.16a45fcb7b882p-21, 0x1.5e9197017791dp-17 }, + { 0x1.dc283bcbe780fp-22, 0x1.2bc40c543e36bp-17 }, + { 0x1.96ca751cac37fp-22, 0x1.004b34180a4a9p-17 }, + { 0x1.5b7cd13179ddep-22, 0x1.b632d58444fadp-18 }, + { 0x1.28cb2cb8b4015p-22, 0x1.768f3e13d3bdcp-18 }, + { 0x1.faedd62dabd96p-23, 0x1.401fa7657909ep-18 }, + { 0x1.b0de982dbf111p-23, 0x1.1190d162109abp-18 }, + { 0x1.7195b2becea19p-23, 0x1.d3803e22a78e4p-19 }, + { 0x1.3b8387eea3f9dp-23, 0x1.8f694ad8ac632p-19 }, + { 0x1.0d521f8291cd6p-23, 0x1.55326d6aac6fap-19 }, + { 0x1.cbb9be9cbac1ep-24, 0x1.236e8d3a9e0e7p-19 }, + { 0x1.8852e54d26542p-24, 0x1.f1ca221c0b98bp-20 }, + { 0x1.4ec36b8fdf428p-24, 0x1.a914b62872bc3p-20 }, + { 0x1.1d9d0055d11dp-24, 0x1.6af2ae42db58p-20 }, + { 0x1.e74cb7ebdea0ap-25, 0x1.35dbe86ed95c7p-20 }, + { 0x1.9fa735b03463ap-25, 0x1.0880cfe68041ep-20 }, + { 0x1.627f6220ca6a9p-25, 0x1.c3847cbf78a3bp-21 }, + { 0x1.2e4d9d8b5b22fp-25, 0x1.81550cf271bfdp-21 }, + { 0x1.01c325e8bb3cp-25, 0x1.48cefa0aac509p-21 }, + { 0x1.b783bc148fcefp-26, 0x1.188ab9ce5fdddp-21 }, + { 0x1.76aa8791eba33p-26, 0x1.dea9996bf1c0fp-22 }, + { 0x1.3f58d390caeecp-26, 0x1.984c7bb9c53ffp-22 }, + { 0x1.10299f255a2cap-26, 0x1.5c3c6ce5f2f75p-22 }, + { 0x1.cfd7e08a13b2p-27, 0x1.28f8faa7c3202p-22 }, + { 0x1.8b368e0429dacp-27, 0x1.fa7304087353p-23 }, + { 0x1.50b2501707be6p-27, 0x1.afca3c464e1d5p-23 }, + { 0x1.1ecf2c897b782p-27, 0x1.701780b38d71ap-23 }, + { 0x1.e891642306feep-28, 0x1.39c08dab159ep-23 }, + { 0x1.a013c6709bdd5p-28, 
0x1.0b66dac93672bp-23 }, + { 0x1.624c9a2f2f8fcp-28, 0x1.c7bde43ebd873p-24 }, + { 0x1.2da83d59392f5p-28, 0x1.84520ec5eb55ap-24 }, + { 0x1.00ce3767b77a8p-28, 0x1.4ad54236cf6b4p-24 }, + { 0x1.b5312d520a3f4p-29, 0x1.19d258cf47194p-24 }, + { 0x1.74191dcab90bcp-29, 0x1.e015665e4efbdp-25 }, + { 0x1.3ca855a30dad5p-29, 0x1.98dc92b26aeap-25 }, + { 0x1.0d71d1069e44fp-29, 0x1.5c29c3e79c162p-25 }, + { 0x1.ca7c7b61a5357p-30, 0x1.28708aaed4d7p-25 }, + { 0x1.86083aaabaf73p-30, 0x1.f8bd2046619b5p-26 }, + { 0x1.4bc21b880f9dep-30, 0x1.ada636f165959p-26 }, + { 0x1.1a28183b0e32p-30, 0x1.6dafa60f704a1p-26 }, + { 0x1.dfe23a6ad4f8bp-31, 0x1.37351629c53cp-26 }, + { 0x1.980956bea8ccp-31, 0x1.08cff68f5874cp-26 }, + { 0x1.5ae767663002ep-31, 0x1.c29ce58c1fc1p-27 }, + { 0x1.26e4fd1165b76p-31, 0x1.7f5772973d16cp-27 }, + { 0x1.f54dde2ba8f56p-32, 0x1.4612c5674eed9p-27 }, + { 0x1.aa0af3e698b26p-32, 0x1.15539e864d70fp-27 }, + { 0x1.6a0956d7d1b63p-32, 0x1.d7ad5cdc3741ep-28 }, + { 0x1.339bd6e517d44p-32, 0x1.9110bc4b50f8cp-28 }, + { 0x1.0554f0943ba8cp-32, 0x1.54fb970dbe54ep-28 }, + { 0x1.bbfac9007ec07p-33, 0x1.21dd98bc7de87p-28 }, + { 0x1.791862715d02fp-33, 0x1.ecc34851c9763p-29 }, + { 0x1.403f77382e654p-33, 0x1.a2ca34863bfcbp-29 }, + { 0x1.0feff2a4fc49p-33, 0x1.63e0d12d4d288p-29 }, + { 0x1.cdc5de1ae8c09p-34, 0x1.2e615f0543e41p-29 }, + { 0x1.8804761a993c4p-34, 0x1.00e4ae934cb56p-29 }, + { 0x1.4cc23eb3b5ffap-34, 0x1.b471c42165f4ap-30 }, + { 0x1.1a6c6c06ea18bp-34, 0x1.72b316e47cc93p-30 }, + { 0x1.df58ab9ae4fcbp-35, 0x1.3ad1e7143aa75p-30 }, + { 0x1.96bd0bd6c9a31p-35, 0x1.0b54bd6a9e23fp-30 }, + { 0x1.59163428fb3a6p-35, 0x1.c5f4a785a88d1p-31 }, + { 0x1.24be8d0138113p-35, 0x1.8162809b8dff6p-31 }, + { 0x1.f09f3c1618809p-36, 0x1.4721b76389525p-31 }, + { 0x1.a53148c3fc482p-36, 0x1.15a6678e0082cp-31 }, + { 0x1.652d1d62b45e1p-36, 0x1.d73f8da963966p-32 }, + { 0x1.2eda549c16ee8p-36, 0x1.8fdeb6a9e8ebcp-32 }, + { 0x1.00c2a84aed164p-36, 0x1.5342fe16e83a5p-32 }, + { 0x1.b3501c0fdbbcfp-37, 0x1.1fcdfea216d16p-32 }, + { 0x1.70f8998ccf075p-37, 0x1.e83eb9bce31c4p-33 }, + { 0x1.38b3a7222dd33p-37, 0x1.9e170e2dbff8cp-33 }, + { 0x1.08fb437656229p-37, 0x1.5f27a9aa5f66p-33 }, + { 0x1.c1085f96d9feep-38, 0x1.29bfa42bc7b76p-33 }, + { 0x1.7c6a3cf1c9dcfp-38, 0x1.f8de2739c95a9p-34 }, + { 0x1.423e65b2a3a8cp-38, 0x1.abfaa7d4233fap-34 }, + { 0x1.10ef40de709bcp-38, 0x1.6ac1833360c58p-34 }, + { 0x1.ce48f9d9e5928p-39, 0x1.336f5ff042b88p-34 }, + { 0x1.8773adc5703cep-39, 0x1.0484d7ff5f6bdp-34 }, + { 0x1.4b6e86a5aa9d8p-39, 0x1.b978904649f57p-35 }, + { 0x1.189488e2e9743p-39, 0x1.760249f31a968p-35 }, + { 0x1.db0100ef385d3p-40, 0x1.3cd13761f1731p-35 }, + { 0x1.9206c1ae9fb29p-40, 0x1.0c569a0b1627cp-35 }, + { 0x1.54382e8081943p-40, 0x1.c67fe1e83e91p-36 }, + { 0x1.1fe13002859cap-40, 0x1.80dbcff1d72cfp-36 }, + { 0x1.e71fde0c5e218p-41, 0x1.45d945dc4844dp-36 }, + { 0x1.9c159bbc9900ap-41, 0x1.13da615eb6c5fp-36 }, + { 0x1.5c8fc931c6d94p-41, 0x1.d2ffe78d87996p-37 }, + { 0x1.26cb8c1920344p-41, 0x1.8b4017551e03bp-37 }, + { 0x1.f295714275bc3p-42, 0x1.4e7bd56b77338p-37 }, + { 0x1.a592ca70605e5p-42, 0x1.1b06621cfb60ep-37 }, + { 0x1.646a234bddd88p-42, 0x1.dee83fc205fc8p-38 }, + { 0x1.2d4a498c21371p-42, 0x1.9521701d324dap-38 }, + { 0x1.fd5235020e009p-43, 0x1.56ad77d8efe38p-38 }, + { 0x1.ae71657ff542ep-43, 0x1.21d11201bfbcfp-38 }, + { 0x1.6bbc82f12468ap-43, 0x1.ea290040397f4p-39 }, + { 0x1.3354802504d9ep-43, 0x1.9e7295f29cf91p-39 }, + { 0x1.03a3b07cf84bp-43, 0x1.5e631fb2a96dbp-39 }, + { 0x1.b6a52af7c7202p-44, 0x1.28313d62cbf4fp-39 }, + { 0x1.727cc024d462ap-44, 0x1.f4b2d92a8da6ap-40 }, + { 
0x1.38e1c7590edafp-44, 0x1.a726cda9c5fc4p-40 }, + { 0x1.083385f1e344cp-44, 0x1.6592390114765p-40 }, + { 0x1.be229b5ed10ebp-45, 0x1.2e1e1bdc1cff3p-40 }, + { 0x1.78a15c33bf0d1p-45, 0x1.fe77379b5869ap-41 }, + { 0x1.3dea49bdca04dp-45, 0x1.af3202215009fp-41 }, + { 0x1.0c5225e967ce3p-45, 0x1.6c30c15ee186bp-41 }, + { 0x1.c4df14833b32ep-46, 0x1.338f646703f05p-41 }, + { 0x1.7e2197e99732ep-46, 0x1.03b4338f71d3bp-41 }, + { 0x1.4266d76b7e9efp-46, 0x1.b688e02001605p-42 }, + { 0x1.0ff9aa4df55cbp-46, 0x1.72355f261c90fp-42 }, + { 0x1.cad0ea9847218p-47, 0x1.387d609c076c8p-42 }, + { 0x1.82f5884a3c4ffp-47, 0x1.07bcd8d61f54dp-42 }, + { 0x1.4650f71159187p-47, 0x1.bd20f0d88c869p-43 }, + { 0x1.1324c9f973607p-47, 0x1.77977767b819cp-43 }, + { 0x1.cfef7f529f1bfp-48, 0x1.3ce0fee10ae91p-43 }, + { 0x1.8716298a66d68p-48, 0x1.0b4fbeda58aa9p-43 }, + { 0x1.49a2f582864b8p-48, 0x1.c2f0b2bc85943p-44 }, + { 0x1.15cee56fb8f8p-48, 0x1.7c4f426570458p-44 }, + { 0x1.d43356b5d1bc3p-49, 0x1.40b3e347db73ap-44 }, + { 0x1.8a7d700826ce3p-49, 0x1.0e67b4f33d066p-44 }, + { 0x1.4c57f38808af9p-49, 0x1.c7efb04c36011p-45 }, + { 0x1.17f41219f6e6ep-49, 0x1.8055de49eb405p-45 }, + { 0x1.d796294cc09e7p-50, 0x1.43f076e4dac86p-45 }, + { 0x1.8d265709c8b81p-50, 0x1.11003322f9f2ap-45 }, + { 0x1.4e6bf1c869176p-50, 0x1.cc169496c493bp-46 }, + { 0x1.199123dce7f7cp-50, 0x1.83a55fe01c77fp-46 }, + { 0x1.da12f38ef6065p-51, 0x1.4691f56a0b9d1p-46 }, + { 0x1.8f0ced10d0db4p-51, 0x1.131565242338p-46 }, + { 0x1.4fdbda9c9106cp-51, 0x1.cf5f3d25346p-47 }, + { 0x1.1aa3b4e8f3caap-51, 0x1.8638e1112031dp-47 }, + { 0x1.dba6023e1257ap-52, 0x1.489478d82c425p-47 }, + { 0x1.902e5d96b5dc7p-52, 0x1.14a433d21a4e2p-47 }, + { 0x1.50a589affacc9p-52, 0x1.d1c4c912f9acbp-48 }, + { 0x1.1b2a2ba958505p-52, 0x1.880c8cf6ecf16p-48 }, + { 0x1.dc4cfb90a7ce5p-53, 0x1.49f5031dc194p-48 }, + { 0x1.9088f811b7254p-53, 0x1.15aa4ccc2f79bp-48 }, + { 0x1.50c7d151d73dp-53, 0x1.d343a5202c7c4p-49 }, + { 0x1.1b23bebdcda6dp-53, 0x1.891da95a3a6f5p-49 }, + { 0x1.dc06e50abd949p-54, 0x1.4ab18582d9df2p-49 }, + { 0x1.901c34297491p-54, 0x1.1626283914e64p-49 }, + { 0x1.50427d64b1c7dp-54, 0x1.d3d994938f3adp-50 }, + { 0x1.1a9076f0d2e24p-54, 0x1.896a9d7ab89b1p-50 }, + { 0x1.dad425efa38efp-55, 0x1.4ac8e5c7c8723p-50 }, + { 0x1.8ee8b30ca2586p-55, 0x1.16170c969f828p-50 }, + { 0x1.4f1653e256f41p-55, 0x1.d385b6cd88b32p-51 }, + { 0x1.19712f23cae3dp-55, 0x1.88f2f609fe4d3p-51 }, + { 0x1.d8b686448b5afp-56, 0x1.4a3b00e506616p-51 }, + { 0x1.8cf03de32b406p-56, 0x1.157d10888e2f3p-51 }, + { 0x1.4d4512f22a65dp-56, 0x1.d2488978a2f74p-52 }, + { 0x1.17c7923127a39p-56, 0x1.87b7664b4e00cp-52 }, + { 0x1.d5b12a674c804p-57, 0x1.4908ab62a09acp-52 }, + { 0x1.8a35c1621f2ccp-57, 0x1.14591aa0080cap-52 }, + { 0x1.4ad16c988b007p-57, 0x1.d023e74fea7e1p-53 }, + { 0x1.159616cbf8a0cp-57, 0x1.85b9c65443c51p-53 }, + { 0x1.d1c88b489c5c3p-58, 0x1.4733af4601fe1p-53 }, + { 0x1.86bd4690c0845p-58, 0x1.12acdf1c9738cp-53 }, + { 0x1.47bf000e37ae9p-58, 0x1.cd1b037f7490bp-54 }, + { 0x1.12dff96b26d81p-58, 0x1.82fd0e7486194p-54 }, + { 0x1.cd026b64a0ca8p-59, 0x1.44bec79d5416cp-54 }, + { 0x1.828be8d7b2e74p-59, 0x1.107adbae7661dp-54 }, + { 0x1.441250d6b8cc7p-59, 0x1.c93261af2cd0dp-55 }, + { 0x1.0fa934555eb5ap-59, 0x1.7f854fd47e7d3p-55 }, + { 0x1.c765c89feb632p-60, 0x1.41ad99b7fc9ebp-55 }, + { 0x1.7da7c97c8ea4bp-60, 0x1.0dc65148f57fcp-55 }, + { 0x1.3fd0bbb47d67cp-60, 0x1.c46fcad39a071p-56 }, + { 0x1.0bf675e9015a3p-60, 0x1.7b57aa64c1e42p-56 }, + { 0x1.c0facb396944ap-61, 0x1.3e04ac23c3f11p-56 }, + { 0x1.781800b4c5862p-61, 0x1.0a933c1a65e31p-56 }, + { 0x1.3b0069a07f02dp-61, 
0x1.beda3eeb5f0a2p-57 }, + { 0x1.07cd15415698ap-61, 0x1.767a404101f5ap-57 }, + { 0x1.b9cab20b7b4acp-62, 0x1.39c95b8dcd835p-57 }, + { 0x1.71e48c82b190ap-62, 0x1.06e649c54a11dp-57 }, + { 0x1.35a840f1bb9bfp-62, 0x1.b879e3daa485dp-58 }, + { 0x1.0333055f872d1p-62, 0x1.70f426b1f5c67p-58 }, + { 0x1.b1dfbc5f13465p-63, 0x1.3501cdad9df5bp-58 }, + { 0x1.6b163d96b3dd9p-63, 0x1.02c4cdfc5722cp-58 }, + { 0x1.2fcfd4e6913cap-63, 0x1.b157f19f267eap-59 }, + { 0x1.fc5d8e0519af3p-64, 0x1.6acd55017e4e2p-59 }, + { 0x1.a945119b38a65p-64, 0x1.2fb4e266d3e9fp-59 }, + { 0x1.63b6a2745bde1p-64, 0x1.fc696b5025168p-60 }, + { 0x1.297f53c6e927fp-64, 0x1.a97e9c202c067p-60 }, + { 0x1.f18eb2ba6357fp-65, 0x1.640e915b3f3eap-60 }, + { 0x1.a006a7219c6a4p-65, 0x1.29ea2353deb28p-60 }, + { 0x1.5bcff1208eb99p-65, 0x1.f278f182d5ccep-61 }, + { 0x1.22bf73da1838dp-65, 0x1.a0f8fae51588p-61 }, + { 0x1.e60853b8b4b65p-66, 0x1.5cc15bf9dbbbbp-61 }, + { 0x1.963124add21cp-66, 0x1.23a9b1f0c9515p-61 }, + { 0x1.536cefa1810b4p-66, 0x1.e7c6162103b4ep-62 }, + { 0x1.1b995f6e584afp-66, 0x1.97d2ef035140ap-62 }, + { 0x1.d9da06644bc9dp-67, 0x1.54efd8e5e8a15p-62 }, + { 0x1.8bd1c79049ec2p-67, 0x1.1cfc34a10ee47p-62 }, + { 0x1.4a98db9bff0e8p-67, 0x1.dc5f9803d5324p-63 }, + { 0x1.1416a031bacf2p-67, 0x1.8e1907994f8d3p-63 }, + { 0x1.cd13f7b7c3414p-68, 0x1.4ca4b88f6234cp-63 }, + { 0x1.80f645203dff7p-68, 0x1.15eac2ce52257p-63 }, + { 0x1.415f515af2672p-68, 0x1.d054eb8db2ad5p-64 }, + { 0x1.0c410a1d6b3cap-68, 0x1.83d8652f7235cp-64 }, + { 0x1.bfc6c8b2d1c95p-69, 0x1.43eb1f8cfdcf1p-64 }, + { 0x1.75acacc068ebep-69, 0x1.0e7ed05fb3af3p-64 }, + { 0x1.37cc328e513e5p-69, 0x1.c3b617ec3cfd6p-65 }, + { 0x1.0422a6340a512p-69, 0x1.791e9c59e2b42p-65 }, + { 0x1.b2036a988beadp-70, 0x1.3ace8dce03fbdp-65 }, + { 0x1.6a0349d192d1ap-70, 0x1.06c218ca5f25ap-65 }, + { 0x1.2deb8d0dae905p-70, 0x1.b69393c895b87p-66 }, + { 0x1.f78b3aa5bebbep-71, 0x1.6df997f6bab1bp-66 }, + { 0x1.a3dafb67a96cfp-71, 0x1.315ac58b7d6b7p-66 }, + { 0x1.5e0885ebd9cc3p-71, 0x1.fd7d13f78002dp-67 }, + { 0x1.23c981e88b022p-71, 0x1.a8fe21d205ebp-67 }, + { 0x1.e66846a73c925p-72, 0x1.62777b62fde0cp-67 }, + { 0x1.955ea2f392221p-72, 0x1.279bb2446baf4p-67 }, + { 0x1.51cacbb42476ep-72, 0x1.ecfc5eb955129p-68 }, + { 0x1.19722d0b598a4p-72, 0x1.9b06ad8cbcafbp-68 }, + { 0x1.d4f0c5733dbc9p-73, 0x1.56a684fe99fcap-68 }, + { 0x1.869f70ffc1fcbp-73, 0x1.1d9d500e92622p-68 }, + { 0x1.45586a9e82938p-73, 0x1.dc163a555fefbp-69 }, + { 0x1.0ef18dbc017ffp-73, 0x1.8cbe28ca7c426p-69 }, + { 0x1.c338d2435fb4bp-74, 0x1.4a94f1540c9eap-69 }, + { 0x1.77ae3cb88b469p-74, 0x1.136b93820fc76p-69 }, + { 0x1.38bf7be87e681p-74, 0x1.cadeb8c3bba05p-70 }, + { 0x1.0453702b9a5bbp-74, 0x1.7e356a2db5e15p-70 }, + { 0x1.b154294e891dap-75, 0x1.3e50df3387f95p-70 }, + { 0x1.689b85dc875b1p-75, 0x1.09125281c373ap-70 }, + { 0x1.2c0dc90fab5bap-75, 0x1.b969aedac7779p-71 }, + { 0x1.f346b0aa94647p-76, 0x1.6f7d0d10edd84p-71 }, + { 0x1.9f5604d9610bp-76, 0x1.31e8350b95daep-71 }, + { 0x1.597757e14e4e8p-76, 0x1.fd3a5c3ac18bbp-72 }, + { 0x1.1f50b401397f7p-76, 0x1.a7ca8fa24018p-72 }, + { 0x1.ddd8dcb76e388p-77, 0x1.60a5532471804p-72 }, + { 0x1.8d50fcdd2a012p-77, 0x1.256887c26e498p-72 }, + { 0x1.4a512f5483d32p-77, 0x1.e82efb884fa7p-73 }, + { 0x1.129521372a709p-77, 0x1.961449f1f5f93p-73 }, + { 0x1.c872d91eff745p-78, 0x1.51be080b9d49dp-73 }, + { 0x1.7b56e9895b756p-78, 0x1.18df034ba2c47p-73 }, + { 0x1.3b37e1b01d1bdp-78, 0x1.d31877f1753bap-74 }, + { 0x1.05e763ef1c6e1p-78, 0x1.845928aac023dp-74 }, + { 0x1.b3291e83a6ddap-79, 0x1.42d6673958cf7p-74 }, + { 0x1.6978c8d7d61b8p-79, 0x1.0c58552d896bdp-74 }, 
+ { 0x1.2c3987ce2b431p-79, 0x1.be0be95f0126ep-75 }, + { 0x1.f2a6593b4ee39p-80, 0x1.72aab5cc51918p-75 }, + { 0x1.9e0f0cfd57ab4p-80, 0x1.33fd04413c4e8p-75 }, + { 0x1.57c6a75ebbd36p-80, 0x1.ffc132424c87ap-76 }, + { 0x1.1d636b1da2b46p-80, 0x1.a91d6af35687bp-76 }, + { 0x1.d9c6f3705063cp-81, 0x1.6119a09e14fe5p-76 }, + { 0x1.8936d384f421ap-81, 0x1.253fb5c838ba6p-76 }, + { 0x1.464f8c7e074fcp-81, 0x1.e7068fdcaeb4ep-77 }, + { 0x1.0ec1f5aebc21fp-81, 0x1.945fff2eb1b17p-77 }, + { 0x1.c14515cb6f8fp-82, 0x1.4fb5a7146299ap-77 }, + { 0x1.74b15b6eeceb1p-82, 0x1.16ab8334ccb0ap-77 }, + { 0x1.352169fa33216p-82, 0x1.ce965139dad89p-78 }, + { 0x1.0060a522d6818p-82, 0x1.7fe578074e0c8p-78 }, + { 0x1.a933ad3e37ea3p-83, 0x1.3e8d828e807b4p-78 }, + { 0x1.608e37fe916b7p-83, 0x1.084c9533fea9dp-78 }, + { 0x1.24490f08ca22dp-83, 0x1.b68488148e38cp-79 }, + { 0x1.e4940102c0a26p-84, 0x1.6bbe630bdc58cp-79 }, + { 0x1.91a40479b1837p-84, 0x1.2daed7fd23569p-79 }, + { 0x1.4cdb9a0d20ef7p-84, 0x1.f45c523b5ec4ep-80 }, + { 0x1.13d21ec7ce7a5p-84, 0x1.9ee3b5d440d2p-80 }, + { 0x1.c90f21d2d475fp-85, 0x1.57f9f997e1f52p-80 }, + { 0x1.7aa5b8d4b4359p-85, 0x1.1d262b74c69e4p-80 }, + { 0x1.39a647b21bed6p-85, 0x1.d8b50e711660ap-81 }, + { 0x1.03c70a0dadb1dp-85, 0x1.87c4bc616ed3dp-81 }, + { 0x1.ae43ba1c85bb1p-86, 0x1.44a615135e868p-81 }, + { 0x1.6446b3db12c58p-86, 0x1.0cfed72363bb7p-81 }, + { 0x1.26f997cdc041dp-86, 0x1.bdb5f7a82d0f4p-82 }, + { 0x1.e86218ea3e6acp-87, 0x1.7136d3b897e11p-82 }, + { 0x1.9440cec9f5e3ap-87, 0x1.31cf2729ac24dp-82 }, + { 0x1.4e93295651e9bp-87, 0x1.fa860b2bf75f8p-83 }, + { 0x1.14df714b2cc27p-87, 0x1.a36fa64c5b19fp-83 }, + { 0x1.ca3058fde005fp-88, 0x1.5b478418ed951p-83 }, + { 0x1.7b135dc219792p-88, 0x1.1f8035d726d41p-83 }, + { 0x1.3995999427ba7p-88, 0x1.dbf75e60682c2p-84 }, + { 0x1.03604de581436p-88, 0x1.89f0afa1deecap-84 }, + { 0x1.ad067d36fa2c8p-89, 0x1.4602a49df0a52p-84 }, + { 0x1.62c6642f5d4b9p-89, 0x1.0dc2db21eaf21p-84 }, + { 0x1.2556d7a42568ap-89, 0x1.be61355e30a98p-85 }, + { 0x1.e5068065139bep-90, 0x1.7145a7dd1cf8cp-85 }, + { 0x1.90efd5cd13c3p-90, 0x1.31725e0702649p-85 }, + { 0x1.4b62e9374c452p-90, 0x1.f93e90900fd6bp-86 }, + { 0x1.11de133cc6916p-90, 0x1.a1d0c10ff74dfp-86 }, + { 0x1.c49bf95c5f745p-91, 0x1.597928f3e0c7p-86 }, + { 0x1.75f56ab48bd89p-91, 0x1.1d9f316556fccp-86 }, + { 0x1.34f00cbd8ea42p-91, 0x1.d8389849eaf01p-87 }, + { 0x1.fe61cbe17950dp-92, 0x1.8650e1db268ebp-87 }, + { 0x1.a589caf82618cp-92, 0x1.4293ddcb013c1p-87 }, + { 0x1.5c1e107375834p-92, 0x1.0a90025fd130cp-87 }, + { 0x1.1f7319c565581p-92, 0x1.b87eb911fc5efp-88 }, + { 0x1.daa6c6af5c17fp-93, 0x1.6bea387f6b0ap-88 }, + { 0x1.87d63120a742cp-93, 0x1.2c9c915a28ddap-88 }, + { 0x1.436e80df031fp-93, 0x1.f094496a5e827p-89 }, + { 0x1.0aef9bffa708dp-93, 0x1.9a19446f657ccp-89 }, + { 0x1.b890579385cdcp-94, 0x1.52a33b4b8094cp-89 }, + { 0x1.6b84ffdb5d885p-94, 0x1.179841589cdp-89 }, + { 0x1.2be9773700384p-94, 0x1.cda2d93f291abp-90 }, + { 0x1.eecef0206652cp-95, 0x1.7d0e0e7cac5bp-90 }, + { 0x1.9821029662ccfp-95, 0x1.3a804f20fd2f4p-90 }, + { 0x1.5097c74b3d08ep-95, 0x1.038a34010e13fp-90 }, + { 0x1.158fcf12f6c8ep-95, 0x1.ac508371be502p-91 }, + { 0x1.c9b60c296975dp-96, 0x1.61608ea10db83p-91 }, + { 0x1.7958bc88e6006p-96, 0x1.2383e3bce375p-91 }, + { 0x1.370dfa8e149d1p-96, 0x1.e0e820ef7463p-92 }, + { 0x1.0060a594f59c7p-96, 0x1.8c9f67fa9c048p-92 }, + { 0x1.a6925bee98d74p-97, 0x1.471203b047e85p-92 }, + { 0x1.5c351b499632p-97, 0x1.0dae92b93887p-92 }, + { 0x1.1ee518d278c58p-97, 0x1.bcabf2ba981bfp-93 }, + { 0x1.d8b2f8b0b2924p-98, 0x1.6e8f25135d13fp-93 }, + { 0x1.855f0a34582a6p-98, 
0x1.2e219acb023aep-93 }, + { 0x1.40b1881e58e3p-98, 0x1.f1fe817902cebp-94 }, + { 0x1.0818d80634105p-98, 0x1.9a5d5233d8e13p-94 }, + { 0x1.b2ecbb2e8d76cp-99, 0x1.521d0766f8b85p-94 }, + { 0x1.6614d9da549fbp-99, 0x1.168c985c93c95p-94 }, + { 0x1.26c7736a63e7fp-99, 0x1.cae6809d7d445p-95 }, + { 0x1.e546a107b57d5p-100, 0x1.79f71edd3cb51p-95 }, + { 0x1.8f64020effd9cp-100, 0x1.37443c37e4835p-95 }, + { 0x1.48aa64075b15p-100, 0x1.004e8297ce819p-95 }, + { 0x1.0e6e891142764p-100, 0x1.a60ceba01346ap-96 }, + { 0x1.bcfa525d16889p-101, 0x1.5b71dfbe662f9p-96 }, + { 0x1.6e0be1ed4e4ccp-101, 0x1.1dfe04c5b884ap-96 }, + { 0x1.2d14568fa3103p-101, 0x1.d6c299b6b03dep-97 }, + { 0x1.ef39c9c67da7p-102, 0x1.8366f8264d161p-97 }, + { 0x1.973b86e9a718fp-102, 0x1.3ec401194be5fp-97 }, + { 0x1.4ed55e6d4d5dfp-102, 0x1.0641ea45be131p-97 }, + { 0x1.1345b1de4a541p-102, 0x1.af7b06dd7c2fap-98 }, + { 0x1.c48e8cf8e20edp-103, 0x1.62e7924beab28p-98 }, + { 0x1.73f6cd7db5a56p-103, 0x1.23e2123cac1dcp-98 }, + { 0x1.31afb2e91937bp-103, 0x1.e00be39adba8fp-99 }, + { 0x1.f6600b76754fcp-104, 0x1.8ab4ee2717624p-99 }, + { 0x1.9cc2881babafp-104, 0x1.447fa5b4e25fep-99 }, + { 0x1.5316d5b010b17p-104, 0x1.0abf02c055867p-99 }, + { 0x1.1688993cfebe3p-104, 0x1.b67d9f35f4de8p-100 }, + { 0x1.c98758b0a4ebap-105, 0x1.685ccfe1e2ab5p-100 }, + { 0x1.77baf72da4868p-105, 0x1.281e65593d67p-100 }, + { 0x1.3484c1e2418cbp-105, 0x1.e698bd1000fd2p-101 }, + { 0x1.fa991c211034p-106, 0x1.8fc0326c87b11p-101 }, + { 0x1.9fe006460b912p-106, 0x1.485d5ed97243ep-101 }, + { 0x1.555b844a27ecdp-106, 0x1.0db191585c5a2p-101 }, + { 0x1.182875c9f3984p-106, 0x1.baf50ff65044dp-102 }, + { 0x1.cbce2423a80acp-107, 0x1.6bb8ebe73c54ap-102 }, + { 0x1.794741d4d28c6p-107, 0x1.2a9fd1221e357p-102 }, + { 0x1.3586a18110b0ep-107, 0x1.ea4b746dbeae3p-103 }, + { 0x1.fbd1c1dcb3991p-108, 0x1.9271dfe5687e7p-103 }, + { 0x1.a085cf5d6c87ep-108, 0x1.4a4b9ae2c857dp-103 }, + { 0x1.559911f8b7812p-108, 0x1.0f0c2d578f06ap-103 }, + { 0x1.181ddd71c27fbp-108, 0x1.bccd0201398bap-104 }, + { 0x1.cb5889458c00ep-109, 0x1.6cec95dfef21ap-104 }, + { 0x1.789499da6bff1p-109, 0x1.2b5ae7721763fp-104 }, + { 0x1.34b0b5ddf82c6p-109, 0x1.eb1327842cc63p-105 }, + { 0x1.fa04646636ebep-110, 0x1.92bda7bca05b7p-105 }, + { 0x1.9eb0ea42d451ep-110, 0x1.4a4186866270ap-105 }, + { 0x1.53ce6234f7db7p-110, 0x1.0ec8a57831ec5p-105 }, + { 0x1.1668fdbb007d5p-110, 0x1.bbfd05e1b64f3p-106 }, + { 0x1.c8289c5fd0187p-111, 0x1.6bf24d893426cp-106 }, + { 0x1.75a62b0407aefp-111, 0x1.2a4c4fb42b862p-106 }, + { 0x1.3206cc37b0e4ap-111, 0x1.e8ec43d273fbap-107 }, + { 0x1.f53937c26236ep-112, 0x1.90a22ee0d506ep-107 }, + { 0x1.9a69ad7793258p-112, 0x1.483f4fee6553cp-107 }, + { 0x1.50039cbf56e41p-112, 0x1.0ce82f0139653p-107 }, + { 0x1.13119a81ee824p-112, 0x1.b888d3fea2a71p-108 }, + { 0x1.c24cdc6a6909bp-113, 0x1.68ce8cbb7eaebp-108 }, + { 0x1.7089487e1182ep-113, 0x1.2778e05f0f826p-108 }, + { 0x1.2d94fe2dcd5a4p-113, 0x1.e3e0a1bcb7b9p-109 }, + { 0x1.ed85fe218f015p-114, 0x1.8c29185861611p-109 }, + { 0x1.93c37ffa2be3p-114, 0x1.444e2559eb861p-109 }, + { 0x1.4a49efe08b764p-114, 0x1.09735c9244f77p-109 }, + { 0x1.0e26d33274acdp-114, 0x1.b28030446d467p-110 }, + { 0x1.b9dfc560135fp-115, 0x1.638fa554a9791p-110 }, + { 0x1.6955081ac80b2p-115, 0x1.22ed7a20d2031p-110 }, + { 0x1.276f565251c73p-115, 0x1.dc07399fb9ebdp-111 }, + { 0x1.e30d639687648p-116, 0x1.8566bbf3afdccp-111 }, + { 0x1.8adc46e842374p-116, 0x1.3e7fef514c8f7p-111 }, + { 0x1.42bb0eedd3fb2p-116, 0x1.0479dd0162987p-111 }, + { 0x1.07beb0edff1b8p-116, 0x1.a9fe7272a642bp-112 }, + { 0x1.af070915be74ep-117, 0x1.5c4d5495043b3p-112 }, + 
{ 0x1.602994f04daa5p-117, 0x1.1cbea64272b5fp-112 }, + { 0x1.1fb139d7ad13p-117, 0x1.d18375dee0b86p-113 }, + { 0x1.d5fdfa65dd70dp-118, 0x1.7c798c690caf6p-113 }, + { 0x1.7fdb85ec65bd4p-118, 0x1.36eec953c25e3p-113 }, + { 0x1.39787263ebbcap-118, 0x1.fc2409fc1812ep-114 }, + { 0x1.ffeb0495cc103p-119, 0x1.9f29b80329143p-114 }, + { 0x1.a1f276c1aeb71p-119, 0x1.5328106ecc8f8p-114 }, + { 0x1.552f40714fe54p-119, 0x1.1507fc4d2f4bap-114 }, + { 0x1.167c9d827337cp-119, 0x1.c484291d11ffp-115 }, + { 0x1.c690e28b6a9bfp-120, 0x1.7189333483e3bp-115 }, + { 0x1.72f13b97db104p-120, 0x1.2dbc3e931f24dp-115 }, + { 0x1.2eaa616a9b21cp-120, 0x1.ecb050b3055ap-116 }, + { 0x1.edda16b7edc87p-121, 0x1.9231c8255bcdbp-116 }, + { 0x1.92da9c960076ap-121, 0x1.4848161f4e509p-116 }, + { 0x1.48955baf138afp-121, 0x1.0beb55467080ap-116 }, + { 0x1.0bf90e157d9dap-121, 0x1.b542338309321p-117 }, + { 0x1.b5082a5d8de09p-122, 0x1.64c56b8fb3cecp-117 }, + { 0x1.6454856772fedp-122, 0x1.231052b5f7dd6p-117 }, + { 0x1.227ecea87251dp-122, 0x1.dadb937ed07ebp-118 }, + { 0x1.d99724acabf71p-123, 0x1.834eb55a1d18ep-118 }, + { 0x1.81ff31715569ap-123, 0x1.3bdc43dd8955fp-118 }, + { 0x1.3a90e48619574p-123, 0x1.018fd4cd15479p-118 }, + { 0x1.005296113b586p-123, 0x1.a3fee5158c03fp-119 }, + { 0x1.a1acf8c750894p-124, 0x1.5664a8518a142p-119 }, + { 0x1.54421936100c1p-124, 0x1.171860917e7c8p-119 }, + { 0x1.152813e135602p-124, 0x1.c6f152728fb8fp-120 }, + { 0x1.c375a4cba7b23p-125, 0x1.72bf4ab4db677p-120 }, + { 0x1.6fa5568fa20f3p-125, 0x1.2e18c95c4bfb1p-120 }, + { 0x1.2b5b13ef0805cp-125, 0x1.ec41a3d4cf576p-121 }, + { 0x1.e77117811a7d2p-126, 0x1.91022d83bf8f5p-121 }, + { 0x1.8ccd934db2cbp-126, 0x1.46a292659269ep-121 }, + { 0x1.42faa33070d2ap-126, 0x1.0a05da41d6048p-121 }, + { 0x1.06db98d7f6125p-126, 0x1.b14375f322de2p-122 }, + { 0x1.abcdbdfcc9f7cp-127, 0x1.60c75486158bp-122 }, + { 0x1.5c15c23fbb403p-127, 0x1.1f35bc35fb59fp-122 }, + { 0x1.1b2fdb7cab6dfp-127, 0x1.d39954e0a9d3dp-123 }, + { 0x1.ccb8a64624f6cp-128, 0x1.7c98ab66270f5p-123 }, + { 0x1.76bb52e82b59ap-128, 0x1.35be6eb898758p-123 }, + { 0x1.30c117f001ac3p-128, 0x1.f819edd38db9cp-124 }, + { 0x1.efa0e49e3feccp-129, 0x1.9a2821242ebdp-124 }, + { 0x1.92fa046d58d4ep-129, 0x1.4dadd528d6ea9p-124 }, + { 0x1.479ae4e865feep-129, 0x1.0f6d9e092345cp-124 }, + { 0x1.0a4c603089f16p-129, 0x1.b987187720ae4p-125 }, + { 0x1.b0e03e96a5485p-130, 0x1.6711ad9310ce1p-125 }, + { 0x1.5fc89a9e03199p-130, 0x1.23f97aea9f29fp-125 }, + { 0x1.1dd90a3522c75p-130, 0x1.dac6b554960ffp-126 }, + { 0x1.d07c0b8b30398p-131, 0x1.81f77dc55f2bdp-126 }, + { 0x1.795540ea5dda7p-131, 0x1.39bb36d1a51dap-126 }, + { 0x1.327f191dd6247p-131, 0x1.fdf7c425dfb89p-127 }, + { 0x1.f1db008e061d6p-132, 0x1.9e6c7f42ee3ap-127 }, + { 0x1.944b7c8850269p-132, 0x1.50bd38f4b0e14p-127 }, + { 0x1.4846e1e475567p-132, 0x1.11954fcd9d596p-127 }, + { 0x1.0a8512d6deebp-132, 0x1.bc7d8a23288e1p-128 }, + { 0x1.b0b57b848dfd5p-133, 0x1.69099571fea27p-128 }, + { 0x1.5f385601a1095p-133, 0x1.25378a982372p-128 }, + { 0x1.1d0aee3f21eaep-133, 0x1.dc36feecfa2bap-129 }, + { 0x1.ce9ce0f1b56b8p-134, 0x1.82a9fb7ad076bp-129 }, + { 0x1.775af322a6fb6p-134, 0x1.39ea243c7bf71p-129 }, + { 0x1.3084e2fb958e5p-134, 0x1.fda4af81b306ap-130 }, + { 0x1.ee0aaff5c7275p-135, 0x1.9da7a2c5ab52cp-130 }, + { 0x1.90b5b261712acp-135, 0x1.4fb44aa933f5cp-130 }, + { 0x1.44f853ca3d2a1p-135, 0x1.1068e39733d5fp-130 }, + { 0x1.07839b24e2329p-135, 0x1.ba0b385a9673fp-131 }, + { 0x1.ab4ef712ea53cp-136, 0x1.669cb88b98bb4p-131 }, + { 0x1.5a6a27edc2aafp-136, 0x1.22e458ff074e2p-131 }, + { 0x1.18ccfb2383c0dp-136, 0x1.d7dccacf16bdfp-132 }, + 
{ 0x1.c72c7d427b5c7p-137, 0x1.7ea9a57d9c3fdp-132 }, + { 0x1.70debd3477d7cp-137, 0x1.364981b4fcaccp-132 }, + { 0x1.2ae4c8505c4dcp-137, 0x1.f723b60a4c45ap-133 }, + { 0x1.e45347f37826dp-138, 0x1.97e0b5db827a8p-133 }, + { 0x1.8859d9d834871p-138, 0x1.4a9cae44d02aap-133 }, + { 0x1.3dcdd6f53a761p-138, 0x1.0bf347561e06fp-133 }, + { 0x1.0163c7a1b8ce3p-138, 0x1.b246ea577dcd5p-134 }, + { 0x1.a0de9e4d0326ap-139, 0x1.5fe1a8f2ffd47p-134 }, + { 0x1.518a7407eb90ep-139, 0x1.1d15869af1a46p-134 }, + { 0x1.1146574533e59p-139, 0x1.cde08f63664fdp-135 }, + { 0x1.ba6f77161f191p-140, 0x1.761ba88bf6eedp-135 }, + { 0x1.661c59f17faep-140, 0x1.2efafc89163c3p-135 }, + { 0x1.21d2894bdd4c7p-140, 0x1.eab12c8aa7e5p-136 }, + { 0x1.d50e0eba3e44dp-141, 0x1.8d4d432dee077p-136 }, + { 0x1.7b84a5753cf1fp-141, 0x1.41a589d11cb19p-136 }, + { 0x1.33091416396dbp-141, 0x1.045db9ec2ba81p-136 }, + { 0x1.f0bb3ff173143p-142, 0x1.a57861242277fp-137 }, + { 0x1.91c3cacc75aaap-142, 0x1.551681b8d361p-137 }, + { 0x1.44ea256a84bbp-142, 0x1.140098b38820cp-137 }, + { 0x1.06bb841410434p-142, 0x1.be9e2feb561ep-138 }, + { 0x1.a8d98b0d5771p-143, 0x1.694e9fdcb7be5p-138 }, + { 0x1.57755a2313bdfp-143, 0x1.24419d9ce37ffp-138 }, + { 0x1.15a03d39bca43p-143, 0x1.d8bf1578b3aacp-139 }, + { 0x1.c0c4e9f387792p-144, 0x1.7e4dfe2cee6a2p-139 }, + { 0x1.6aa9b63079411p-144, 0x1.3520b0bf08a51p-139 }, + { 0x1.250ad98a67e4fp-144, 0x1.f3daa3dd37f3ap-140 }, + { 0x1.d9842421f4af1p-145, 0x1.94140b3abb78ep-140 }, + { 0x1.7e859d0226582p-145, 0x1.469d2facc66f7p-140 }, + { 0x1.34f9e5d4c96d3p-145, 0x1.07f7c6b04c092p-140 }, + { 0x1.f314a5f5af6d7p-146, 0x1.aa9f80ec12e52p-141 }, + { 0x1.9306ca687d568p-146, 0x1.58b5e63278412p-141 }, + { 0x1.456b681315dafp-146, 0x1.167dcc97a0fd3p-141 }, + { 0x1.06b98180e66fp-146, 0x1.c1ee5bab4ede7p-142 }, + { 0x1.a82a4c036e3f3p-147, 0x1.6b69077bfc3c7p-142 }, + { 0x1.565cda5d05a6ap-147, 0x1.257dcc5bc2717p-142 }, + { 0x1.144d77262f022p-147, 0x1.d9fdd2296338fp-143 }, + { 0x1.bdec7b50a66cp-148, 0x1.7eb427b4ddd71p-143 }, + { 0x1.67cb265d8483ap-148, 0x1.34f5aee91217p-143 }, + { 0x1.224399b226996p-148, 0x1.f2ca4dc8ff69fp-144 }, + { 0x1.d448f86c23d12p-149, 0x1.92943634830d2p-144 }, + { 0x1.79b2a15ae0faap-149, 0x1.44e2d8e947442p-144 }, + { 0x1.3098d833c2dap-149, 0x1.0627b1e47c261p-144 }, + { 0x1.eb3aa595948f3p-150, 0x1.a705784809825p-145 }, + { 0x1.8c0f08dff4e68p-150, 0x1.554226cd542efp-145 }, + { 0x1.3f49a8880f6adp-150, 0x1.1343e7a202e9p-145 }, + { 0x1.015dd1c62a082p-150, 0x1.bc0384ab3550dp-146 }, + { 0x1.9edb80143a705p-151, 0x1.660fe966c4e28p-146 }, + { 0x1.4e52056f2dec4p-151, 0x1.20b6b60dae611p-146 }, + { 0x1.0d62a769875ep-151, 0x1.d1893fc15ba16p-147 }, + { 0x1.b2128dd015485p-152, 0x1.7747e31ddd25cp-147 }, + { 0x1.5dad6d3a16694p-152, 0x1.2e7c997078049p-147 }, + { 0x1.19a81ef58dfc6p-152, 0x1.e790d89e8e564p-148 }, + { 0x1.c5ae1b79c4ee8p-153, 0x1.88e545d12ba57p-148 }, + { 0x1.6d56e11abc8a7p-153, 0x1.3c919aea9787p-148 }, + { 0x1.262a204b39df1p-153, 0x1.fe13c6f07b6aep-149 }, + { 0x1.d9a774b67b183p-154, 0x1.9ae2b16a9550ap-149 }, + { 0x1.7d48e51f6d6edp-154, 0x1.4af14f857334ep-149 }, + { 0x1.32e43016e50e4p-154, 0x1.0a8564eab8ff5p-149 }, + { 0x1.edf747f9f14f1p-155, 0x1.ad3a33350402p-150 }, + { 0x1.8d7d80e14b91p-155, 0x1.5996d7e13f467p-150 }, + { 0x1.3fd1708b687cbp-155, 0x1.1636f3d76858ap-150 }, + { 0x1.014ad3fec9ec4p-155, 0x1.bfe545fce7a55p-151 }, + { 0x1.9dee40ecc2982p-156, 0x1.687ce08618977p-151 }, + { 0x1.4ceca2b27454p-156, 0x1.221a377d62eb4p-151 }, + { 0x1.0bbd071377b87p-156, 0x1.d2dcd30499eb7p-152 }, + { 0x1.ae9438e9a5c0bp-157, 0x1.779da2df7a30cp-152 }, + { 
0x1.5a30285652adp-157, 0x1.2e2a7c1fe1c5fp-152 }, + { 0x1.164daef1c2b15p-157, 0x1.e61933d473856p-153 }, + { 0x1.bf6806876a635p-158, 0x1.86f2e6e7e582ap-153 }, + { 0x1.67960688424efp-158, 0x1.3a62b4892ce6ep-153 }, + { 0x1.20f7f47f404a7p-158, 0x1.f99234ed0089ep-154 }, + { 0x1.d061d530972c5p-159, 0x1.9676058974913p-154 }, + { 0x1.7517e8c57f622p-159, 0x1.46bd7c1e28efp-154 }, + { 0x1.2bb6ba79809edp-159, 0x1.069f8cb02119fp-154 }, + { 0x1.e17962871247p-160, 0x1.a61febb6d574dp-155 }, + { 0x1.82af24bbe81ddp-160, 0x1.53351984f5d61p-155 }, + { 0x1.3684a09debb18p-160, 0x1.108b4faaa8971p-155 }, + { 0x1.f2a603a977e7cp-161, 0x1.b5e91e3ee196dp-156 }, + { 0x1.9054beadf5a51p-161, 0x1.5fc381e001854p-156 }, + { 0x1.415c074fc9065p-161, 0x1.1a8782bc000bep-156 }, + { 0x1.01ef55a0092e3p-161, 0x1.c5c9be5ba37d4p-157 }, + { 0x1.9e016e74801cbp-162, 0x1.6c625c9dd5c05p-157 }, + { 0x1.4c3713bae315dp-162, 0x1.248f08aa2a9f5p-157 }, + { 0x1.0a8cf82738469p-162, 0x1.d5b98efc2e8d5p-158 }, + { 0x1.abada51b7b47ep-163, 0x1.790b07dcc17ddp-158 }, + { 0x1.570fb47030aa8p-163, 0x1.2e9c8b4dec3dep-158 }, + { 0x1.13270ae279a57p-163, 0x1.e5affac730013p-159 }, + { 0x1.b951931589ad6p-164, 0x1.85b69d604d483p-159 }, + { 0x1.61dfa678e3296p-164, 0x1.38aa7fa8655e3p-159 }, + { 0x1.1bb88966006c4p-164, 0x1.f5a41ad29abd6p-160 }, + { 0x1.c6e52f00f28e6p-165, 0x1.925df815332e1p-160 }, + { 0x1.6ca07adb2cabep-165, 0x1.42b32a68b6433p-160 }, + { 0x1.243c4de072741p-165, 0x1.02c65f05a223cp-160 }, + { 0x1.d4603cf73627ep-166, 0x1.9ef9ba1f58105p-161 }, + { 0x1.774b9c8b0652p-166, 0x1.4cb0a4ddc2264p-161 }, + { 0x1.2cad15ed5f00dp-166, 0x1.0ab038a2ddd17p-161 }, + { 0x1.e1ba565f2f2dap-167, 0x1.ab82536c08c11p-162 }, + { 0x1.81da56c03901cp-167, 0x1.569ce24f30cadp-162 }, + { 0x1.350587b61e2e7p-167, 0x1.128ac3f80b9acp-162 }, + { 0x1.eeeaf2386ba73p-168, 0x1.b7f008c184953p-163 }, + { 0x1.8c45dba9ebaffp-168, 0x1.6071b5b7d5f0bp-163 }, + { 0x1.3d40375ab2fc9p-168, 0x1.1a5112ad78884p-163 }, + { 0x1.fbe96dd52dd2ap-169, 0x1.c43afb43abf3ap-164 }, + { 0x1.96874b77050b3p-169, 0x1.6a28d7dab475p-164 }, + { 0x1.4557ac9b8a4ffp-169, 0x1.21fe234726979p-164 }, + { 0x1.04568afbad70bp-169, 0x1.d05b30647f5b6p-165 }, + { 0x1.a097bba9c5bbap-170, 0x1.73bbedaae952fp-165 }, + { 0x1.4d4668bc3c638p-170, 0x1.298ce64edbc52p-165 }, + { 0x1.0a969821c25d4p-170, 0x1.dc489a35fd89p-166 }, + { 0x1.aa703eac27071p-171, 0x1.7d248efdebaf1p-166 }, + { 0x1.5506ec96ce1d8p-171, 0x1.30f843b6c62b7p-166 }, + { 0x1.10b0827e1c59fp-171, 0x1.e7fb2011e1175p-167 }, + { 0x1.b409eb99c2287p-172, 0x1.865c4d7ebd336p-167 }, + { 0x1.5c93bed6568e9p-172, 0x1.383b206d0bb99p-167 }, + { 0x1.169ff47b694c6p-172, 0x1.f36aa78ac249dp-168 }, + { 0x1.bd5de633517f7p-173, 0x1.8f5cbbd7e3bd9p-168 }, + { 0x1.63e7724f64774p-173, 0x1.3f5064180659dp-168 }, + { 0x1.1c60a3dd2224ep-173, 0x1.fe8f1d993bb19p-169 }, + { 0x1.c66566ef40333p-174, 0x1.981f750955121p-169 }, + { 0x1.6afcac6c09d1ap-174, 0x1.4632fef2669ecp-169 }, + { 0x1.21ee56dbc8c6ap-174, 0x1.04b03ffb7174ap-169 }, + { 0x1.cf19c31a391acp-175, 0x1.a09e23dee12dbp-170 }, + { 0x1.71ce2ba111a68p-175, 0x1.4cddefbe00daep-170 }, + { 0x1.2744e94597dfp-175, 0x1.09eb734c1a314p-170 }, + { 0x1.d77474fa3c96fp-176, 0x1.a8d28a7b21f9ep-171 }, + { 0x1.7856cde19858bp-176, 0x1.534c49c3a48ap-171 }, + { 0x1.2c60519b06073p-176, 0x1.0ef5469afe541p-171 }, + { 0x1.df6f23e67822ep-177, 0x1.b0b689ea896fp-172 }, + { 0x1.7e9197060941ap-177, 0x1.59793ad60d8abp-172 }, + { 0x1.313ca61e59763p-177, 0x1.13c9ee6b2a529p-172 }, + { 0x1.e703ac45eb1a5p-178, 0x1.b84429b1d33d8p-173 }, + { 0x1.8479b71b66ff2p-178, 0x1.5f60114dc317ap-173 }, + { 
0x1.35d621cd7892fp-178, 0x1.1865baa279b03p-173 }, + { 0x1.ee2c2766d39aep-179, 0x1.bf759f4ae6481p-174 }, + { 0x1.8a0a908fbee34p-179, 0x1.64fc41f392bcdp-174 }, + { 0x1.3a29293d26666p-179, 0x1.1cc51b3533d1bp-174 }, + { 0x1.f4e2f320ed2f5p-180, 0x1.c645558315ad7p-175 }, + { 0x1.8f3fbe30bc1d8p-180, 0x1.6a496dcf4682p-175 }, + { 0x1.3e324f4cf0981p-180, 0x1.20e4a4b8e031ep-175 }, + { 0x1.fb22b934b993p-181, 0x1.ccadf3adb1afp-176 }, + { 0x1.941518f17ca26p-181, 0x1.6f4367d03dbd8p-176 }, + { 0x1.41ee59ab3f625p-181, 0x1.24c114d62226p-176 }, + { 0x1.00733b2d2d2a7p-181, 0x1.d2aa649df6e65p-177 }, + { 0x1.9886bd6d1085bp-182, 0x1.73e63a45afd4dp-177 }, + { 0x1.455a452136a6p-182, 0x1.285756918be22p-177 }, + { 0x1.0314c07978175p-182, 0x1.d835dd5ba6335p-178 }, + { 0x1.9c91111b6c15fp-183, 0x1.782e2c1c97a81p-178 }, + { 0x1.4873499e69a71p-183, 0x1.2ba486638ab1ep-178 }, + { 0x1.0573c7a800f18p-183, 0x1.dd4be385e972p-179 }, + { 0x1.a030c72f0cf33p-184, 0x1.7c17c5d99552cp-179 }, + { 0x1.4b36ddfcc8743p-184, 0x1.2ea5f617d321fp-179 }, + { 0x1.078e5ec28bafdp-184, 0x1.e1e853589fe15p-180 }, + { 0x1.a362e51221b9fp-185, 0x1.7f9fd64579e1ap-180 }, + { 0x1.4da2bb75a5c65p-185, 0x1.3159306d0abdp-180 }, + { 0x1.0962c95c3eb5p-185, 0x1.e6076548c0765p-181 }, + { 0x1.a624c67aa97dfp-186, 0x1.82c376c3acddfp-181 }, + { 0x1.4fb4e0c13d49p-186, 0x1.33bbfc6dd55a6p-181 }, + { 0x1.0aef82f484486p-186, 0x1.e9a5b32d2ef52p-182 }, + { 0x1.a874210dbadcfp-187, 0x1.85800f4a2d262p-182 }, + { 0x1.516b94dabb86dp-187, 0x1.35cc607ce4fd8p-182 }, + { 0x1.0c33410fd4c56p-187, 0x1.ecc03cea2935dp-183 }, + { 0x1.aa4f078af0321p-188, 0x1.87d359f39448ep-183 }, + { 0x1.52c5696370c9dp-188, 0x1.3788a50e33e44p-183 }, + { 0x1.0d2cf5025ba2dp-188, 0x1.ef546c9652b0ap-184 }, + { 0x1.abb3ec79d594dp-189, 0x1.89bb66243bfd5p-184 }, + { 0x1.53c13ca08d951p-189, 0x1.38ef570827673p-184 }, + { 0x1.0ddbcd68fc943p-189, 0x1.f1601a115b514p-185 }, + { 0x1.aca1a45423b35p-190, 0x1.8b369b3c6ec4fp-185 }, + { 0x1.545e3b0f8838ap-190, 0x1.39ff49c7fe5e8p-185 }, + { 0x1.0e3f374dd9d68p-190, 0x1.f2e18e05495b4p-186 }, + { 0x1.ad1767288e013p-191, 0x1.8c43bad265564p-186 }, + { 0x1.549be08e15927p-191, 0x1.3ab798c59d4c2p-186 }, + { 0x1.0e56def61fbc4p-191, 0x1.f3d7844c8a592p-187 }, + { 0x1.ad14d1b2f0b5fp-192, 0x1.8ce1e26fb8214p-187 }, + { 0x1.5479f9137160bp-192, 0x1.3b17a8d383f04p-187 }, + { 0x1.0e22b05782284p-192, 0x1.f4412db819edfp-188 }, + { 0x1.ac99e5e7b9269p-193, 0x1.8d108ccedcd75p-188 }, + { 0x1.53f8a0f98a8b8p-193, 0x1.3b1f28f8795cap-188 }, + { 0x1.0da2d734853ffp-193, 0x1.f41e3132440dap-189 }, + { 0x1.aba70af1767bp-194, 0x1.8ccf9296410aep-189 }, + { 0x1.531844d58365ep-194, 0x1.3ace12e143377p-189 }, + { 0x1.0cd7bedf59779p-194, 0x1.f36eac3bc78c2p-190 }, + { 0x1.aa3d0ca096eedp-195, 0x1.8c1f2a8f92477p-190 }, + { 0x1.51d9a0dfd2e93p-195, 0x1.3a24aae988ae7p-190 }, + { 0x1.0bc211a3c2859p-195, 0x1.f23332c263066p-191 }, + { 0x1.a85d1a4e6bedcp-196, 0x1.8affe95ac6f2ap-191 }, + { 0x1.503dbfed30324p-196, 0x1.39237fbbcfa18p-191 }, + { 0x1.0a62b7d92f095p-196, 0x1.f06cce511da3ep-192 }, + { 0x1.a608c535a2ba1p-197, 0x1.8972c09d7f45cp-192 }, + { 0x1.4e45f9fa4adffp-197, 0x1.37cb698950bdap-192 }, + { 0x1.08bad69ed20a4p-197, 0x1.ee1cfc9be3df9p-193 }, + { 0x1.a341fe436d2d7p-198, 0x1.8778fdb058321p-193 }, + { 0x1.4bf3f24d273a5p-198, 0x1.361d88db2b95bp-193 }, + { 0x1.06cbce44363ecp-198, 0x1.eb45ad695330ap-194 }, + { 0x1.a00b13659be7cp-199, 0x1.851447ccc879bp-194 }, + { 0x1.4949952fc2371p-199, 0x1.341b44ff4c3c6p-194 }, + { 0x1.0497386163a39p-199, 0x1.e7e93fdecaep-195 }, + { 0x1.9c66ac5ae65b3p-200, 0x1.82469dbf1833ep-195 }, + { 
0x1.464915486577bp-200, 0x1.31c64a141680ep-195 }, + { 0x1.021ee5a248c7fp-200, 0x1.e40a7f340982ap-196 }, + { 0x1.9857c70b8b2bcp-201, 0x1.7f125320f1e94p-196 }, + { 0x1.42f4e894cc71ap-201, 0x1.2f2086b6a5cf4p-196 }, + { 0x1.fec9b69351b7p-202, 0x1.dfac9ed4c27cep-197 }, + { 0x1.93e1b371520a1p-202, 0x1.7b7a0d21f0262p-197 }, + { 0x1.3f4fc50de840ap-202, 0x1.2c2c295822108p-197 }, + { 0x1.f8d6a0e0a9508p-203, 0x1.dad335f7aacdbp-198 }, + { 0x1.8f080f16c57cp-203, 0x1.7780bee4609a1p-198 }, + { 0x1.3b5c9cfaada16p-203, 0x1.28eb9d3f5000ap-198 }, + { 0x1.f269560bdbf92p-204, 0x1.d5823ab37d92ep-199 }, + { 0x1.89cec0363502dp-204, 0x1.7329a5753ca24p-199 }, + { 0x1.371e9af8e6ccfp-204, 0x1.2561873c1cc7ap-199 }, + { 0x1.eb86f931c309dp-205, 0x1.cfbdfc9b64d6ep-200 }, + { 0x1.8439f081b525ap-205, 0x1.6e7843670c8d2p-200 }, + { 0x1.32991dc38028ep-205, 0x1.2190c2136fc76p-200 }, + { 0x1.e434fdd743954p-206, 0x1.c98b1eed08258p-201 }, + { 0x1.7e4e079de1a2ep-206, 0x1.69705c180d6c1p-201 }, + { 0x1.2dcfb3be31ebdp-206, 0x1.1d7c5aaa0949p-201 }, + { 0x1.dc7920bafc5dcp-207, 0x1.c2ee925b3e3f6p-202 }, + { 0x1.780fa5599d558p-207, 0x1.6415eeac7f744p-202 }, + { 0x1.28c6164ec1235p-207, 0x1.19278bf59ff34p-202 }, + { 0x1.d459605b63623p-208, 0x1.bbed8e8100752p-203 }, + { 0x1.71839bad6a45bp-208, 0x1.5e6d30c67b96bp-203 }, + { 0x1.2380250c57526p-208, 0x1.1495babbc8d8ep-203 }, + { 0x1.cbdbf53eed588p-209, 0x1.b48d8b08c37b5p-204 }, + { 0x1.6aaee88d3a5e6p-209, 0x1.587a8905112ebp-204 }, + { 0x1.1e01e0cda0c0ep-209, 0x1.0fca71267dd26p-204 }, + { 0x1.c3074a0c1c67dp-210, 0x1.acd43894c1f06p-205 }, + { 0x1.6396af97c5f7fp-210, 0x1.52428954b7c2fp-205 }, + { 0x1.184f669e7e645p-210, 0x1.0ac95a364b406p-205 }, + { 0x1.b9e1f37f768c9p-211, 0x1.a4c779750fb77p-206 }, + { 0x1.5c4033ae88d94p-211, 0x1.4bc9e91b546a8p-206 }, + { 0x1.126ceaa621095p-211, 0x1.05963d1a5105bp-206 }, + { 0x1.b072a84d6770bp-212, 0x1.9c6d5a387a6d7p-207 }, + { 0x1.54b0d08180ac6p-212, 0x1.45157f4a2e598p-207 }, + { 0x1.0c5eb30658611p-212, 0x1.0034f87652744p-207 }, + { 0x1.a6c038fdf5aedp-213, 0x1.93cc0a254a9f5p-208 }, + { 0x1.4cedf419a9b38p-213, 0x1.3e2a3c60327aap-208 }, + { 0x1.062912bcc23f9p-213, 0x1.f552fb3e1c70bp-209 }, + { 0x1.9cd187cff951cp-214, 0x1.8ae9d3a6eb66fp-209 }, + { 0x1.44fd186d008c2p-214, 0x1.370d2466d3327p-209 }, + { 0x1.ffa0c91caab55p-215, 0x1.e9ef97aa04b46p-210 }, + { 0x1.92ad80b12a09bp-215, 0x1.81cd14bd535bbp-210 }, + { 0x1.3ce3bd0683046p-215, 0x1.2fc348f3a8121p-210 }, + { 0x1.f2b20c0b002abp-216, 0x1.de47d70b3398cp-211 }, + { 0x1.885b1157e885cp-216, 0x1.787c377ac34cdp-211 }, + { 0x1.34a760cc47acap-216, 0x1.2851c338b22e4p-211 }, + { 0x1.e58ea51580badp-217, 0x1.d263d33512bb6p-212 }, + { 0x1.7de1218b19542p-217, 0x1.6efdaa9c0e45ep-212 }, + { 0x1.2c4d7bed4d522p-217, 0x1.20bdae2cd61c6p-212 }, + { 0x1.d83f3d3e6d15p-218, 0x1.c64ba5bdb46dep-213 }, + { 0x1.73468ba3c29b8p-218, 0x1.6557da47246f7p-213 }, + { 0x1.23db7a001a935p-218, 0x1.190c20d5b5808p-213 }, + { 0x1.cacc668087b83p-219, 0x1.ba075f0192b6p-214 }, + { 0x1.689215536317fp-219, 0x1.5b9128fb09361p-214 }, + { 0x1.1b56b45aac06fp-219, 0x1.114228bb99133p-214 }, + { 0x1.bd3e92f58e3aep-220, 0x1.ad9efd6e7e35p-215 }, + { 0x1.5dca68b92a62fp-220, 0x1.51afe8bbb6b6cp-215 }, + { 0x1.12c46cab86e91p-220, 0x1.0964c48f92b05p-215 }, + { 0x1.af9e0c680145ap-221, 0x1.a11a652260dp-216 }, + { 0x1.52f60dcf5b39p-221, 0x1.47ba5483b6e8fp-216 }, + { 0x1.0a29c7db10f7p-221, 0x1.0178df0b67157p-216 }, + { 0x1.a1f2ec5b27de2p-222, 0x1.948157e97fbd7p-217 }, + { 0x1.481b643932becp-222, 0x1.3db68a0470a4fp-217 }, + { 0x1.018bc93b8e2e5p-222, 0x1.f306942454ae6p-218 }, + { 
0x1.9445149305037p-223, 0x1.87db6da6dd3cap-218 }, + { 0x1.3d409d78b6819p-223, 0x1.33aa83bd4deabp-218 }, + { 0x1.f1de9c1ab95aap-224, 0x1.e311742f9561bp-219 }, + { 0x1.869c2824b4b6bp-224, 0x1.7b300d303ed2cp-219 }, + { 0x1.326bb792c8c5bp-224, 0x1.299c1370fc2d1p-219 }, + { 0x1.e0b212b870715p-225, 0x1.d31b83aa1a53bp-220 }, + { 0x1.78ff85165ac91p-225, 0x1.6e8665a634affp-220 }, + { 0x1.27a27826da7a5p-225, 0x1.1f90dcff1976ep-220 }, + { 0x1.cf9b0072f8176p-226, 0x1.c32d9c998168ap-221 }, + { 0x1.6b763e947db08p-226, 0x1.61e5684f4d137p-221 }, + { 0x1.1cea67fe8699cp-226, 0x1.158e51a7ac97ep-221 }, + { 0x1.bea20cad09b1fp-227, 0x1.b350464c51c99p-222 }, + { 0x1.5e0717c155a1cp-227, 0x1.5553c2fc66728p-222 }, + { 0x1.1248cf18568a2p-227, 0x1.0b99abbccdbb1p-222 }, + { 0x1.adcf760300963p-228, 0x1.a38baebfb68e4p-223 }, + { 0x1.50b87f214792dp-228, 0x1.48d7dafad7ffep-223 }, + { 0x1.07c2b12fe4dbap-228, 0x1.01b7eac5ea688p-223 }, + { 0x1.9d2b0d0c4a0b1p-229, 0x1.93e7a4bb0743p-224 }, + { 0x1.43908aa677d25p-229, 0x1.3c77c897ed254p-224 }, + { 0x1.fab995891c153p-230, 0x1.efdba02e2ceffp-225 }, + { 0x1.8cbc2fe600108p-230, 0x1.846b92a47c343p-225 }, + { 0x1.3694f45c1b92fp-230, 0x1.30395337f89bbp-225 }, + { 0x1.e6371d3dc0233p-231, 0x1.dc7fb7bbca8adp-226 }, + { 0x1.7c89c6867890ep-231, 0x1.751e7a10e8264p-226 }, + { 0x1.29cb17b0f706bp-231, 0x1.2421ee0211f87p-226 }, + { 0x1.d20647a807a0cp-232, 0x1.c9649548abac7p-227 }, + { 0x1.6c9a3fd812077p-232, 0x1.6606f00ed6d5dp-227 }, + { 0x1.1d37ef5f490cdp-232, 0x1.1836b52067807p-227 }, + { 0x1.be2ec88ae1479p-233, 0x1.b6922692e74d4p-228 }, + { 0x1.5cf38f9818abfp-233, 0x1.572b1a2c0293ap-228 }, + { 0x1.10e013ef486f7p-233, 0x1.0c7c6b93f06a1p-228 }, + { 0x1.aab7b734b99f6p-234, 0x1.a40fcadcdd133p-229 }, + { 0x1.4d9b2cf546b09p-234, 0x1.4890ac32b69b5p-229 }, + { 0x1.04c7bad04b57cp-234, 0x1.00f779993bbc1p-229 }, + { 0x1.97a78d5f1c6dbp-235, 0x1.91e450ac30542p-230 }, + { 0x1.3e9611e8218p-235, 0x1.3a3ce69b6a143p-230 }, + { 0x1.f1e56c0773bb7p-236, 0x1.eb57d7362f984p-231 }, + { 0x1.850426f2df55dp-236, 0x1.8015f467ddd4p-231 }, + { 0x1.2fe8bb3e4f4d8p-236, 0x1.2c3495adab7d8p-231 }, + { 0x1.dac8e8a813f1fp-237, 0x1.d53ae35dbfa26p-232 }, + { 0x1.72d2c2a7422abp-237, 0x1.6eaa5fce4af3ap-232 }, + { 0x1.21972950f570dp-237, 0x1.1e7c114a57a33p-232 }, + { 0x1.c44004226dc17p-238, 0x1.bf9ebf2ac34cfp-233 }, + { 0x1.6118037139874p-238, 0x1.5da6aa3adb7a3p-233 }, + { 0x1.13a4e15d42467p-238, 0x1.11173d5813f4dp-233 }, + { 0x1.ae501496e23f2p-239, 0x1.aa895a750e0f6p-234 }, + { 0x1.4fd7f2b705e64p-239, 0x1.4d0f59b16ac32p-234 }, + { 0x1.0614ef7575b09p-239, 0x1.04098aca1b898p-234 }, + { 0x1.98fdb1084fd1cp-240, 0x1.95ffef5a788b3p-235 }, + { 0x1.3f16033b4da17p-240, 0x1.3ce864a4f75bbp-235 }, + { 0x1.f1d3d20014dd3p-241, 0x1.eeabf27142ccbp-236 }, + { 0x1.844cb59a101a9p-241, 0x1.82070510e6e91p-236 }, + { 0x1.2ed514b22b68bp-241, 0x1.2d35346de60f3p-236 }, + { 0x1.d84bdf7421499p-242, 0x1.d5fe3202b4d44p-237 }, + { 0x1.7040489842ad7p-242, 0x1.6ea2738b3dbebp-237 }, + { 0x1.1f1777f205012p-242, 0x1.1df8a8637ba9cp-237 }, + { 0x1.bf956a62adf73p-243, 0x1.be0e1bcc5bf2bp-238 }, + { 0x1.5cdae0381ff94p-243, 0x1.5bd567e120a1cp-238 }, + { 0x1.0fdef3b187063p-243, 0x1.0f35198b8b7f7p-238 }, + { 0x1.a7b2fd5556b6ap-244, 0x1.a6df243f2c6f4p-239 }, + { 0x1.4a1e48fd99b8ep-244, 0x1.49a26968a8fd1p-239 }, + { 0x1.012cc9c3d142ap-244, 0x1.00ec5ed2dbe3ep-239 }, + { 0x1.90a652d08b6ecp-245, 0x1.9073f3afbdfebp-240 }, + { 0x1.380bacb3471d9p-245, 0x1.380b5f70c487dp-240 }, + { 0x1.e603798765b0ap-246, 0x1.e63fa380d130bp-241 }, + { 0x1.7a705e88ab4c8p-246, 0x1.7ace6e086aab7p-241 
}, + { 0x1.26a399e180e7cp-246, 0x1.2711978a97cf7p-241 }, + { 0x1.cabc2c3d98d7cp-247, 0x1.cba0a72ae9c08p-242 }, + { 0x1.651157275ac6fp-247, 0x1.65efbb20adf2dp-242 }, + { 0x1.15e60bb1a2bacp-247, 0x1.16b5cc5019368p-242 }, + { 0x1.b08358e30e1b1p-248, 0x1.b1fca598944c3p-243 }, + { 0x1.5088c08941b89p-248, 0x1.51d84fa353951p-243 }, + { 0x1.05d2722aa0abep-248, 0x1.06f82c9619b9p-243 }, + { 0x1.9757d44a0d5d1p-249, 0x1.9953a1cf16aadp-244 }, + { 0x1.3cd5765cc7b51p-249, 0x1.3e87f66d27bbp-244 }, + { 0x1.eccf7568ff3afp-250, 0x1.efb0c5f0312cdp-245 }, + { 0x1.7f37a88128933p-250, 0x1.81a4d1085cfd1p-245 }, + { 0x1.29f5b70afae6ep-250, 0x1.2bfdda4e2b20cp-245 }, + { 0x1.cf48b1a182cb9p-251, 0x1.d2ab3b59164a6p-246 }, + { 0x1.682022c0d8296p-251, 0x1.6aeea740e7e26p-246 }, + { 0x1.17e72ed48d1c2p-251, 0x1.1a389017ca93cp-246 }, + { 0x1.b30c9decefa86p-252, 0x1.b6dd2d215fccfp-247 }, + { 0x1.520de188c8ff4p-252, 0x1.552ee415230cdp-247 }, + { 0x1.06a7030db71fbp-252, 0x1.093620e33d9f9p-247 }, + { 0x1.98166f02e00aap-253, 0x1.9c4336b720df7p-248 }, + { 0x1.3cfce2d301755p-253, 0x1.40629fd47fda6p-248 }, + { 0x1.ec63bac9af50ap-254, 0x1.f1e828f7f1e6ep-249 }, + { 0x1.7e609b497d4bfp-254, 0x1.82d92bd0fbc5bp-249 }, + { 0x1.28e89244647b5p-254, 0x1.2c8658b1c7fabp-249 }, + { 0x1.cd07ee41894f6p-255, 0x1.d2def7b6139fbp-250 }, + { 0x1.65e4eca3c47cep-255, 0x1.6a9a29142865ap-250 }, + { 0x1.15cbd7439af48p-255, 0x1.1995fff959855p-250 }, + { 0x1.af324889fe32ep-256, 0x1.b549f742691f7p-251 }, + { 0x1.4e9c920d5db05p-256, 0x1.5380a4af4c2e9p-251 }, + { 0x1.03a122e1077b7p-256, 0x1.078d07375b0bp-251 }, + { 0x1.92d9bd168c63p-257, 0x1.9921acfd99f39p-252 }, + { 0x1.388030ea8589cp-257, 0x1.3d867ecfb60a5p-252 }, + { 0x1.e4c4faf832008p-258, 0x1.ecccda72dba49p-253 }, + { 0x1.77f4a046c515ep-258, 0x1.7e5deef2de87bp-253 }, + { 0x1.2387f5f4b712ep-258, 0x1.28a511d87ce7dp-253 }, + { 0x1.c413282821079p-259, 0x1.cc3995b1e2c4p-254 }, + { 0x1.5e78bc56d0fbbp-259, 0x1.64f5f80200f46p-254 }, + { 0x1.0faba5af01355p-259, 0x1.14d5424501d7ep-254 }, + { 0x1.a51f8a6830159p-260, 0x1.ad54bef9112dp-255 }, + { 0x1.465b65a83bdbbp-260, 0x1.4ce07b8d50856p-255 }, + { 0x1.f9c5589e7201fp-261, 0x1.020f8e226943ep-255 }, + { 0x1.87dc5ad8af9ecp-261, 0x1.90123a8271991p-256 }, + { 0x1.2f918e4d3f95cp-261, 0x1.3613b89391a8fp-256 }, + { 0x1.d6485a170413ap-262, 0x1.e098381b76cd3p-257 }, + { 0x1.6c3b66970be3dp-262, 0x1.7465697a54c64p-257 }, + { 0x1.1a0fd8c3a4e6fp-262, 0x1.20858c20a1795p-257 }, + { 0x1.b4ce217bd5e55p-263, 0x1.bf05934cfa1ccp-258 }, + { 0x1.522e259c7017ap-263, 0x1.5a41409f84e49p-258 }, + { 0x1.05caa9cf257c4p-263, 0x1.0c2b83023243dp-258 }, + { 0x1.954427a430b11p-264, 0x1.9f5672cf62a4fp-259 }, + { 0x1.39a5d07601e71p-264, 0x1.41985de8f7a14p-259 }, + { 0x1.e56c72cc01fccp-265, 0x1.f1f5d5615d783p-260 }, + { 0x1.7797a6e64ddc9p-265, 0x1.8179bfb69c631p-260 }, + { 0x1.229374c83806p-265, 0x1.2a5d1d1f1ae5cp-260 }, + { 0x1.c18d454a503aep-266, 0x1.cdd1c2bddbb9ep-261 }, + { 0x1.5bb5b3e414ad3p-266, 0x1.655e203c78adp-261 }, + { 0x1.0ce808921de57p-266, 0x1.1481ab5a1469ap-261 }, + { 0x1.9fdfe587f056ap-267, 0x1.abd4ca4bd8884p-262 }, + { 0x1.418b54bd6a895p-267, 0x1.4af20f59f283dp-262 }, + { 0x1.f128f851039d9p-268, 0x1.fff032b2dbde7p-263 }, + { 0x1.804c6e03f60cbp-268, 0x1.8be8c488684b4p-263 }, + { 0x1.290596a08a94fp-268, 0x1.3223f2e5be0fp-263 }, + { 0x1.cb1395c8187f6p-269, 0x1.d964d959533d1p-264 }, + { 0x1.62bb1316ec5fcp-269, 0x1.6df780d5ecc43p-264 }, + { 0x1.1211a1b47d3aep-269, 0x1.1ae2302fd4bcdp-264 }, + { 0x1.a772150026811p-270, 0x1.b5455f4e2ce45p-265 }, + { 0x1.47143aa78b5fep-270, 0x1.51eade2a24279p-265 
}, + { 0x1.f93996ba5e93dp-271, 0x1.051b3f15282e5p-265 }, + { 0x1.8626f2553e204p-271, 0x1.93760037df87ap-266 }, + { 0x1.2d4091cd12adcp-271, 0x1.37ace1ccc1a8dp-266 }, + { 0x1.d1294db79df79p-272, 0x1.e17b7713cf17fp-267 }, + { 0x1.6715149108678p-272, 0x1.73db39c4b278bp-267 }, + { 0x1.1529206516167p-272, 0x1.1f27cc2724f9p-267 }, + { 0x1.abce28a1f17f2p-273, 0x1.bb70eb3792a1cp-268 }, + { 0x1.4a1fe3e55f964p-273, 0x1.5659e4463ddd1p-268 }, + { 0x1.fd6eb54be7326p-274, 0x1.08462ba9624dbp-268 }, + { 0x1.89049c51b8388p-274, 0x1.97f4ffe1284a1p-269 }, + { 0x1.2f2b5e6789756p-274, 0x1.3ad748e88c53fp-269 }, + { 0x1.d3aa617478594p-275, 0x1.e5e5db98318a5p-270 }, + { 0x1.68a9e9f7b2f9ap-275, 0x1.76e6798f53e9ap-270 }, + { 0x1.161c2a1de488ep-275, 0x1.21393590da64bp-270 }, + { 0x1.acda38e82463bp-276, 0x1.be32dc731f12cp-271 }, + { 0x1.4a9c33e05809ap-276, 0x1.5824d30f3fce1p-271 }, + { 0x1.fdaf4969fc45p-277, 0x1.09660e736b8bdp-271 }, + { 0x1.88d45a53c41c5p-277, 0x1.994b0856743cbp-272 }, + { 0x1.2eba8f55fe897p-277, 0x1.3b9051c5e7679p-272 }, + { 0x1.d287e1e77c85ap-278, 0x1.e689bae600601p-273 }, + { 0x1.6770239fc87e6p-278, 0x1.77071c1633b26p-273 }, + { 0x1.14e513c1b20dcp-278, 0x1.210a174166fcdp-273 }, + { 0x1.aa90041143186p-279, 0x1.bd7abebe480e6p-274 }, + { 0x1.488642c71cfa6p-279, 0x1.5740f6d4ed277p-274 }, + { 0x1.f9f9ce5a157bbp-280, 0x1.0874302ee34fdp-274 }, + { 0x1.85974997b931fp-280, 0x1.97701e51a6bfep-275 }, + { 0x1.2bf0c37efc00bp-280, 0x1.39d3aac239fe2p-275 }, + { 0x1.cdc89092e43c3p-281, 0x1.e36341a88ea0cp-276 }, + { 0x1.636f0e2785c54p-281, 0x1.743c5e4db43f9p-276 }, + { 0x1.118b19def65f8p-281, 0x1.1e9b8ad36fd99p-276 }, + { 0x1.a4fd2c459c71p-282, 0x1.b94cde5e4fc3p-277 }, + { 0x1.43ea7a73d5cfp-282, 0x1.53b3a109a94aep-277 }, + { 0x1.f26454740b953p-283, 0x1.057635a1ed1dfp-277 }, + { 0x1.7f60ab495565cp-283, 0x1.926f55b776f91p-278 }, + { 0x1.26de8be09d876p-283, 0x1.35abb1f1cadefp-278 }, + { 0x1.c5889cb51dbb9p-284, 0x1.dc853b381e5ap-279 }, + { 0x1.5cbe6a335189cp-284, 0x1.6e96e5d005f5dp-279 }, + { 0x1.0c22190c33c65p-284, 0x1.19fc0dba0e848p-279 }, + { 0x1.9c42b0a7816acp-285, 0x1.b1c21d6e11086p-280 }, + { 0x1.3ce41b9a97542p-285, 0x1.4d91f3701143cp-280 }, + { 0x1.e71ba6efe048bp-286, 0x1.007de792cfd6ep-280 }, + { 0x1.76552635a3b27p-286, 0x1.8a6663a0ececbp-281 }, + { 0x1.1fa1c7f04e719p-286, 0x1.2f310e41037d6p-281 }, + { 0x1.b9f88d1e59fb3p-287, 0x1.d2185735c5ad9p-282 }, + { 0x1.538582347c59ep-287, 0x1.66381bdd98a02p-282 }, + { 0x1.04c9ca3c242adp-287, 0x1.1346f1ba5a69ap-282 }, + { 0x1.9093a8968bba5p-288, 0x1.a706fd9470fb8p-283 }, + { 0x1.339c31e0d51b7p-288, 0x1.45000f1eec014p-283 }, + { 0x1.d8619415342d3p-289, 0x1.f3510620184eap-284 }, + { 0x1.6aa95f63dd017p-289, 0x1.7f84791f6fdbbp-284 }, + { 0x1.16648113f6ec6p-289, 0x1.2689bc620188bp-284 }, + { 0x1.ab5b65b277be7p-290, 0x1.c45998d7521aep-285 }, + { 0x1.47f9aad3382fep-290, 0x1.5b50e4b7d6356p-285 }, + { 0x1.f7591b1b1c875p-291, 0x1.0aa3508d5dbp-285 }, + { 0x1.82335294ba26p-291, 0x1.9959eb6f64db6p-286 }, + { 0x1.2848053b7dfb1p-291, 0x1.3a2fb2a16d1ccp-286 }, + { 0x1.c68a6f5a8ef62p-292, 0x1.e23b370697cbbp-287 }, + { 0x1.5c9ffcce7e5fdp-292, 0x1.720876851d9fbp-287 }, + { 0x1.0b5b54d487d35p-292, 0x1.1be79c992aff6p-287 }, + { 0x1.9a0421e5c5d71p-293, 0x1.b3980569c43a5p-288 }, + { 0x1.3a5c4268d4e27p-293, 0x1.4e1fc4f822568p-288 }, + { 0x1.e1fba80d34a41p-294, 0x1.0042910b94342p-288 }, + { 0x1.7172912ec21f8p-294, 0x1.8908e30f7a1b3p-289 }, + { 0x1.1b271db151968p-294, 0x1.2d5e5a1b8288ep-289 }, + { 0x1.b1f9ef2d6b135p-295, 0x1.ce1b3b9ea6267p-290 }, + { 0x1.4c872d1af92bcp-295, 0x1.623e8fb994f23p-290 
}, + { 0x1.fd87064e02a6fp-296, 0x1.0f8695160ca38p-290 }, + { 0x1.8652a61cdcd3bp-296, 0x1.a031b186be289p-291 }, + { 0x1.2af84a660968dp-296, 0x1.3eee8e04dc3ap-291 }, + { 0x1.c9f07af149226p-297, 0x1.e8bd23cc416fp-292 }, + { 0x1.5eacf76fffc0cp-297, 0x1.766e8d5583265p-292 }, + { 0x1.0c80f3efbbf3fp-297, 0x1.1ed2fab014c43p-292 }, + { 0x1.9b1f8ffd8f3c8p-298, 0x1.b76010ebb6c6ap-293 }, + { 0x1.3ab5d5023fe4ap-298, 0x1.507d813502ab7p-293 }, + { 0x1.e1c174ea2aaa6p-299, 0x1.01aa61c90eaccp-293 }, + { 0x1.70b05029068dap-299, 0x1.8a90544ab274dp-294 }, + { 0x1.1a1fba21de5fp-299, 0x1.2e0fb0911dd84p-294 }, + { 0x1.afb70654af059p-300, 0x1.ce6f24739f7c7p-295 }, + { 0x1.4a458b53b2a84p-300, 0x1.61eefc532711fp-295 }, + { 0x1.f944d95c81983p-301, 0x1.0edb77098a96p-295 }, + { 0x1.8272ab43f7156p-301, 0x1.9e82e04d9025fp-296 }, + { 0x1.278886c5a4d73p-301, 0x1.3d237a2e0f859p-296 }, + { 0x1.c3f57b512a1f2p-302, 0x1.e5385c7d0efep-297 }, + { 0x1.598c52c5d1746p-302, 0x1.73258d0b919ebp-297 }, + { 0x1.0828ad1da0983p-302, 0x1.1bdb57d01ceccp-297 }, + { 0x1.93d4935512f54p-303, 0x1.b223e5e67d24ap-298 }, + { 0x1.34a3670d3cd59p-303, 0x1.4bf43098a2ef1p-298 }, + { 0x1.d7b67cefff216p-304, 0x1.fb93db1e39a21p-299 }, + { 0x1.686e7356020d2p-304, 0x1.8402d3eada60ap-299 }, + { 0x1.135e695d6d4f8p-304, 0x1.2892e3159736p-299 }, + { 0x1.a4b6028e1ae52p-305, 0x1.c5502f868f04bp-300 }, + { 0x1.415808da66669p-305, 0x1.5a670a5d83e0ep-300 }, + { 0x1.ead51e60a821dp-306, 0x1.08ac71830fd4ep-300 }, + { 0x1.76cfe88ffbfa7p-306, 0x1.9467d9d3bce7dp-301 }, + { 0x1.1e2e61d740a91p-306, 0x1.34ea92731d6fp-301 }, + { 0x1.b4f6c22875415p-307, 0x1.d7e402cf49a21p-302 }, + { 0x1.4d8e03e448998p-307, 0x1.6860e96265ba8p-302 }, + { 0x1.fd2c6816f010bp-308, 0x1.132f279000564p-302 }, + { 0x1.8494b75728df1p-308, 0x1.a4356bd52863ep-303 }, + { 0x1.28836b62851b4p-308, 0x1.40cac092d16a6p-303 }, + { 0x1.c476ceb4ce0a6p-309, 0x1.e9bb8c8c45eaap-304 }, + { 0x1.592d26553a529p-309, 0x1.75c6ad9777c96p-304 }, + { 0x1.074be65f60432p-309, 0x1.1d3d889242361p-304 }, + { 0x1.91a14719373e5p-310, 0x1.b34c7bf3e0108p-305 }, + { 0x1.3248b33f78dd9p-310, 0x1.4c1bf325b5886p-305 }, + { 0x1.d316bfa6ecf07p-311, 0x1.fab351a6d7271p-306 }, + { 0x1.641dc398561efp-311, 0x1.827d8b273a859p-306 }, + { 0x1.0f79d08c027e2p-311, 0x1.26c35a8453a6ep-306 }, + { 0x1.9ddabce45ff88p-312, 0x1.c18e854f7a653p-307 }, + { 0x1.3b6a0443345f1p-312, 0x1.56c727238c10ep-307 }, + { 0x1.e0b830517633fp-313, 0x1.05545196af9e3p-307 }, + { 0x1.6e4903f595976p-313, 0x1.8e6b62ae03487p-308 }, + { 0x1.170eca4e7a4cap-313, 0x1.2facf384d3a3bp-308 }, + { 0x1.a92756c27d93ap-314, 0x1.ceddf1e753b81p-309 }, + { 0x1.43d40bf74392dp-314, 0x1.60b61e0028436p-309 }, + { 0x1.ed3e286c4c0dep-315, 0x1.0cbd09b1e5e1p-309 }, + { 0x1.77993389df313p-315, 0x1.997719e8b73a8p-310 }, + { 0x1.1dfa945eaae99p-315, 0x1.37e77cf85ca37p-310 }, + { 0x1.b36ec5aa0588p-316, 0x1.db1e802a6c81fp-311 }, + { 0x1.4b749e64b35f5p-316, 0x1.69d3aa6fccfd9p-311 }, + { 0x1.f88d823260c9ep-317, 0x1.1383f4dd09079p-311 }, + { 0x1.7ffa0f1fabb65p-317, 0x1.a388f33976b7bp-312 }, + { 0x1.242e12375b352p-317, 0x1.3f613589599c6p-312 }, + { 0x1.bc9a844ffd2b5p-318, 0x1.e635a66e3ebe7p-313 }, + { 0x1.523af73f84783p-318, 0x1.720bfb4a981d7p-313 }, + { 0x1.0146a610e0588p-318, 0x1.199a49bcc51p-313 }, + { 0x1.87590d6d36008p-319, 0x1.ac8ae259e160cp-314 }, + { 0x1.299b80ea6bb7fp-319, 0x1.4609b0c4183cap-314 }, + { 0x1.c496292aa266bp-320, 0x1.f00af26520f9dp-315 }, + { 0x1.5817f72c95e4cp-320, 0x1.794ce31e24c7bp-315 }, + { 0x1.059392396d038p-320, 0x1.1ef2877dbfcadp-315 }, + { 0x1.8da5a346cbb3fp-321, 0x1.b468dc95cb829p-316 
}, + { 0x1.2e36a9eb80d32p-321, 0x1.4bd213115ac94p-316 }, + { 0x1.cb4fb203e18ap-322, 0x1.f88862b544527p-317 }, + { 0x1.5cfe5be9615c7p-322, 0x1.7f861b04cbe3ap-317 }, + { 0x1.0923c6394f695p-322, 0x1.2380a7a548a2fp-317 }, + { 0x1.92d18166ccd51p-323, 0x1.bb1122f6e5762p-318 }, + { 0x1.31f510cb3f507p-323, 0x1.50ad48dd9b3a6p-318 }, + { 0x1.d0b7c794af438p-324, 0x1.ff9ab8e5d6631p-319 }, + { 0x1.60e2f23228dedp-324, 0x1.84a97f6b3e853p-319 }, + { 0x1.0bef1906dac58p-324, 0x1.273a4b16ba84fp-319 }, + { 0x1.96d0ca88e4fcp-325, 0x1.c07484e1da469p-320 }, + { 0x1.34ce1af3c1b6p-325, 0x1.549037ceef1fep-320 }, + { 0x1.d4c1f7c67dd18p-326, 0x1.0298e0fc06037p-320 }, + { 0x1.63bcc0600e3b1p-326, 0x1.88ab45875f419p-321 }, + { 0x1.0def17046c37ep-326, 0x1.2a16e161fa35fp-321 }, + { 0x1.999a40ba75f42p-327, 0x1.c48699c75f345p-322 }, + { 0x1.36bb3093bcf7fp-327, 0x1.5771e906a9978p-322 }, + { 0x1.d764e5657aa2p-328, 0x1.04a04a1699caap-322 }, + { 0x1.658528dc53bd5p-328, 0x1.8b822865b44e6p-323 }, + { 0x1.0f1f1acd583cp-328, 0x1.2c0fc98ac934cp-323 }, + { 0x1.9b2768ee2e28p-329, 0x1.c73df0b6d4334p-324 }, + { 0x1.37b7d60833afbp-329, 0x1.594bab8ddacb1p-324 }, + { 0x1.d89a6c43f4c1p-330, 0x1.05dee05833b3cp-324 }, + { 0x1.663803afd90e2p-330, 0x1.8d278c9cbfc58p-325 }, + { 0x1.0f7c5f2e4265p-330, 0x1.2d206b997c2ccp-325 }, + { 0x1.9b74a41343d69p-331, 0x1.c89434d36542fp-326 }, + { 0x1.37c1bd3bb9cfep-331, 0x1.5a192e33cf627p-326 }, + { 0x1.d85fb90bdf218p-332, 0x1.0651bc0c61b2p-326 }, + { 0x1.65d3aea4b609ep-332, 0x1.8d9799e5f2521p-327 }, + { 0x1.0f0609e7aa674p-332, 0x1.2d464a6b30dc2p-327 }, + { 0x1.9a813d2878f74p-333, 0x1.c88645e6c88eep-328 }, + { 0x1.36d8ce9d2217bp-333, 0x1.59d89052b0525p-328 }, + { 0x1.d6b5543d3c94p-334, 0x1.05f7d07f3fb02p-328 }, + { 0x1.645913a262a36p-334, 0x1.8cd14a1185c8dp-329 }, + { 0x1.0dbd2f003b6a5p-334, 0x1.2c810d60e767ep-329 }, + { 0x1.984f6bfe6778p-335, 0x1.c714448c370a6p-330 }, + { 0x1.34ff297cd534dp-335, 0x1.588a691f2cd1fp-330 }, + { 0x1.d39f201da2255p-336, 0x1.04d1f01416963p-330 }, + { 0x1.61cba521cabb4p-336, 0x1.8ad66d03eba59p-331 }, + { 0x1.0ba4cc94c45b3p-336, 0x1.2ad281b8cc2ap-331 }, + { 0x1.94e44c9a075e7p-337, 0x1.c44191b160ec2p-332 }, + { 0x1.32391bcecdc03p-337, 0x1.5631c55b5d22cp-332 }, + { 0x1.cf2449a3fda4bp-338, 0x1.02e2c911c7929p-332 }, + { 0x1.5e3150cc8eda4p-338, 0x1.87aba1a7120bfp-333 }, + { 0x1.08c1bf3c985fap-338, 0x1.283e938a586f7p-333 }, + { 0x1.9047cb663bb8cp-339, 0x1.c014c17012593p-334 }, + { 0x1.2e8d117dfdd44p-339, 0x1.52d41b7968429p-334 }, + { 0x1.c94f2cb2815a8p-340, 0x1.002edb3674f27p-334 }, + { 0x1.599268900e7bcp-340, 0x1.835843f5f0b0cp-335 }, + { 0x1.051aaf415041dp-340, 0x1.24cb3e8b7d756p-335 }, + { 0x1.8a84869fc8267p-341, 0x1.ba9781881c8a9p-336 }, + { 0x1.2a037bab743e1p-341, 0x1.4e79366e7a47p-336 }, + { 0x1.c22d2c350e306p-342, 0x1.f978cc962d426p-337 }, + { 0x1.53f982a03a248p-342, 0x1.7de65083f0e21p-337 }, + { 0x1.00b7f70f68972p-342, 0x1.208076f18ea3p-337 }, + { 0x1.83a7a5a0b9d4dp-343, 0x1.b3d6740403453p-338 }, + { 0x1.24a6b05eb3edap-343, 0x1.492b17a8d9ad4p-338 }, + { 0x1.b9ce7efad864cp-344, 0x1.f126a42ab2a64p-339 }, + { 0x1.4d7351162fad8p-344, 0x1.77623e1a3ca2fp-339 }, + { 0x1.f74706d1f613cp-345, 0x1.1b680aeae0c3cp-339 }, + { 0x1.7bc0a6e57fbc5p-345, 0x1.abe0fed214bcap-340 }, + { 0x1.1e82c35430e3dp-345, 0x1.42f5d0cb0afebp-340 }, + { 0x1.b045f25c98b4bp-346, 0x1.e77a20528f8f5p-341 }, + { 0x1.460e7202036c7p-346, 0x1.6fdace394b03cp-341 }, + { 0x1.ebd15c07c2acdp-347, 0x1.158d7d54f1681p-341 }, + { 0x1.72e125d540295p-347, 0x1.a2c9115542385p-342 }, + { 0x1.17a558b9c184fp-347, 0x1.3be755f8b210cp-342 }, + 
{ 0x1.a5a8a3f3de092p-348, 0x1.dc88f077bd369p-343 }, + { 0x1.3ddb38ecb5b52p-348, 0x1.6760d57bb9982p-343 }, + { 0x1.df2826b036578p-349, 0x1.0efdda755dbb3p-343 }, + { 0x1.691c997f37f0ep-349, 0x1.98a2e123c782ep-344 }, + { 0x1.101d72c627ff7p-349, 0x1.340f49a72211p-344 }, + { 0x1.9a0db3d2b8dacp-350, 0x1.d06b3f65f6fdp-345 }, + { 0x1.34eb72e63e592p-350, 0x1.5e06fcff790f4p-345 }, + { 0x1.d166c8f34fca4p-351, 0x1.07c787991a68p-345 }, + { 0x1.5e880d9f1fe43p-351, 0x1.8d849f54265f7p-346 }, + { 0x1.07fb3b2ff1602p-351, 0x1.2b7ec30262d2bp-346 }, + { 0x1.8d8df0cbffd52p-352, 0x1.c33b5a8ad639fp-347 }, + { 0x1.2b52265317648p-352, 0x1.53e17e1a8afadp-347 }, + { 0x1.c2aa6bd34f17bp-353, 0x1.fff41d2913dabp-348 }, + { 0x1.5339d751ff2a1p-353, 0x1.818627da2e9e4p-348 }, + { 0x1.fe9f93308c405p-354, 0x1.2248100f21115p-348 }, + { 0x1.80438073219dep-354, 0x1.b515531d535ebp-349 }, + { 0x1.21234fbc4a127p-354, 0x1.4905d9b84e0cbp-349 }, + { 0x1.b31198aa5f8abp-355, 0x1.ef4bcc5f71a72p-350 }, + { 0x1.474946f304456p-355, 0x1.74c0ac8d03b2bp-350 }, + { 0x1.ec59d00f3fe38p-356, 0x1.187e74c209a91p-350 }, + { 0x1.7249848679fa9p-356, 0x1.a6169b09c4411p-351 }, + { 0x1.16739cec78bd4p-356, 0x1.3d8a8ccb26cd9p-351 }, + { 0x1.a2bbd0795adeep-357, 0x1.ddb87127c2076p-352 }, + { 0x1.3ace589cd3352p-357, 0x1.674e5d7be735cp-352 }, + { 0x1.d949ad392f075p-358, 0x1.0e35e84d33d3fp-352 }, + { 0x1.63bbbf78651ccp-358, 0x1.965d9f895d99cp-353 }, + { 0x1.0b5827a3ba382p-358, 0x1.3186c3440696p-353 }, + { 0x1.91c922f9ee4cp-359, 0x1.cb5d51a48d7d4p-354 }, + { 0x1.2de164c74e725p-359, 0x1.594a1039f0199p-354 }, + { 0x1.c5941f108d9d1p-360, 0x1.0382d1e479246p-354 }, + { 0x1.54b639c219649p-360, 0x1.8609634a384ccp-355 }, + { 0x1.ffcc62473097ap-361, 0x1.25120afe02122p-355 }, + { 0x1.8059c757355aep-361, 0x1.b85e31314f4b4p-356 }, + { 0x1.209ad26ca18d9p-361, 0x1.4acee7c0fcbafp-356 }, + { 0x1.b15e18d0d2d12p-362, 0x1.f0f38c6449ad9p-357 }, + { 0x1.4554e9983b016p-362, 0x1.753919ff4b182p-357 }, + { 0x1.e865bf893f8f4p-363, 0x1.1844080030d76p-357 }, + { 0x1.6e8db855aac9ap-363, 0x1.a4dede3a3eb93p-358 }, + { 0x1.1312cc0ae5d04p-363, 0x1.3bf7fe7aa33ap-358 }, + { 0x1.9ccc1bfbf7ecbp-364, 0x1.da5e8d4d639edp-359 }, + { 0x1.35b35e7d0088ep-364, 0x1.640bc7176cda7p-359 }, + { 0x1.d0a5ff60b92cfp-365, 0x1.0b342b640cc13p-359 }, + { 0x1.5c84558f35d95p-365, 0x1.9102c47629cb9p-360 }, + { 0x1.0560f8bafb2c7p-365, 0x1.2ce013e375d0fp-360 }, + { 0x1.8801ce509ea26p-366, 0x1.c36f07720a932p-361 }, + { 0x1.25ec7207b3c64p-366, 0x1.529fe13854ed9p-361 }, + { 0x1.b8b58f7c67c36p-367, 0x1.fbf2dc269c35dp-362 }, + { 0x1.4a5c0b3b7424dp-367, 0x1.7cec854a40ddcp-362 }, + { 0x1.ef3874e46141bp-368, 0x1.1da13f1aaaee6p-362 }, + { 0x1.732197e24d857p-368, 0x1.ac4c46230c45cp-363 }, + { 0x1.1619ff0ea7ec6p-368, 0x1.4112fbeff8a1fp-363 }, + { 0x1.a0bb46a0a2c53p-369, 0x1.e15420dda8758p-364 }, + { 0x1.383201c8ba71ap-369, 0x1.68bd97eb5b05dp-364 }, + { 0x1.d3b4e4b894768p-370, 0x1.0e54a78756b6bp-364 }, + { 0x1.5e4c4aaef013p-370, 0x1.951c14f527745p-365 }, + { 0x1.0654a030d3e7p-370, 0x1.2f8178dd14a04p-365 }, + { 0x1.88dc03d1ca801p-371, 0x1.c6b6bf9361ee4p-366 }, + { 0x1.2621d65152a67p-371, 0x1.5495f2949c65ep-366 }, + { 0x1.b860981f4834ap-372, 0x1.fe24891c8ca0cp-367 }, + { 0x1.49a0d4c97c281p-372, 0x1.7e02609a87253p-367 }, + { 0x1.ed66ed1143993p-373, 0x1.1e064158c947bp-367 }, + { 0x1.713a5a10cc9bp-373, 0x1.ac4304f253262p-368 }, + { 0x1.14455cbbff469p-373, 0x1.4093bdea6e36fp-368 }, + { 0x1.9d62205df47a6p-374, 0x1.dfe14a435c3c2p-369 }, + { 0x1.353bfdeb15aa4p-374, 0x1.6720e3d624fdcp-369 }, + { 0x1.ce97f23783a55p-375, 0x1.0cba8970a9d66p-369 }, + 
{ 0x1.59f649793ea9ap-375, 0x1.921e961b81171p-370 }, + { 0x1.02b46c188f22dp-375, 0x1.2cd3135c626d1p-370 }, + { 0x1.82dcfdba2d59cp-376, 0x1.c2097f7f7c953p-371 }, + { 0x1.213830f44d648p-376, 0x1.5096e15b063dbp-371 }, + { 0x1.b0639acae41c7p-377, 0x1.f76b39886a20dp-372 }, + { 0x1.432d063e4cc5ap-377, 0x1.786c2636e4e2ap-372 }, + { 0x1.e3096b161ade1p-378, 0x1.196dc712e8651p-372 }, + { 0x1.68f1646f450ccp-378, 0x1.a4c39680abb0bp-373 }, + { 0x1.0dad51a121c5fp-378, 0x1.3a80eb1934625p-373 }, + { 0x1.92ed52465cf13p-379, 0x1.d6196b3830612p-374 }, + { 0x1.2cf8cdb32b26dp-379, 0x1.5f4b3b930a91ap-374 }, + { 0x1.c1934bb7035c1p-380, 0x1.067b3db09279ep-374 }, + { 0x1.4fbc11c19c0b7p-380, 0x1.8832413bcb6f5p-375 }, + { 0x1.f5613cdc1ad52p-381, 0x1.24f8b72bbd6eep-375 }, + { 0x1.76547ab0f816ap-381, 0x1.b5a5bcacf14ddp-376 }, + { 0x1.1770c93ef3136p-381, 0x1.46d8046ba690cp-376 }, + { 0x1.a128a30d837ebp-382, 0x1.e8209bd7c6d4dp-377 }, + { 0x1.375630e92b79p-382, 0x1.6c744b66f6406p-377 }, + { 0x1.d0a93cd8add1ep-383, 0x1.1015024fefc8dp-377 }, + { 0x1.5ab4549d6cf15p-383, 0x1.9631ba1694964p-378 }, + { 0x1.02a8fed4a1944p-383, 0x1.2f2b3b1ae197dp-378 }, + { 0x1.81e6d5efc2ecep-384, 0x1.c47e5b8f9de0cp-379 }, + { 0x1.1fd54f3e20bfcp-384, 0x1.51a481761d265p-379 }, + { 0x1.ad523512d80aep-385, 0x1.f7d2ff106229cp-380 }, + { 0x1.4023f854f9c86p-385, 0x1.77da522f79ec5p-380 }, + { 0x1.dd649c8fad0d5p-386, 0x1.185a192bd02b4p-380 }, + { 0x1.63e684c4d4572p-386, 0x1.a22ed5ef67f83p-381 }, + { 0x1.094b5ecc6e29p-386, 0x1.37d9a85948033p-381 }, + { 0x1.8b7643330549ep-387, 0x1.d10da89b8212ap-382 }, + { 0x1.26b65f14cd4dap-387, 0x1.5ab7d4224f7e2p-382 }, + { 0x1.b734f53e57228p-388, 0x1.0276587fa1c2p-382 }, + { 0x1.473b9d1931175p-388, 0x1.814bdb918424dp-383 }, + { 0x1.e78d8c6e84fddp-389, 0x1.1f2684f2af658p-383 }, + { 0x1.6b2a2c93cd65ap-389, 0x1.abf540fb4e1a1p-384 }, + { 0x1.0e7a7b055d281p-389, 0x1.3eddfeeed0dd2p-384 }, + { 0x1.92d87cacce695p-390, 0x1.db1c82f79707dp-385 }, + { 0x1.2bf57b6e0d98dp-390, 0x1.61ea0b7eb4c3cp-385 }, + { 0x1.bea4f9488e121p-391, 0x1.0799f1fb897d8p-385 }, + { 0x1.4c7d8bf7bdc41p-391, 0x1.889f21fdb1d69p-386 }, + { 0x1.eef6b8bfa9225p-392, 0x1.245c20ba28a39p-386 }, + { 0x1.705ed2bbfd521p-392, 0x1.b3598a0d5984p-387 }, + { 0x1.121f1b69882ebp-392, 0x1.4418fde75923ep-387 }, + { 0x1.97ec608197c79p-393, 0x1.e27e05b6c31f9p-388 }, + { 0x1.2f7b0edc74f1cp-393, 0x1.671af7f5d8858p-388 }, + { 0x1.c380c41f7503p-394, 0x1.0b3d4442eda68p-388 }, + { 0x1.4fd20f15083b3p-394, 0x1.8db341e4d4306p-389 }, + { 0x1.f37ea8d01e9c5p-395, 0x1.27e37e3bc73c9p-389 }, + { 0x1.736cebb19a201p-395, 0x1.b83a639f29a8p-390 }, + { 0x1.1428c012e2c57p-395, 0x1.47730acf38edcp-390 }, + { 0x1.9a9ae80c06018p-396, 0x1.e710d5155d028p-391 }, + { 0x1.31371c2b63b8p-396, 0x1.6a331ab64b688p-391 }, + { 0x1.c5b240b14f4d6p-397, 0x1.0d4fd25f7f52ep-391 }, + { 0x1.5129ffd17a136p-397, 0x1.90712f4e38e37p-392 }, + { 0x1.f510ba62354a5p-398, 0x1.29ac951c1e60bp-392 }, + { 0x1.74468acd1611cp-398, 0x1.ba819d5f14678p-393 }, + { 0x1.148e1d96c299ep-398, 0x1.48dce2dc3ecd5p-393 }, + { 0x1.9ad7d58aaba44p-399, 0x1.e8c0193d16d55p-394 }, + { 0x1.3121b71d77179p-399, 0x1.6b2456938b866p-394 }, + { 0x1.c52f68dd90e64p-400, 0x1.0dc826696c76cp-394 }, + { 0x1.507f397188496p-400, 0x1.90cc63cdbf2a2p-395 }, + { 0x1.f3a5bdf92c388p-401, 0x1.29af3c144f8cp-395 }, + { 0x1.72e7cbdbb95dbp-401, 0x1.ba24cc0f4c8e2p-396 }, + { 0x1.134d638b07143p-401, 0x1.48500e815d897p-396 }, + { 0x1.98a2111174d79p-402, 0x1.e7841c45926dp-397 }, + { 0x1.2f3b409e1b7b6p-402, 0x1.69ea5b1b71301p-397 }, + { 0x1.c1fa91a869695p-403, 0x1.0ca4195cda6d3p-397 }, + 
{ 0x1.4dd4c7d7ec9fap-403, 0x1.8ec33daf13649p-398 }, + { 0x1.ef442d8796795p-404, 0x1.27eb66fea5e85p-398 }, + { 0x1.6f56f0c0f22b9p-404, 0x1.b72598c77c448p-399 }, + { 0x1.106c4a594a047p-404, 0x1.45cf12a60cb9ap-399 }, + { 0x1.9403b0e4bd1b9p-405, 0x1.e36284e81b5ffp-400 }, + { 0x1.2b8c63e7468c1p-405, 0x1.668ac570f2fc8p-400 }, + { 0x1.bc22598793379p-406, 0x1.09e8e37ef2488p-400 }, + { 0x1.4936d06178106p-406, 0x1.8a5f0c63b5c24p-401 }, + { 0x1.e7fffb3b16a7dp-407, 0x1.2469273320bdap-401 }, + { 0x1.69a431ed205ap-407, 0x1.b191b44e70edfp-402 }, + { 0x1.0bf7e7cce4d07p-407, 0x1.41655d7606103p-402 }, + { 0x1.8d11ace4d8996p-408, 0x1.dc6e2b76185d5p-403 }, + { 0x1.2625d4b960a47p-408, 0x1.6114f58eab906p-403 }, + { 0x1.b3c139841a735p-409, 0x1.05a2f4a403a4dp-403 }, + { 0x1.42ba35d81be5cp-409, 0x1.83b3c9af7ee45p-404 }, + { 0x1.ddf9fa6fc513ap-410, 0x1.1f386e3013e68p-404 }, + { 0x1.61e943a26f542p-410, 0x1.a9826f127d04dp-405 }, + { 0x1.06044c28d2704p-410, 0x1.3b26ef9596f74p-405 }, + { 0x1.83eb403668f94p-411, 0x1.d2c68adc24dd3p-406 }, + { 0x1.1f1fd15ed30fep-411, 0x1.59a199b7c8167p-406 }, + { 0x1.a8fcbdc7eab51p-412, 0x1.ffcb2bfa5b8dap-407 }, + { 0x1.3a7bfb4be9962p-412, 0x1.7adf828472cfdp-407 }, + { 0x1.d15ee90987618p-413, 0x1.1870951a86a79p-407 }, + { 0x1.584895194492p-413, 0x1.9f1bfa110cbbap-408 }, + { 0x1.fd57d7b45b3cap-414, 0x1.332fc55367264p-408 }, + { 0x1.78b8ffae32bfp-414, 0x1.c696d39db75f3p-409 }, + { 0x1.16996dab0cd1ep-414, 0x1.5051f4ea04fdfp-409 }, + { 0x1.9c046dcaa75a4p-415, 0x1.f194b2a4cb97p-410 }, + { 0x1.30a06c462f23ep-415, 0x1.700975cbb46aap-410 }, + { 0x1.c2662350ce7fap-416, 0x1.102fae0ec7794p-410 }, + { 0x1.4cec5169fb931p-416, 0x1.928c588cfb6d9p-411 }, + { 0x1.ec1db7d8e44b5p-417, 0x1.29a3060c44f3ap-411 }, + { 0x1.6babae8929706p-417, 0x1.b814aa869e0e4p-412 }, + { 0x1.0cb7ae5506e7ep-417, 0x1.454ee7edd0063p-412 }, + { 0x1.8d106f7f4047ep-418, 0x1.e0e0b72e6ef2ep-413 }, + { 0x1.255213192c405p-418, 0x1.6360f251c2f1fp-413 }, + { 0x1.b1500fc71b69ap-419, 0x1.0699a6631f93fp-413 }, + { 0x1.40052c8ba04b4p-419, 0x1.840a0d97bb129p-414 }, + { 0x1.d8a3d24511c07p-420, 0x1.1eaa023d58a69p-414 }, + { 0x1.5cfadd7b9716p-420, 0x1.a77ea01d8b821p-415 }, + { 0x1.01a47ddad3ea8p-420, 0x1.38c7c7057a652p-415 }, + { 0x1.7c5ff3799c35bp-421, 0x1.cdf6c504a93e5p-416 }, + { 0x1.18c087e86a1f3p-421, 0x1.551bff88c1175p-416 }, + { 0x1.9e64530b957f4p-422, 0x1.f7ae8590bb8p-417 }, + { 0x1.31c908986e1a8p-422, 0x1.73d293026bc2ap-417 }, + { 0x1.c33b25da2082ep-423, 0x1.12730a9790f69p-417 }, + { 0x1.4ce362055227ep-423, 0x1.951a7082f394ap-418 }, + { 0x1.eb1b0ae0a386ap-424, 0x1.2af1081b22794p-418 }, + { 0x1.6a3779e1ff3bp-424, 0x1.b925bc48353ep-419 }, + { 0x1.0b1f245435eeap-424, 0x1.4575deb5305a2p-419 }, + { 0x1.89efddb97fd18p-425, 0x1.e029ff0fc8645p-420 }, + { 0x1.227180cb0a8cap-425, 0x1.6228a92a17423p-420 }, + { 0x1.ac39e8a7de062p-426, 0x1.05302bb5e3a1ap-420 }, + { 0x1.3ba5b5279aa24p-426, 0x1.81331d3a2cc81p-421 }, + { 0x1.d145ea8ff6403p-427, 0x1.1c02d69097c72p-421 }, + { 0x1.56df011e743b9p-427, 0x1.a2c1b0ae83a64p-422 }, + { 0x1.f94750d0f9308p-428, 0x1.34ad734ae6135p-422 }, + { 0x1.7442e7172840ap-428, 0x1.c703bfdc748cdp-423 }, + { 0x1.123a683e9b9d5p-428, 0x1.4f5290291de6ep-423 }, + { 0x1.93f94a8e393e5p-429, 0x1.ee2bb5a2a447p-424 }, + { 0x1.298449094a08p-429, 0x1.6c16f34d9525ep-424 }, + { 0x1.b62c8f87855a8p-430, 0x1.0c379a70923bcp-424 }, + { 0x1.42a02f59d51efp-430, 0x1.8b21b8919710fp-425 }, + { 0x1.db09bb0ffb21fp-431, 0x1.2303a1b68b2dep-425 }, + { 0x1.5daee76f997a8p-431, 0x1.ac9c706a79cfcp-426 }, + { 0x1.01604a662bf4cp-431, 0x1.3b983b3f72fb5p-426 }, + { 
0x1.7ad33d50dacdp-432, 0x1.d0b33fd9b6e85p-427 }, + { 0x1.16c1e4c8c451ap-432, 0x1.5615904c6373ap-427 }, + { 0x1.9a32159dea0d8p-433, 0x1.f7950165d693dp-428 }, + { 0x1.2dc48781056c9p-433, 0x1.729dc070c926ap-428 }, + { 0x1.bbf2871addffbp-434, 0x1.10b9b38c6e833p-428 }, + { 0x1.4684a4152d4ep-434, 0x1.9154f9f73ee5fp-429 }, + { 0x1.e03df4eb2c204p-435, 0x1.27418ebfd96bep-429 }, + { 0x1.6120558a89b12p-435, 0x1.b26192fa2f36ep-430 }, + { 0x1.03a014bcb5352p-435, 0x1.3f7df7d25b3e6p-430 }, + { 0x1.7db773a6f6623p-436, 0x1.d5ec232ba3385p-431 }, + { 0x1.1893b9023690dp-436, 0x1.598c75ff21ea4p-431 }, + { 0x1.9c6ba6a49465ap-437, 0x1.fc1f9e46a53e2p-432 }, + { 0x1.2f125d64e7642p-437, 0x1.758c452444076p-432 }, + { 0x1.bd607b51aff83p-438, 0x1.1294b791c6529p-432 }, + { 0x1.4735d5e25dd32p-438, 0x1.939e692035be7p-433 }, + { 0x1.e0bb7795ebab2p-439, 0x1.289cc9b3b4107p-433 }, + { 0x1.611962fb4b008p-439, 0x1.b3e5c199dc217p-434 }, + { 0x1.035217aa6e0adp-439, 0x1.40415be2c6028p-434 }, + { 0x1.7cd9c096da3b3p-440, 0x1.d6871e2c76342p-435 }, + { 0x1.17a22cd2a508fp-440, 0x1.599d2a64857abp-435 }, + { 0x1.9a95351e8c9f1p-441, 0x1.fba952efabe51p-436 }, + { 0x1.2d63f329a8bcbp-441, 0x1.74cc660d4897ap-436 }, + { 0x1.ba6ba0cb47e2bp-442, 0x1.11baa6a990cd8p-436 }, + { 0x1.44ae89d144108p-442, 0x1.91ecc31adec4ep-437 }, + { 0x1.dc7e8d1b8f556p-443, 0x1.270b14a1f9816p-437 }, + { 0x1.5d9a42222275cp-443, 0x1.b11d883fd3ec1p-438 }, + { 0x1.00789e350bd1ap-443, 0x1.3ddca348b8e79p-438 }, + { 0x1.7840aaba80c98p-444, 0x1.d27f9dd765764p-439 }, + { 0x1.13f45ccd8c935p-444, 0x1.56472f42babf3p-439 }, + { 0x1.94bc9a9955f26p-445, 0x1.f6359d3980ea5p-440 }, + { 0x1.28c5f3eaf8eddp-445, 0x1.7063ccd1b83c6p-440 }, + { 0x1.b32a3c3e46a35p-446, 0x1.0e31f012ad2b3p-440 }, + { 0x1.3f01c91fe7f47p-446, 0x1.8c4cd2c02ec2dp-441 }, + { 0x1.d3a718c61d154p-447, 0x1.2298481c2ca0dp-441 }, + { 0x1.56bd3dd5a05c1p-447, 0x1.aa1de55237abcp-442 }, + { 0x1.f65222fadfcp-448, 0x1.3861db33230bp-442 }, + { 0x1.700eb717cfb77p-448, 0x1.c9f401331dbf6p-443 }, + { 0x1.0da5e12700c8dp-448, 0x1.4fa3a533642f6p-443 }, + { 0x1.8b0da54d3c71fp-449, 0x1.ebed8656f1a7bp-444 }, + { 0x1.215aeed941b43p-449, 0x1.6873a105b43c2p-444 }, + { 0x1.a7d28bd609e5p-450, 0x1.081521636047p-444 }, + { 0x1.3659f3261d19p-450, 0x1.82e8d038330cap-445 }, + { 0x1.c6770887b13f6p-451, 0x1.1b65bea6b7e6ap-445 }, + { 0x1.4cb570f463d9dp-451, 0x1.9f1b427ce89a2p-446 }, + { 0x1.e715dafe5cd6p-452, 0x1.2ff9fffd4f5f9p-446 }, + { 0x1.6480ba9b1723cp-452, 0x1.bd241d06b6757p-447 }, + { 0x1.04e575dd6f2ebp-452, 0x1.45e411382662bp-447 }, + { 0x1.7dcff6d521467p-453, 0x1.dd1da1bc7ec85p-448 }, + { 0x1.1759a98201ff3p-453, 0x1.5d36e9f7af39cp-448 }, + { 0x1.98b82586ccf2dp-454, 0x1.ff233639de02ap-449 }, + { 0x1.2af6afc0ce651p-454, 0x1.7606528b3cf28p-449 }, + { 0x1.b54f244df93dfp-455, 0x1.11a8b54a30c34p-449 }, + { 0x1.3fcc4e4385b18p-455, 0x1.9066e8a3084adp-450 }, + { 0x1.d3abb2d5b9282p-456, 0x1.24e2ffedd9f78p-450 }, + { 0x1.55eaec016b2b5p-456, 0x1.ac6e23cde6ac9p-451 }, + { 0x1.f3e576e5bfb2cp-457, 0x1.394ff72563c26p-451 }, + { 0x1.6d6394041cb01p-457, 0x1.ca3259bb8013ep-452 }, + { 0x1.0b0a8012d71fbp-457, 0x1.4effb58fcce2p-452 }, + { 0x1.8647f7f3a91dep-458, 0x1.e9cac23b8427ep-453 }, + { 0x1.1d29e5c60946bp-458, 0x1.6602f707600f3p-453 }, + { 0x1.a0aa72640fd47p-459, 0x1.05a7bd790a4bcp-453 }, + { 0x1.305e23384e58ap-459, 0x1.7e6b1b23c38f4p-454 }, + { 0x1.bc9e08de1532fp-460, 0x1.176cc55ca9b8p-454 }, + { 0x1.44b4e89c6a35fp-460, 0x1.984a277e8539ap-455 }, + { 0x1.da366d9d2b975p-461, 0x1.2a417253e014bp-455 }, + { 0x1.5a3c60cb2c6b1p-461, 0x1.b3b2c9b4277c6p-456 }, + { 
0x1.f98800fc076dbp-462, 0x1.3e333559670c8p-456 }, + { 0x1.71033226bf0afp-462, 0x1.d0b8591b88278p-457 }, + { 0x1.0d53e944a7e18p-462, 0x1.534ff7f271b4dp-457 }, + { 0x1.89187f3d75a14p-463, 0x1.ef6ed82d51675p-458 }, + { 0x1.1ed5d0deddfb7p-463, 0x1.69a61d0edc9d2p-458 }, + { 0x1.a28be72757b85p-464, 0x1.07f57aca805f1p-458 }, + { 0x1.3154ef266983dp-464, 0x1.814481a9f253cp-459 }, + { 0x1.bd6d859990532p-465, 0x1.1921067277b5dp-459 }, + { 0x1.44dcd404b4fcdp-465, 0x1.9a3a7d2712f82p-460 }, + { 0x1.d9cdf2aadd6a6p-466, 0x1.2b45137355f77p-460 }, + { 0x1.5979672b76b96p-466, 0x1.b497e1657b91bp-461 }, + { 0x1.f7be424410479p-467, 0x1.3e6cfcc06ed27p-461 }, + { 0x1.6f36e7903ba4fp-467, 0x1.d06cfa865bc4ep-462 }, + { 0x1.0ba8019bd4e86p-467, 0x1.52a47395ed2aep-462 }, + { 0x1.8621eaa755f34p-468, 0x1.edca8e605e67ap-463 }, + { 0x1.1c4a9efdce654p-468, 0x1.67f77ef705254p-463 }, + { 0x1.9e475b5aaea97p-469, 0x1.0660edcde1e02p-463 }, + { 0x1.2dd03980220acp-469, 0x1.7e727aec99554p-464 }, + { 0x1.b7b478b8fda1cp-470, 0x1.16b24c391593bp-464 }, + { 0x1.40424c4fd21f7p-470, 0x1.96221780dfe95p-465 }, + { 0x1.d276d459f43c7p-471, 0x1.27e2788696d86p-465 }, + { 0x1.53aa8c500f5dp-471, 0x1.af1357749947cp-466 }, + { 0x1.ee9c5073f397ep-472, 0x1.39fac2bf7a531p-466 }, + { 0x1.6812e6a2e8fcp-472, 0x1.c9538eaa71fbp-467 }, + { 0x1.06198ecffc0ep-472, 0x1.4d04b3a802aeep-467 }, + { 0x1.7d857ef6fe55ap-473, 0x1.e4f0604536408p-468 }, + { 0x1.15a4dc243cc5fp-473, 0x1.610a0b4ec8401p-468 }, + { 0x1.940cad97ee071p-474, 0x1.00fbde3ac71c6p-468 }, + { 0x1.25f772e00c70ap-474, 0x1.7614bf61d6bfap-469 }, + { 0x1.abb2fd3f529efp-475, 0x1.103beefa0765p-469 }, + { 0x1.3718d87e8a0afp-475, 0x1.8c2ef94786008p-470 }, + { 0x1.c48328a4346ebp-476, 0x1.203fa39242793p-470 }, + { 0x1.4910b37b4de72p-476, 0x1.a36313f8e64ecp-471 }, + { 0x1.de8817c6f33b9p-477, 0x1.310e5f6fbfd44p-471 }, + { 0x1.5be6c950a7e6fp-477, 0x1.bbbb999bb060ap-472 }, + { 0x1.f9ccdcf7c94fep-478, 0x1.42afa66f9fdc1p-472 }, + { 0x1.6fa2fc442a9d3p-478, 0x1.d54340d9c375dp-473 }, + { 0x1.0b2e58cb15f5cp-478, 0x1.552b1ae6aeaa2p-473 }, + { 0x1.844d490056942p-479, 0x1.f004e9f45a94bp-474 }, + { 0x1.1a217943b9ac7p-479, 0x1.68887b7750462p-474 }, + { 0x1.99edc3fa555f4p-480, 0x1.0605cdc8a1e5ep-474 }, + { 0x1.29c58e31af831p-480, 0x1.7ccfa0b55e3f7p-475 }, + { 0x1.b08c96a2d341cp-481, 0x1.14b13fa04509fp-475 }, + { 0x1.3a2063aa9bfc9p-481, 0x1.92087a96ea8f4p-476 }, + { 0x1.c831fc61280f7p-482, 0x1.240a6edc95f53p-476 }, + { 0x1.4b37d15842e1dp-482, 0x1.a83b0db0fa5b6p-477 }, + { 0x1.e0e63f582488bp-483, 0x1.34170d65d2fe5p-477 }, + { 0x1.5d11b81c3fea7p-483, 0x1.bf6f703f6c8b1p-478 }, + { 0x1.fab1b4f400c2ep-484, 0x1.44dcd884a52dcp-478 }, + { 0x1.6fb3ff8ccf41cp-484, 0x1.d7adc6f76430fp-479 }, + { 0x1.0ace5d20891a2p-484, 0x1.5661968fc8c68p-479 }, + { 0x1.8324934a763f4p-485, 0x1.f0fe41a3b588bp-480 }, + { 0x1.18d7d8058e531p-485, 0x1.68ab147365bffp-480 }, + { 0x1.9769602e7d2c4p-486, 0x1.05b48bc57ed71p-480 }, + { 0x1.27797b62a04a4p-486, 0x1.7bbf2311e9661p-481 }, + { 0x1.ac8851524d431p-487, 0x1.137b41cf9c9a4p-481 }, + { 0x1.36b7751d5da7fp-487, 0x1.8fa3947e525d9p-482 }, + { 0x1.c2874cefea298p-488, 0x1.21d7603b6e2ccp-482 }, + { 0x1.4695ee8470b66p-488, 0x1.a45e3910021acp-483 }, + { 0x1.d96c311be3eb3p-489, 0x1.30cd0207d04edp-483 }, + { 0x1.571909f179506p-489, 0x1.b9f4dc504a668p-484 }, + { 0x1.f13cd05945d89p-490, 0x1.40603dadb780ap-484 }, + { 0x1.6844e0504f766p-490, 0x1.d06d41c212c13p-485 }, + { 0x1.04ff770417c7ep-490, 0x1.509522cc01f2fp-485 }, + { 0x1.7a1d7e8c27e5p-491, 0x1.e7cd2184183ebp-486 }, + { 0x1.11dc1d57f7df8p-491, 0x1.616fb7b910c11p-486 }, 
+ { 0x1.8ca6e2e342651p-492, 0x1.000d1267395e3p-486 }, + { 0x1.1f372812d1e14p-492, 0x1.72f3f6faafe57p-487 }, + { 0x1.9fe4fa21e8c98p-493, 0x1.0cacf12619fe1p-487 }, + { 0x1.2d1356c845fd1p-493, 0x1.8525cca4f244dp-488 }, + { 0x1.b3db9cc5a58f3p-494, 0x1.19c8ed29100e2p-488 }, + { 0x1.3b7359a6b9391p-494, 0x1.980913a0c5f1ep-489 }, + { 0x1.c88e8c09b9bb2p-495, 0x1.2763b979d57b5p-489 }, + { 0x1.4a59cf5958098p-495, 0x1.aba192db244fdp-490 }, + { 0x1.de016eddfacadp-496, 0x1.357ff9fbc97f4p-490 }, + { 0x1.59c942db45eaep-496, 0x1.bff2fa5de1e9dp-491 }, + { 0x1.f437cec9632b8p-497, 0x1.44204156d00fcp-491 }, + { 0x1.69c4293cefa3fp-497, 0x1.d500e0534289dp-492 }, + { 0x1.059a8a5ce0ce7p-497, 0x1.53470ed39dd97p-492 }, + { 0x1.7a4cdf5c8de47p-498, 0x1.eacebdf5973c2p-493 }, + { 0x1.117e42e10afc5p-498, 0x1.62f6cc2a62dbdp-493 }, + { 0x1.8b65a792fe14p-499, 0x1.00aff63626acfp-493 }, + { 0x1.1dc89fe4a5f8ap-499, 0x1.7331cb44dd6ecp-494 }, + { 0x1.9d10a7562f377p-500, 0x1.0c5bd0cbfba3p-494 }, + { 0x1.2a7b1b1593291p-500, 0x1.83fa43f4f73d5p-495 }, + { 0x1.af4fe4d278bf9p-501, 0x1.186c76677c8f7p-495 }, + { 0x1.37971726a776ep-501, 0x1.955251a12574cp-496 }, + { 0x1.c225447c48b85p-502, 0x1.24e359c6528bbp-496 }, + { 0x1.451dde15504ecp-502, 0x1.a73bf0e7dcf7bp-497 }, + { 0x1.d592869bae136p-503, 0x1.31c1d70a5a26cp-497 }, + { 0x1.53109f6b70a02p-503, 0x1.b9b8fd3b82acep-498 }, + { 0x1.e99944d35a898p-504, 0x1.3f09320694d4p-498 }, + { 0x1.61706e7ea0b42p-504, 0x1.cccb2e7856e93p-499 }, + { 0x1.fe3aefa4cdaa2p-505, 0x1.4cba948866255p-499 }, + { 0x1.703e40ae0b133p-505, 0x1.e0741675f15a5p-500 }, + { 0x1.09bc65f9b8064p-505, 0x1.5ad70c9e433d4p-500 }, + { 0x1.7f7aeba02f7efp-506, 0x1.f4b51e95f89d5p-501 }, + { 0x1.14a9f8443d058p-506, 0x1.695f8add0a062p-501 }, + { 0x1.8f272381e3222p-507, 0x1.04c7c2a8ead79p-501 }, + { 0x1.1fe6a1ccca721p-507, 0x1.7854e0a5444cfp-502 }, + { 0x1.9f437947f2743p-508, 0x1.0f822de49bc54p-502 }, + { 0x1.2b72bc2a1bb29p-508, 0x1.87b7be69a8c26p-503 }, + { 0x1.afd058f4d5cb9p-509, 0x1.1a8a41a9a734p-503 }, + { 0x1.374e8637e822fp-509, 0x1.9788b1f83908ep-504 }, + { 0x1.c0ce07e3f5247p-510, 0x1.25e0558a5c077p-504 }, + { 0x1.437a22e46ffc9p-510, 0x1.a7c824c7683f1p-505 }, + { 0x1.d23ca31c0220cp-511, 0x1.3184a6ce13b46p-505 }, + { 0x1.4ff5980398e02p-511, 0x1.b8765a48c0cf1p-506 }, + { 0x1.e41c1da9f8a5fp-512, 0x1.3d775743f06aep-506 }, + { 0x1.5cc0cd28b81e5p-512, 0x1.c9936e428a9d9p-507 }, + { 0x1.f66c3f065ea05p-513, 0x1.49b86c1b194cep-507 }, + { 0x1.69db8a882e29p-513, 0x1.db1f5331fbe71p-508 }, + { 0x1.049650c331274p-513, 0x1.5647ccc18e717p-508 }, + { 0x1.774577e1faf4fp-514, 0x1.ed19d0b78718cp-509 }, + { 0x1.0e2e586d3df5cp-514, 0x1.632541cab3acp-509 }, + { 0x1.84fe1b767669bp-515, 0x1.ff82820edeaabp-510 }, + { 0x1.17fdd44e1dc6cp-515, 0x1.705073deb552ap-510 }, + { 0x1.9304d9065a4b9p-516, 0x1.092c6a4a26abfp-510 }, + { 0x1.220449767742ap-516, 0x1.7dc8eab3ed87ap-511 }, + { 0x1.a158f0df4c356p-517, 0x1.12ce032c827cep-511 }, + { 0x1.2c4123936432bp-517, 0x1.8b8e0c1372c25p-512 }, + { 0x1.aff97ef6163edp-518, 0x1.1ca5926404568p-512 }, + { 0x1.36b3b4511d82bp-518, 0x1.999f1ae9f978bp-513 }, + { 0x1.bee57a0fbbbdcp-519, 0x1.26b285aeabdbep-513 }, + { 0x1.415b32c89327cp-519, 0x1.a7fb366632c72p-514 }, + { 0x1.ce1bb2fa9523ep-520, 0x1.30f431387ee69p-514 }, + { 0x1.4c36baf8c2285p-520, 0x1.b6a15925d0c25p-515 }, + { 0x1.dd9ad3d89a4a5p-521, 0x1.3b69cf0bd5608p-515 }, + { 0x1.57454d4c97f21p-521, 0x1.c590587256b75p-516 }, + { 0x1.ed615f7bfd7d2p-522, 0x1.46127e8d37ba7p-516 }, + { 0x1.6285ce2e2e29bp-522, 0x1.d4c6e38ed7f06p-517 }, + { 0x1.fd6db0d73348ep-523, 0x1.50ed44039bd53p-517 
}, + { 0x1.6df705a8252f7p-523, 0x1.e4438317c2a1ep-518 }, + { 0x1.06defd40bdb09p-523, 0x1.5bf9082dc8412p-518 }, + { 0x1.79979f15ddb0dp-524, 0x1.f4049875ce63p-519 }, + { 0x1.0f2823287afb6p-524, 0x1.673497e5a0d03p-519 }, + { 0x1.856628e34ac2cp-525, 0x1.02042eb28efefp-519 }, + { 0x1.17913a85a33a7p-525, 0x1.729ea3d219a53p-520 }, + { 0x1.9161145d0e326p-526, 0x1.0a2671c8cdbeep-520 }, + { 0x1.20191f16dc709p-526, 0x1.7e35c0288722ep-521 }, + { 0x1.9d86b59187f4ep-527, 0x1.12680a24c58f5p-521 }, + { 0x1.28be97e6e9065p-527, 0x1.89f8647df9662p-522 }, + { 0x1.a9d5434377e7bp-528, 0x1.1ac7d823a316cp-522 }, + { 0x1.31805749922c3p-528, 0x1.95e4eba9494cap-523 }, + { 0x1.b64ad6eec66d3p-529, 0x1.2344a7c981006p-523 }, + { 0x1.3a5cfae5998ecp-529, 0x1.a1f993b67371dp-524 }, + { 0x1.c2e56cdffce02p-530, 0x1.2bdd30bebc795p-524 }, + { 0x1.43530bcc0ee3ap-530, 0x1.ae347debd307p-525 }, + { 0x1.cfa2e45eea63dp-531, 0x1.3490165a1de5p-525 }, + { 0x1.4c60fe9d5cbc1p-531, 0x1.ba93aee1c301fp-526 }, + { 0x1.dc80ffece4451p-532, 0x1.3d5be7b8309a9p-526 }, + { 0x1.558533bc564e3p-532, 0x1.c7150ead1fd0ep-527 }, + { 0x1.e97d659702f92p-533, 0x1.463f1fe01b7dap-527 }, + { 0x1.5ebdf78f85a03p-533, 0x1.d3b6691d169e3p-528 }, + { 0x1.f6959f5cadd73p-534, 0x1.4f3825f642bp-528 }, + { 0x1.680982d0eea8ap-534, 0x1.e0756e0ca137bp-529 }, + { 0x1.01e38dd55bfc7p-534, 0x1.58454d7cf072p-529 }, + { 0x1.7165faec70a1p-535, 0x1.ed4fb1c7fef16p-530 }, + { 0x1.088796f5a026p-535, 0x1.6164d6a338985p-530 }, + { 0x1.7ad1726ce2f3cp-536, 0x1.fa42ad866b6p-531 }, + { 0x1.0f3587953aeb5p-536, 0x1.6a94eea23ecd2p-531 }, + { 0x1.8449e977fef01p-537, 0x1.03a5dffc21d0dp-531 }, + { 0x1.15ebef6827c9dp-537, 0x1.73d3b028fc2cfp-532 }, + { 0x1.8dcd4e591ac76p-538, 0x1.0a3416f4dd0f1p-532 }, + { 0x1.1ca951b79a938p-538, 0x1.7d1f23d694b62p-533 }, + { 0x1.97597e1aad586p-539, 0x1.10ca917d13a59p-533 }, + { 0x1.236c25d3c18a2p-539, 0x1.867540c340902p-534 }, + { 0x1.a0ec452e85047p-540, 0x1.1767d933fa0f7p-534 }, + { 0x1.2a32d78fe110fp-540, 0x1.8fd3ed17c059fp-535 }, + { 0x1.aa8360248e3edp-541, 0x1.1e0a6bf884441p-535 }, + { 0x1.30fbc7c8ab284p-541, 0x1.9938feb3469d1p-536 }, + { 0x1.b41c7c6ff8cc6p-542, 0x1.24b0bc63cac6bp-536 }, + { 0x1.37c54cf4ab1fcp-542, 0x1.a2a23bdfb3241p-537 }, + { 0x1.bdb5393a7ccd2p-543, 0x1.2b59324d7fd9bp-537 }, + { 0x1.3e8db3be9418cp-543, 0x1.ac0d5c13ef72ap-538 }, + { 0x1.c74b284572b4cp-544, 0x1.32022b5a4d882p-538 }, + { 0x1.45533fa93710cp-544, 0x1.b57808c42df0bp-539 }, + { 0x1.d0dbced86364cp-545, 0x1.38a9fb93eb86p-539 }, + { 0x1.4c142bbcdb51bp-545, 0x1.bedfde3fbf9f1p-540 }, + { 0x1.da64a6bca7adp-546, 0x1.3f4eee0ab230dp-540 }, + { 0x1.52ceab3daa53bp-546, 0x1.c8426c9c266d4p-541 }, + { 0x1.e3e31f45a0a96p-547, 0x1.45ef458066425p-541 }, + { 0x1.5980ea6ad6692p-547, 0x1.d19d38acfc932p-542 }, + { 0x1.ed549e6504cf2p-548, 0x1.4c893d1bef1fep-542 }, + { 0x1.60290f4619f98p-548, 0x1.daedbd083bb8ep-543 }, + { 0x1.f6b681cab013bp-549, 0x1.531b0925a021ep-543 }, + { 0x1.66c53a6323b06p-549, 0x1.e4316b16614afp-544 }, + { 0x1.00031007ac3e3p-549, 0x1.59a2d7cbb3c39p-544 }, + { 0x1.6d5387be7adf6p-550, 0x1.ed65ac2de0264p-545 }, + { 0x1.04a064f4bdd38p-550, 0x1.601ed1ee8e719p-545 }, + { 0x1.73d20f9b5e73bp-551, 0x1.f687e2b942e41p-546 }, + { 0x1.0931e5b5e6c43p-551, 0x1.668d1bf455ad8p-546 }, + { 0x1.7a3ee7681856fp-552, 0x1.ff956b675583bp-547 }, + { 0x1.0db636a632668p-552, 0x1.6cebd6a35f863p-547 }, + { 0x1.809822a836e1fp-553, 0x1.0445cf3250898p-547 }, + { 0x1.122bfb19eafe7p-553, 0x1.73392002f5fc2p-548 }, + { 0x1.86dbd3e416493p-554, 0x1.08b3e84ebc2b9p-548 }, + { 0x1.1691d609b1ec9p-554, 0x1.79731441e1e21p-549 }, 
+ { 0x1.8d080d9d1c96dp-555, 0x1.0d13aa83e4b01p-549 }, + { 0x1.1ae66ac0b0b6ap-555, 0x1.7f97cea22928bp-550 }, + { 0x1.931ae34603f62p-556, 0x1.1163bef9eebc1p-550 }, + { 0x1.1f285d8d6c817p-556, 0x1.85a56a6965552p-551 }, + { 0x1.99126a3e88ca5p-557, 0x1.15a2cf3193875p-551 }, + { 0x1.23565474c154ep-557, 0x1.8b9a03d510324p-552 }, + { 0x1.9eecbad1cb519p-558, 0x1.19cf85b21a11fp-552 }, + { 0x1.276ef7e686addp-558, 0x1.9173b9121e9f7p-553 }, + { 0x1.a4a7f136af77ep-559, 0x1.1de88eb969b39p-553 }, + { 0x1.2b70f3735b79fp-559, 0x1.9730ab373bc61p-554 }, + { 0x1.aa422e918100dp-560, 0x1.21ec98edb9593p-554 }, + { 0x1.2f5af68314ac2p-560, 0x1.9cceff40f1fb1p-555 }, + { 0x1.afb999f61e5d4p-561, 0x1.25da56105b758p-555 }, + { 0x1.332bb50b471fbp-561, 0x1.a24cdf0f0a2e7p-556 }, + { 0x1.b50c6169e961bp-562, 0x1.29b07bb123c75p-556 }, + { 0x1.36e1e845638bbp-562, 0x1.a7a87a6267113p-557 }, + { 0x1.ba38bae4baa67p-563, 0x1.2d6dc3e1e1b47p-557 }, + { 0x1.3a7c4f63d9d53p-563, 0x1.ace007da9e0c8p-558 }, + { 0x1.bf3ce55012ad1p-564, 0x1.3110ede9680cep-558 }, + { 0x1.3df9b045b81fcp-564, 0x1.b1f1c5f28dcc9p-559 }, + { 0x1.c4172983c2f7ep-565, 0x1.3498bef599a58p-559 }, + { 0x1.4158d828399aep-565, 0x1.b6dbfbfb30836p-560 }, + { 0x1.c8c5db3f49157p-566, 0x1.380402cbf1542p-560 }, + { 0x1.44989c55b9312p-566, 0x1.bb9cfb13e7262p-561 }, + { 0x1.cd475a1f163eep-567, 0x1.3b518c77fb7d2p-561 }, + { 0x1.47b7dad17cf31p-567, 0x1.c0331f1f7ac71p-562 }, + { 0x1.d19a128cff8a4p-568, 0x1.3e8036f737914p-562 }, + { 0x1.4ab57affd05a9p-568, 0x1.c49ccfb511d2cp-563 }, + { 0x1.d5bc7eab14dfbp-569, 0x1.418ee5e1d890ep-563 }, + { 0x1.4d906e49e5535p-569, 0x1.c8d8810c585d4p-564 }, + { 0x1.d9ad27381fd3dp-570, 0x1.447c860fdcf2cp-564 }, + { 0x1.5047b0bcf6527p-570, 0x1.cce4b4e41cdcap-565 }, + { 0x1.dd6aa46d0f45cp-571, 0x1.47480e39f8181p-565 }, + { 0x1.52da49a426b16p-571, 0x1.d0bffb62a59f5p-566 }, + { 0x1.e0f39ed2991f9p-572, 0x1.49f07f95c9d66p-566 }, + { 0x1.55474c1ca1f2bp-572, 0x1.d468f3ef07049p-567 }, + { 0x1.e446d00e60d84p-573, 0x1.4c74e66ce3841p-567 }, + { 0x1.578dd7a37e92bp-573, 0x1.d7de4e02c6f6fp-568 }, + { 0x1.e76303a6f7572p-574, 0x1.4ed45aae1d60cp-568 }, + { 0x1.59ad189ced845p-574, 0x1.db1ec9f31f5e1p-569 }, + { 0x1.ea4717be0f8c8p-575, 0x1.510e0078c325ep-569 }, + { 0x1.5ba448d444792p-575, 0x1.de2939b1372f7p-570 }, + { 0x1.ecf1fdc04a7dbp-576, 0x1.532108a122ff3p-570 }, + { 0x1.5d72aff4768dap-576, 0x1.e0fc8180b06b8p-571 }, + { 0x1.ef62bb0a0594ap-577, 0x1.550cb12e0f1dbp-571 }, + { 0x1.5f17a3f894e1dp-577, 0x1.e39798a3f0a89p-572 }, + { 0x1.f19869809eb8ap-578, 0x1.56d045cee7811p-572 }, + { 0x1.60928993f7077p-578, 0x1.e5f989fd91cadp-573 }, + { 0x1.f392381fab056p-579, 0x1.586b2049c7737p-573 }, + { 0x1.61e2d491b1f68p-579, 0x1.e82174a67122fp-574 }, + { 0x1.f54f6b79a6d5fp-580, 0x1.59dca8e17880fp-574 }, + { 0x1.6308082b0b65cp-580, 0x1.ea0e8c77dc629p-575 }, + { 0x1.f6cf5e2bb03dcp-581, 0x1.5b2456b2d3672p-575 }, + { 0x1.6401b7549eebbp-581, 0x1.ebc01a8965943p-576 }, + { 0x1.f8118143e7ebp-582, 0x1.5c41b0093e8e9p-576 }, + { 0x1.64cf8501f223bp-582, 0x1.ed357da1f18bap-577 }, + { 0x1.f9155c9a1fbd1p-583, 0x1.5d344aaa010f1p-577 }, + { 0x1.6571245f3d39ap-583, 0x1.ee6e2a9b9efdp-578 }, + { 0x1.f9da8f1a8a0ccp-584, 0x1.5dfbcc1628fd2p-578 }, + { 0x1.65e6590135ap-584, 0x1.ef69acba2f951p-579 }, + { 0x1.fa60cf0228aadp-585, 0x1.5e97e9c2cbc7fp-579 }, + { 0x1.662ef70ab154bp-585, 0x1.f027a5f3a7f56p-580 }, + { 0x1.faa7ea0cc6ecbp-586, 0x1.5f0869476fb64p-580 }, + { 0x1.664ae34801e0ep-586, 0x1.f0a7cf2ae7563p-581 }, + { 0x1.faafc59456a8cp-587, 0x1.5f4d2082760f5p-581 }, + { 0x1.663a133fef35p-587, 
0x1.f0e9f85c03b41p-582 }, + { 0x1.fa785ea194bf2p-588, 0x1.5f65f5b366281p-582 }, + { 0x1.65fc8d3a43882p-588, 0x1.f0ee08ba43cd5p-583 }, + { 0x1.fa01c9ede6a16p-589, 0x1.5f52df8b025d3p-583 }, + { 0x1.6592683be2829p-589, 0x1.f0b3febf9cbcdp-584 }, + { 0x1.f94c33d66f35bp-590, 0x1.5f13e53118eaap-584 }, + { 0x1.64fbcbf86f1abp-590, 0x1.f03bf02da5a7ap-585 }, + { 0x1.f857e040665ap-591, 0x1.5ea91e400b8afp-585 }, + { 0x1.6438f0b98cabp-591, 0x1.ef860a0000a7ap-586 }, + { 0x1.f7252a6ecb2bbp-592, 0x1.5e12b2b611c72p-586 }, + { 0x1.634a1f3bd0d7ep-592, 0x1.ee92905044d53p-587 }, + { 0x1.f5b484c995f72p-593, 0x1.5d50dadc42d9dp-587 }, + { 0x1.622fb08184d56p-593, 0x1.ed61de2b81fc4p-588 }, + { 0x1.f40678969b4f4p-594, 0x1.5c63df237cf4dp-588 }, + { 0x1.60ea0d9b5d711p-594, 0x1.ebf4655983167p-589 }, + { 0x1.f21ba5a45e2afp-595, 0x1.5b4c17f7488b1p-589 }, + { 0x1.5f79af6759efdp-595, 0x1.ea4aae160108ap-590 }, + { 0x1.eff4c1e71b057p-596, 0x1.5a09ed86def16p-590 }, + { 0x1.5ddf1e460242cp-596, 0x1.e86556bc034fep-591 }, + { 0x1.ed92990861c73p-597, 0x1.589dd784842fp-591 }, + { 0x1.5c1af1c6454bep-597, 0x1.e6451363b8311p-592 }, + { 0x1.eaf60be99fa59p-598, 0x1.57085cdb6c23ep-592 }, + { 0x1.5a2dd0483fd76p-598, 0x1.e3eaad7319948p-593 }, + { 0x1.e820101a05296p-599, 0x1.554a135c6b3d2p-593 }, + { 0x1.58186e973c8cbp-599, 0x1.e1570321beee3p-594 }, + { 0x1.e511af403f0e1p-600, 0x1.53639f61bab8bp-594 }, + { 0x1.55db8f7b445c6p-600, 0x1.de8b06f0475d8p-595 }, + { 0x1.e1cc067882b19p-601, 0x1.5155b36a1ff17p-595 }, + { 0x1.537803429dd3dp-601, 0x1.db87bf13d1856p-596 }, + { 0x1.de5045a77840fp-602, 0x1.4f210fabcd4fep-596 }, + { 0x1.50eea743a03bp-602, 0x1.d84e44d6006fdp-597 }, + { 0x1.da9faec295ac1p-603, 0x1.4cc6819f5a3a9p-597 }, + { 0x1.4e406557456e3p-603, 0x1.d4dfc3ea1615fp-598 }, + { 0x1.d6bb950e85a76p-604, 0x1.4a46e38335bf7p-598 }, + { 0x1.4b6e334ceafc3p-604, 0x1.d13d79b7b4d75p-599 }, + { 0x1.d2a55c543d97bp-605, 0x1.47a31bd7fd98ap-599 }, + { 0x1.48791257b832ep-605, 0x1.cd68b49be13bdp-600 }, + { 0x1.ce5e780d6c294p-606, 0x1.44dc1cd628aecp-600 }, + { 0x1.45620e7623619p-606, 0x1.c962d320e4c77p-601 }, + { 0x1.c9e86a88f07ffp-607, 0x1.41f2e3dd79383p-601 }, + { 0x1.422a3dd414b5ep-607, 0x1.c52d432db963cp-602 }, + { 0x1.c544c4080f626p-608, 0x1.3ee878deaf1c1p-602 }, + { 0x1.3ed2c02828af5p-608, 0x1.c0c9812daaed1p-603 }, + { 0x1.c07521d52071ep-609, 0x1.3bbdedbff743p-603 }, + { 0x1.3b5cbe0c97302p-609, 0x1.bc391730e1bf4p-604 }, + { 0x1.bb7b2d547171ap-610, 0x1.38745dbc97fd1p-604 }, + { 0x1.37c9685446b6bp-610, 0x1.b77d9c068db21p-605 }, + { 0x1.b6589b1020c3ep-611, 0x1.350cecc05d9cfp-605 }, + { 0x1.3419f75c953bcp-611, 0x1.b298b2516cc35p-606 }, + { 0x1.b10f29bfb2a68p-612, 0x1.3188c6bf4cd49p-606 }, + { 0x1.304faa5c619afp-612, 0x1.ad8c07976bbcp-607 }, + { 0x1.aba0a14c264ccp-613, 0x1.2de91f0a22435p-607 }, + { 0x1.2c6bc6b0e1424p-613, 0x1.a859534d21642p-608 }, + { 0x1.a60ed1d150c44p-614, 0x1.2a2f2fa027fc3p-608 }, + { 0x1.286f9728ce321p-614, 0x1.a30255dde65bep-609 }, + { 0x1.a05b929d439abp-615, 0x1.265c387eea954p-609 }, + { 0x1.245c6b4e79163p-615, 0x1.9d88d7b14c6d3p-610 }, + { 0x1.9a88c12e847c2p-616, 0x1.22717ef05792fp-610 }, + { 0x1.203396b14a77p-616, 0x1.97eea82eb8229p-611 }, + { 0x1.94984031d9858p-617, 0x1.1e704cd7ceb7cp-611 }, + { 0x1.1bf6702f3caf4p-617, 0x1.92359cbfdea74p-612 }, + { 0x1.8e8bf6806bcabp-618, 0x1.1a59effeaeef1p-612 }, + { 0x1.17a6513ed67fap-618, 0x1.8c5f8fd2e86f6p-613 }, + { 0x1.8865ce1efe9b6p-619, 0x1.162fb960e6361p-613 }, + { 0x1.1344953a2bc16p-619, 0x1.866e5fdcf6e5cp-614 }, + { 0x1.8227b33ef66f4p-620, 0x1.11f2fc7a0a0a9p-614 }, + { 
0x1.0ed298ab66e97p-620, 0x1.8063ee5dc8676p-615 }, + { 0x1.7bd39341e60d2p-621, 0x1.0da50e937b941p-615 }, + { 0x1.0a51b89b5ac38p-621, 0x1.7a421ee53231bp-616 }, + { 0x1.756b5bc0538cfp-622, 0x1.0947461417eb2p-616 }, + { 0x1.05c351e298147p-622, 0x1.740ad61b23997p-617 }, + { 0x1.6ef0f9946142ep-623, 0x1.04daf9d1f19dp-617 }, + { 0x1.0128c07d7eac9p-623, 0x1.6dbff8cae0f32p-618 }, + { 0x1.686657e900799p-624, 0x1.006180668cd93p-618 }, + { 0x1.f906bdc779cfcp-625, 0x1.67636af21f0cbp-619 }, + { 0x1.61cd5f4e4d33cp-625, 0x1.f7b85f0c272bbp-620 }, + { 0x1.efa90ac757637p-626, 0x1.60f70ed4a200ep-620 }, + { 0x1.5b27f4d3aafafp-626, 0x1.ee98b6b3e4f34p-621 }, + { 0x1.e63b1303dfbfbp-627, 0x1.5a7cc414fb8aap-621 }, + { 0x1.5477f92833195p-627, 0x1.e566abbe94f87p-622 }, + { 0x1.dcbf7abb88524p-628, 0x1.53f666d2fde17p-622 }, + { 0x1.4dbf47c1fc8ap-628, 0x1.dc24dc933bf6dp-623 }, + { 0x1.d338de3492428p-629, 0x1.4d65ced070949p-623 }, + { 0x1.46ffb60cbd76p-629, 0x1.d2d5e0d43505p-624 }, + { 0x1.c9a9d09a6515fp-630, 0x1.46ccce9c8cdf5p-624 }, + { 0x1.403b12a03d499p-630, 0x1.c97c4837b573ep-625 }, + { 0x1.c014dae645fc3p-631, 0x1.402d32c6be96dp-625 }, + { 0x1.3973247f05596p-631, 0x1.c01a996aebdb3p-626 }, + { 0x1.b67c7ad400b86p-632, 0x1.3988c1191e211p-626 }, + { 0x1.32a9aa5db4bb3p-632, 0x1.b6b3510058b7ap-627 }, + { 0x1.ace321e309c7bp-633, 0x1.32e137db0ef23p-627 }, + { 0x1.2be059f3526f7p-633, 0x1.ad48e069f2207p-628 }, + { 0x1.a34b346493cc3p-634, 0x1.2c384d1c64d5bp-628 }, + { 0x1.2518df52ef492p-634, 0x1.a3ddacff96f65p-629 }, + { 0x1.99b70897047dcp-635, 0x1.258fae0968e74p-629 }, + { 0x1.1e54dc4edf3a3p-635, 0x1.9a740f1248851p-630 }, + { 0x1.9028e5cf277c7p-636, 0x1.1ee8fe480d92cp-630 }, + { 0x1.1795e7e5c7ccap-636, 0x1.910e510c93fe1p-631 }, + { 0x1.86a303af6f699p-637, 0x1.1845d75e974c6p-631 }, + { 0x1.10dd8db9b7b2p-637, 0x1.87aeaea087811p-632 }, + { 0x1.7d27896d87b8ep-638, 0x1.11a7c823f5ff5p-632 }, + { 0x1.0a2d4d917179ap-638, 0x1.7e57540380a9p-633 }, + { 0x1.73b88d266bc5ap-639, 0x1.0b10543a01766p-633 }, + { 0x1.03869ae409b27p-639, 0x1.750a5d3814d59p-634 }, + { 0x1.6a58134129f18p-640, 0x1.0480f391c14fcp-634 }, + { 0x1.f9d5b8ddde221p-641, 0x1.6bc9d56645be6p-635 }, + { 0x1.61080de06bfbp-641, 0x1.fbf623f3bedbap-636 }, + { 0x1.ecb6d7acd34f7p-642, 0x1.6297b642274f2p-636 }, + { 0x1.57ca5c62d05ddp-642, 0x1.ef001d6eb49dfp-637 }, + { 0x1.dfb32aa129cc6p-643, 0x1.5975e7810e7p-637 }, + { 0x1.4ea0caf213789p-643, 0x1.e222785106b16p-638 }, + { 0x1.d2cd2eb59de4cp-644, 0x1.50663e5d53392p-638 }, + { 0x1.458d1220fa79dp-644, 0x1.d55fbee497ep-639 }, + { 0x1.c60744f31e198p-645, 0x1.476a7d28a437bp-639 }, + { 0x1.3c90d697e5b5dp-645, 0x1.c8ba606fb6833p-640 }, + { 0x1.b963b20518321p-646, 0x1.3e8452ecdbe84p-640 }, + { 0x1.33ada8cfe418fp-646, 0x1.bc34b0b8bbc6p-641 }, + { 0x1.ace49de2283aep-647, 0x1.35b55b1b3d652p-641 }, + { 0x1.2ae504dc15f24p-647, 0x1.afd0e79df00ebp-642 }, + { 0x1.a08c1388db34fp-648, 0x1.2cff1d49f192cp-642 }, + { 0x1.223852412258p-648, 0x1.a39120c175c51p-643 }, + { 0x1.945c00d028182p-649, 0x1.24630cff92d39p-643 }, + { 0x1.19a8e3da77fbep-649, 0x1.97775b48ec1aap-644 }, + { 0x1.8856364b336c5p-650, 0x1.1be2898c8a8a4p-644 }, + { 0x1.1137f7cd08642p-650, 0x1.8b8579b06ca2cp-645 }, + { 0x1.7c7c673fe436ep-651, 0x1.137eddf1f97aep-645 }, + { 0x1.08e6b787233bap-651, 0x1.7fbd41b078795p-646 }, + { 0x1.70d029afc4472p-652, 0x1.0b3940d5da6fcp-646 }, + { 0x1.00b637cd0ec0bp-652, 0x1.74205c365c73ep-647 }, + { 0x1.6552f6729a259p-653, 0x1.0312d48405757p-647 }, + { 0x1.f14ef1a3e4ac2p-654, 0x1.68b0556e87723p-648 }, + { 0x1.5a06296220023p-654, 0x1.f6194df7630e5p-649 }, + { 
0x1.e176ccb941b53p-655, 0x1.5d6e9ce0425a7p-649 }, + { 0x1.4eeb0196310cdp-655, 0x1.e64f64121563ep-650 }, + { 0x1.d1e5afef936dap-656, 0x1.525c859a2ea9ap-650 }, + { 0x1.4402a1b0bd9dfp-656, 0x1.d6c9b6d4d6fc5p-651 }, + { 0x1.c29d225a230e3p-657, 0x1.477b466ee6cc1p-651 }, + { 0x1.394e1038ce88ep-657, 0x1.c789ea0183d02p-652 }, + { 0x1.b39e83951bdaap-658, 0x1.3ccbfa4112a58p-652 }, + { 0x1.2ece3803d8d68p-658, 0x1.b8917a154498bp-653 }, + { 0x1.a4eb0c6436cf4p-659, 0x1.324fa05e3adc4p-653 }, + { 0x1.2483e8ac9d061p-659, 0x1.a9e1bcd30af1fp-654 }, + { 0x1.9683cf6400112p-660, 0x1.28071ce79e917p-654 }, + { 0x1.1a6fd716c7c18p-660, 0x1.9b7be1e1550cbp-655 }, + { 0x1.8869b9cc95345p-661, 0x1.1df33948493fap-655 }, + { 0x1.10929dfe85b79p-661, 0x1.8d60f37a227b9p-656 }, + { 0x1.7a9d9444b613ep-662, 0x1.1414a4b7a1729p-656 }, + { 0x1.06ecbe9338febp-662, 0x1.7f91d72bfd333p-657 }, + { 0x1.6d2003c3fdf54p-663, 0x1.0a6bf4c7a4f95p-657 }, + { 0x1.fafd4238f8063p-664, 0x1.720f4eaaf4bbbp-658 }, + { 0x1.5ff18a8317f0ap-664, 0x1.00f9a5fe04069p-658 }, + { 0x1.e8912b5139031p-665, 0x1.64d9f8b065b73p-659 }, + { 0x1.531288f8c01c7p-665, 0x1.ef7c38ee94e41p-660 }, + { 0x1.d695a98770e4bp-666, 0x1.57f251e86550ep-660 }, + { 0x1.46833ee262b1p-666, 0x1.dd73492689d2p-661 }, + { 0x1.c50b006d4e015p-667, 0x1.4b58b5eba6cc7p-661 }, + { 0x1.3a43cc572b3d3p-667, 0x1.cbd8e7539eac7p-662 }, + { 0x1.b3f14799b1616p-668, 0x1.3f0d6044b145dp-662 }, + { 0x1.2e5432e458097p-668, 0x1.baad518e7426ep-663 }, + { 0x1.a3486c40b74f1p-669, 0x1.33106d7f3cac9p-663 }, + { 0x1.22b456b1a8db7p-669, 0x1.a9f09adee91e3p-664 }, + { 0x1.931032d667261p-670, 0x1.2761dc408f1efp-664 }, + { 0x1.1763ffacc46acp-670, 0x1.99a2acce5bd7fp-665 }, + { 0x1.834838ba6fe3dp-671, 0x1.1c018e67b6eaep-665 }, + { 0x1.0c62daba74e7cp-671, 0x1.89c349043d67ep-666 }, + { 0x1.73eff5eb5eca5p-672, 0x1.10ef4a3481a29p-666 }, + { 0x1.01b07aeca1f42p-672, 0x1.7a520aeb63faep-667 }, + { 0x1.6506bebfc67bdp-673, 0x1.062abb7415c63p-667 }, + { 0x1.ee98b577ea7cap-674, 0x1.6b4e695e9099fp-668 }, + { 0x1.568bc5a3d72eep-674, 0x1.f766e96435041p-669 }, + { 0x1.da6bba883d22ap-675, 0x1.5cb7b85aa6067p-669 }, + { 0x1.487e1cd9f3e43p-675, 0x1.e311e0dabf963p-670 }, + { 0x1.c6d89f0368fc1p-676, 0x1.4e8d2ab5187d6p-670 }, + { 0x1.3adcb83cdccc3p-676, 0x1.cf55249e0172ap-671 }, + { 0x1.b3ddd3216f86ep-677, 0x1.40cdd3d52967cp-671 }, + { 0x1.2da66f0214306p-677, 0x1.bc2f50c60488ep-672 }, + { 0x1.a1799fd5925f4p-678, 0x1.3378a96e8e29ap-672 }, + { 0x1.20d9fd7b31257p-678, 0x1.a99ed8a2f2e6bp-673 }, + { 0x1.8faa294857a39p-679, 0x1.268c853c2e48dp-673 }, + { 0x1.147606d4e1ee3p-679, 0x1.97a2092e9b19dp-674 }, + { 0x1.7e6d714d6fce7p-680, 0x1.1a0826b9b2f1ep-674 }, + { 0x1.087916d26f37cp-680, 0x1.86370b7b69b46p-675 }, + { 0x1.6dc159d3dbce3p-681, 0x1.0dea34dab05c3p-675 }, + { 0x1.f9c3470942341p-682, 0x1.755be71f29feap-676 }, + { 0x1.5da3a74ec8bc7p-682, 0x1.02313fbe40a01p-676 }, + { 0x1.e35c1df5edf07p-683, 0x1.650e8497f58cdp-677 }, + { 0x1.4e120315adc06p-683, 0x1.edb784bbee452p-678 }, + { 0x1.cdb951dc67cbfp-684, 0x1.554cafa9d0c34p-678 }, + { 0x1.3f09fdba5037ep-684, 0x1.d7d0486e476ccp-679 }, + { 0x1.b8d760c6a3faap-685, 0x1.461419b3892c2p-679 }, + { 0x1.308911536a23dp-685, 0x1.c2a975dad9bep-680 }, + { 0x1.a4b2aa8c000cap-686, 0x1.37625bf981bdbp-680 }, + { 0x1.228ca3bac6e07p-686, 0x1.ae3f97cbb25cep-681 }, + { 0x1.914773f3bbbacp-687, 0x1.2934f9e530badp-681 }, + { 0x1.151208bdc254ep-687, 0x1.9a8f1bb2e0d78p-682 }, + { 0x1.7e91e9c37a26bp-688, 0x1.1b8963382a86p-682 }, + { 0x1.0816843f2edd8p-688, 0x1.879454bd5bf1ap-683 }, + { 0x1.6c8e23b87885fp-689, 0x1.0e5cf631ac83bp-683 
}, + { 0x1.f72e98937c4f8p-690, 0x1.754b7ed21d736p-684 }, + { 0x1.5b38276a48eap-690, 0x1.01ad01a5b2ddp-684 }, + { 0x1.df23162441e8bp-691, 0x1.63b0c17c2afp-685 }, + { 0x1.4a8beb16012edp-691, 0x1.eaed8e09770edp-686 }, + { 0x1.c804c1d0522ebp-692, 0x1.52c032be62aabp-686 }, + { 0x1.3a855850eeeeap-692, 0x1.d36ef8a6e08fap-687 }, + { 0x1.b1cdcc2ca0214p-693, 0x1.4275d9d00481dp-687 }, + { 0x1.2b204ea20186ep-693, 0x1.bcd89c2310d59p-688 }, + { 0x1.9c78595e362cep-694, 0x1.32cdb1c10f0eep-688 }, + { 0x1.1c58a6013aaeep-694, 0x1.a724c21e93002p-689 }, + { 0x1.87fe848fd6bffp-695, 0x1.23c3ac05a8c19p-689 }, + { 0x1.0e2a313c94bb5p-695, 0x1.924da8624908p-690 }, + { 0x1.745a6341bd9d3p-696, 0x1.1553b2e7eba16p-690 }, + { 0x1.0090c041eb55fp-696, 0x1.7e4d844204d5fp-691 }, + { 0x1.61860872f36c7p-697, 0x1.0779abdf88654p-691 }, + { 0x1.e710449b20327p-698, 0x1.6b1e85d9cfdc3p-692 }, + { 0x1.4f7b87a3ccd22p-698, 0x1.f462f39da55f5p-693 }, + { 0x1.ce184ffaa0275p-699, 0x1.58badb2559681p-693 }, + { 0x1.3e34f7b15484dp-699, 0x1.daedfe49c8a9fp-694 }, + { 0x1.b6314a8f93441p-700, 0x1.471cb2f12adecp-694 }, + { 0x1.2dac75898461p-700, 0x1.c28c3fc94131bp-695 }, + { 0x1.9f52e6b0168fbp-701, 0x1.363e3fa56683p-695 }, + { 0x1.1ddc26b854422p-701, 0x1.ab358720f461fp-696 }, + { 0x1.8974e49b18481p-702, 0x1.2619b9e9f9276p-696 }, + { 0x1.0ebe3bcdc6652p-702, 0x1.94e1adf5ef17ap-697 }, + { 0x1.748f15c14a99p-703, 0x1.16a96324493c1p-697 }, + { 0x1.004cf29d383afp-703, 0x1.7f889bf8109c7p-698 }, + { 0x1.60995fd7916b4p-704, 0x1.07e787ce8decbp-698 }, + { 0x1.e50530acb7a2bp-705, 0x1.6b224a16aa4ep-699 }, + { 0x1.4d8bbfb38c98p-705, 0x1.f39d03522ee6ep-700 }, + { 0x1.cab316f0b29dep-706, 0x1.57a6c57f8fed2p-700 }, + { 0x1.3b5e4bf3051bbp-706, 0x1.d8b1738bdcb74p-701 }, + { 0x1.b1987b3f62cd2p-707, 0x1.450e32693ba8dp-701 }, + { 0x1.2a09376f26716p-707, 0x1.bf0154de94403p-702 }, + { 0x1.99aa6a5f22416p-708, 0x1.3350cea8cd61ap-702 }, + { 0x1.1984d37c8d151p-708, 0x1.a681c1d2f0b94p-703 }, + { 0x1.82de1daeb9c47p-709, 0x1.2266f414ce57bp-703 }, + { 0x1.09c991f950457p-709, 0x1.8f27fe21c9591p-704 }, + { 0x1.6d28fdea9871ap-710, 0x1.12491ab5c17d9p-704 }, + { 0x1.f5a00e548f085p-711, 0x1.78e979aa0c9bep-705 }, + { 0x1.5880a5ae03598p-711, 0x1.02efdac5a4ff4p-705 }, + { 0x1.d921d6d1c821bp-712, 0x1.63bbd32217718p-706 }, + { 0x1.44dae3b23367bp-712, 0x1.e8a7dcff4677cp-707 }, + { 0x1.be0a394617721p-713, 0x1.4f94da865b2a3p-707 }, + { 0x1.322dbccd73cabp-713, 0x1.ccdc67829105bp-708 }, + { 0x1.a44b3f5ce9c8bp-714, 0x1.3c6a934743c05p-708 }, + { 0x1.206f6db46b93p-714, 0x1.b26f5afd4ebc9p-709 }, + { 0x1.8bd742e227a38p-715, 0x1.2a3336386b4d7p-709 }, + { 0x1.0f966c7fd2396p-715, 0x1.99530a15ce61ap-710 }, + { 0x1.74a0efc06d36ep-716, 0x1.18e533433f227p-710 }, + { 0x1.ff32d3f1c0a49p-717, 0x1.817a166d90dbdp-711 }, + { 0x1.5e9b45aff1bep-717, 0x1.087732df4f3abp-711 }, + { 0x1.e0dea55db81c4p-718, 0x1.6ad7728d6db01p-712 }, + { 0x1.49b9999981d6cp-718, 0x1.f1c02ea5235f3p-713 }, + { 0x1.c41e9fb058b1ep-719, 0x1.555e63841a093p-713 }, + { 0x1.35ef96b0fe655p-719, 0x1.d42dfb77e321ep-714 }, + { 0x1.a8e19002cb47fp-720, 0x1.4102823a6a0a2p-714 }, + { 0x1.23313f4adb099p-720, 0x1.b8267dd51660dp-715 }, + { 0x1.8f16bf19917acp-721, 0x1.2db7bc80b123ep-715 }, + { 0x1.1172ed701cd4p-721, 0x1.9d98e007ff597p-716 }, + { 0x1.76adf2095d808p-722, 0x1.1b7255d8af1cep-716 }, + { 0x1.00a953345bce4p-722, 0x1.8474c5f89cf1fp-717 }, + { 0x1.5f976a86ba7a3p-723, 0x1.0a26e7ff7c8ap-717 }, + { 0x1.e192f5a290a0dp-724, 0x1.6caa4dc34bcc6p-718 }, + { 0x1.49c3e6e576cf8p-724, 0x1.f394c675d5da1p-719 }, + { 0x1.c3918d16606afp-725, 0x1.562a0ffd36fefp-719 }, + 
{ 0x1.3524a1ccb90cep-725, 0x1.d4a41cdb95576p-720 }, + { 0x1.a739e0c3f00b3p-726, 0x1.40e51faa74ee4p-720 }, + { 0x1.21ab51a49a64p-726, 0x1.b7670ded07be7p-721 }, + { 0x1.8c781323e2b8bp-727, 0x1.2ccd09eaa341p-721 }, + { 0x1.0f4a27c210b83p-727, 0x1.9bc980b6cd88bp-722 }, + { 0x1.7338f3cfd4b18p-728, 0x1.19d3d560c7458p-722 }, + { 0x1.fbe79eabbab8bp-729, 0x1.81b807901b2ddp-723 }, + { 0x1.5b69fdd784131p-729, 0x1.07ec015b26bbfp-723 }, + { 0x1.db36d8463b3e1p-730, 0x1.691fdebe382bep-724 }, + { 0x1.44f955c9776f6p-730, 0x1.ee11097f70374p-725 }, + { 0x1.bc693203fe92cp-731, 0x1.51eeeac7320bep-725 }, + { 0x1.2fd5c7756dd24p-731, 0x1.ce39998362bf9p-726 }, + { 0x1.9f66cc65fb2cbp-732, 0x1.3c13b67a17ff2p-726 }, + { 0x1.1beec36eb8502p-732, 0x1.b03976c943068p-727 }, + { 0x1.8418af0dd65edp-733, 0x1.277d70b2ebc6fp-727 }, + { 0x1.09345c546e7cdp-733, 0x1.93f94ba2c6b6ap-728 }, + { 0x1.6a68c4bfd764bp-734, 0x1.141be9e049453p-728 }, + { 0x1.ef2e87ca7b717p-735, 0x1.7962a50231832p-729 }, + { 0x1.5241d71eb6e19p-735, 0x1.01df915097b64p-729 }, + { 0x1.ce118fc8beeeap-736, 0x1.605fee84767fp-730 }, + { 0x1.3b8f8a28fd848p-736, 0x1.e172e498cd2fcp-731 }, + { 0x1.aef59daa19c93p-737, 0x1.48dc6e3757e71p-731 }, + { 0x1.263e577f574dp-737, 0x1.c1366206ca036p-732 }, + { 0x1.91bfa9231de5cp-738, 0x1.32c440230ef3ap-732 }, + { 0x1.123b897af1af4p-738, 0x1.a2ee0ea25a216p-733 }, + { 0x1.7655cd85a2773p-739, 0x1.1e04519eb8f87p-733 }, + { 0x1.feea6c3554149p-740, 0x1.867f82bdccb8fp-734 }, + { 0x1.5c9f427a491a4p-740, 0x1.0a8a5c7678dffp-734 }, + { 0x1.dbb4739afff2ep-741, 0x1.6bd1744d1513ep-735 }, + { 0x1.4484548d479a3p-741, 0x1.f089c3d3d8b6fp-736 }, + { 0x1.bab46440d8e4bp-742, 0x1.52cbafb8bc99fp-736 }, + { 0x1.2dee5d96e696ep-742, 0x1.ce464b1286c0dp-737 }, + { 0x1.9bcaf0aad775cp-743, 0x1.3b571085ef9dbp-737 }, + { 0x1.18c7bd07b007fp-743, 0x1.ae2a4fedee59cp-738 }, + { 0x1.7eda37d26ae66p-744, 0x1.255d79dbe3905p-738 }, + { 0x1.04fbd01fd3b9ap-744, 0x1.9017432798e26p-739 }, + { 0x1.63c5ba199716fp-745, 0x1.10c9ceee61d28p-739 }, + { 0x1.e4edd431a7a4p-746, 0x1.73effa34f57abp-740 }, + { 0x1.4a724e2f6eadep-746, 0x1.fb0fd6a99ec28p-741 }, + { 0x1.c24c9890314cdp-747, 0x1.5998a4600495bp-741 }, + { 0x1.32c615eef6a3dp-747, 0x1.d70936a92f04ap-742 }, + { 0x1.a1f03c81340fdp-748, 0x1.40f6bfdad1f14p-742 }, + { 0x1.1ca87340e1c39p-748, 0x1.b55b284add8c1p-743 }, + { 0x1.83b6cbf2ba29fp-749, 0x1.29f10ece9036ep-743 }, + { 0x1.0801fd07f7284p-749, 0x1.95e2d86ae92c8p-744 }, + { 0x1.677ffffc31b92p-750, 0x1.146f8c6e8dc57p-744 }, + { 0x1.e978e83ebd95dp-751, 0x1.787f26e598ebbp-745 }, + { 0x1.4d2d2f5dd4096p-751, 0x1.005b6216a17eap-745 }, + { 0x1.c58570e2f641dp-752, 0x1.5d10973fbab06p-746 }, + { 0x1.34a13f272cdfap-752, 0x1.db3db8f832a58p-747 }, + { 0x1.a4017c5ace0dep-753, 0x1.4379416dfac63p-747 }, + { 0x1.1dc0938cfb932p-753, 0x1.b84ac1ef46255p-748 }, + { 0x1.84c7064147f81p-754, 0x1.2b9cc2c3d6738p-748 }, + { 0x1.087100f5e6429p-754, 0x1.97b6c5dc3637ap-749 }, + { 0x1.67b20873fc995p-755, 0x1.15602f1227af8p-749 }, + { 0x1.e9337a8979dap-756, 0x1.795cb2bb480b6p-750 }, + { 0x1.4ca0667456eb8p-756, 0x1.00aa01fc8a73ep-750 }, + { 0x1.c446a2ccade1cp-757, 0x1.5d196927cdaccp-751 }, + { 0x1.3371d92c55c69p-757, 0x1.dac421184af19p-752 }, + { 0x1.a1ef1650d3562p-758, 0x1.42cba823b93cbp-752 }, + { 0x1.1c07db1df4cf6p-758, 0x1.b6e2f60b615c1p-753 }, + { 0x1.8202debc2593cp-759, 0x1.2a53f94211ba9p-753 }, + { 0x1.064595037ce7bp-759, 0x1.95853e0fd75adp-754 }, + { 0x1.645a58ac6913cp-760, 0x1.13949d3b2fbd2p-754 }, + { 0x1.e41f95cc492cep-761, 0x1.768213ee2ba9cp-755 }, + { 0x1.48d0194e5b153p-761, 0x1.fce2f1e195a7ap-756 
}, + { 0x1.be99935f38c42p-762, 0x1.59b2d772c1b04p-756 }, + { 0x1.2f40d4a5d287p-762, 0x1.d5a005ce1b15dp-757 }, + { 0x1.9bc8aa74c3805p-763, 0x1.3ef3138f8ae58p-757 }, + { 0x1.178b448b82b16p-763, 0x1.b12e626e3c8a1p-758 }, + { 0x1.7b7f2dc7fa066p-764, 0x1.2620652c3102cp-758 }, + { 0x1.0190106456396p-764, 0x1.8f5ecffd9c995p-759 }, + { 0x1.5d92194746ef2p-765, 0x1.0f1a62a97a48ep-759 }, + { 0x1.da636b2add63ap-766, 0x1.7004d0a0dd3fcp-760 }, + { 0x1.41d8f14e2d235p-766, 0x1.f38508375a815p-761 }, + { 0x1.b4a8e16df3a2ep-767, 0x1.52f67f4a45dbdp-761 }, + { 0x1.282da2ee06e9fp-767, 0x1.cbf8187da97p-762 }, + { 0x1.91bc4f0e82a1p-768, 0x1.380c6fa6ddd1bp-762 }, + { 0x1.106c65473611bp-768, 0x1.a757e44dde4fbp-763 }, + { 0x1.716ca73d3a1dcp-769, 0x1.1f218f165083cp-763 }, + { 0x1.f4e737e667fe6p-770, 0x1.8571975a9ba0cp-764 }, + { 0x1.538bdbc88035p-770, 0x1.081306aee058bp-764 }, + { 0x1.cc4774fe05a13p-771, 0x1.661571375ee31p-765 }, + { 0x1.37eeb586702afp-771, 0x1.e5803c9b677cp-766 }, + { 0x1.a6be51e94d2c3p-772, 0x1.49169d29f057fp-766 }, + { 0x1.1e6cae3cc5ce4p-772, 0x1.be144165bfdadp-767 }, + { 0x1.841452e30c6ecp-773, 0x1.2e4b0b7596d86p-767 }, + { 0x1.06dfcc0330324p-773, 0x1.99a8814f82396p-768 }, + { 0x1.64157d8dbcaa1p-774, 0x1.158b4c1d7aa61p-768 }, + { 0x1.e248fc3725278p-775, 0x1.7806fe5adc0dep-769 }, + { 0x1.4691284199248p-775, 0x1.fd64d63539ac4p-770 }, + { 0x1.ba32f675bcca1p-776, 0x1.58fd2560c98e3p-770 }, + { 0x1.2b59cb5fcd07p-776, 0x1.d33b9c01b8858p-771 }, + { 0x1.953f4278d9771p-777, 0x1.3c5b9e7be019ep-771 }, + { 0x1.1244d4a198783p-777, 0x1.ac5a261b57bd2p-772 }, + { 0x1.7333ac721d353p-778, 0x1.21f61f6e6a3a5p-772 }, + { 0x1.f654f8b2c9938p-779, 0x1.8883e334bf813p-773 }, + { 0x1.53d9d5f4e3889p-779, 0x1.09a33ffab8174p-773 }, + { 0x1.cbcb3935e8707p-780, 0x1.678037d69a88ap-774 }, + { 0x1.36fefd85e37f7p-780, 0x1.e678a0474dd4dp-775 }, + { 0x1.a4a7147e53789p-781, 0x1.491a44a8cc267p-775 }, + { 0x1.1c73c8c2f3143p-781, 0x1.bd3a60953bab8p-776 }, + { 0x1.80a7df6e9e4abp-782, 0x1.2d20af56e98e4p-776 }, + { 0x1.040c111171b21p-782, 0x1.9748563f2a02cp-777 }, + { 0x1.5f9153468350dp-783, 0x1.13656dff66048p-777 }, + { 0x1.db3d65827b6f1p-784, 0x1.7463a2ae57157p-778 }, + { 0x1.412b4a3b0b6bbp-784, 0x1.f77b2a384d071p-779 }, + { 0x1.b20abd232bd72p-785, 0x1.5451ae34b02aep-779 }, + { 0x1.25417f5fe18aap-785, 0x1.cc024fa52d21ep-780 }, + { 0x1.8c38db09c3d68p-786, 0x1.36dbe645ba702p-780 }, + { 0x1.0ba351c6b2c44p-786, 0x1.a415d531b6e85p-781 }, + { 0x1.69856de02317p-787, 0x1.1bcf7eeeba2f5p-781 }, + { 0x1.e847157246bfcp-788, 0x1.7f70703ac5558p-782 }, + { 0x1.49b2d16422141p-788, 0x1.02fd377359b1p-782 }, + { 0x1.bd304de355d85p-789, 0x1.5dd1b0bb84b26p-783 }, + { 0x1.2c87c2ff697dcp-789, 0x1.d87243e77ecadp-784 }, + { 0x1.95b4456f24a66p-790, 0x1.3efdb3b369292p-784 }, + { 0x1.11cf1a60f1d84p-790, 0x1.aeb4dc01a4631p-785 }, + { 0x1.718a9184a8678p-791, 0x1.22bcd99dbdb06p-785 }, + { 0x1.f2af0be1fde49p-792, 0x1.88766c06b0833p-786 }, + { 0x1.507007917e3d9p-792, 0x1.08db80d427d79p-786 }, + { 0x1.c5e695f15072bp-793, 0x1.65709eb54bf5ep-787 }, + { 0x1.32266540e08c2p-793, 0x1.e253876b38acep-788 }, + { 0x1.9cf012acb820bp-794, 0x1.45623a2f6a451p-788 }, + { 0x1.1673fda512b46p-794, 0x1.b6f674d703273p-789 }, + { 0x1.777d05328bd26p-795, 0x1.280eca736b4b1p-789 }, + { 0x1.fa46d62b8e57dp-796, 0x1.8f4d804e3ad6fp-790 }, + { 0x1.5544c8bc23e1cp-796, 0x1.0d3e50a2eecdcp-790 }, + { 0x1.cc068b1dc8ab2p-797, 0x1.6b0c7763ce52bp-791 }, + { 0x1.36042b906571p-797, 0x1.e979edc5b3767p-792 }, + { 0x1.a1cbbab815b4cp-798, 0x1.49ecd657d5dd6p-792 }, + { 0x1.197d0fe71564cp-798, 0x1.bcb59141dc715p-793 
}, + { 0x1.7b41f3bcb1869p-799, 0x1.2bad65a82bb23p-793 }, + { 0x1.feec24eca8006p-800, 0x1.93d6de18ac6bfp-794 }, + { 0x1.581b387627669p-800, 0x1.1011dd6dfecf6p-794 }, + { 0x1.cf746ccaba032p-801, 0x1.6e8be31f2fe24p-795 }, + { 0x1.380f8b864e1acp-801, 0x1.edc51c8649aaap-796 }, + { 0x1.a4312cc2f816ap-802, 0x1.4c88f43732a1p-796 }, + { 0x1.1adc83c96accfp-802, 0x1.bfd81ed74f1cdp-797 }, + { 0x1.7cc835281bbf3p-803, 0x1.2d883a292df3bp-797 }, + { 0x1.0044e6f2b903fp-803, 0x1.95fde403b5724p-798 }, + { 0x1.58e66674c0f82p-804, 0x1.11494966870b7p-798 }, + { 0x1.d0209514d613dp-805, 0x1.6fdef1ca550b3p-799 }, + { 0x1.383f2f4495aedp-805, 0x1.ef217eb67d36dp-800 }, + { 0x1.a41575f0363d6p-806, 0x1.4d2aaa5b8e28ap-800 }, + { 0x1.1a8c12a0cae91p-806, 0x1.c04fcbf1fddd8p-801 }, + { 0x1.7c08d08f2ccbbp-807, 0x1.2d96cdd2a30b8p-801 }, + { 0x1.ff186c5b90604p-808, 0x1.95b8ba50a2687p-802 }, + { 0x1.57a2b0b1c4c86p-808, 0x1.10df03cd711e3p-802 }, + { 0x1.ce07ef98af2aep-809, 0x1.6eff939f51c8fp-803 }, + { 0x1.36923c5eb270bp-809, 0x1.ed88d96607fb4p-804 }, + { 0x1.a1791489717bfp-810, 0x1.4bcf1445c1d61p-804 }, + { 0x1.188d2c2d680a3p-810, 0x1.be1a747b458c8p-805 }, + { 0x1.7907312c7e255p-811, 0x1.2bd8dde16ba8ap-805 }, + { 0x1.fa9e995f4c414p-812, 0x1.93089dc23e417p-806 }, + { 0x1.5455df149c7b5p-812, 0x1.0ed4f34d6e965p-806 }, + { 0x1.c93410e8142f8p-813, 0x1.6bf1c754a3325p-807 }, + { 0x1.33105a5b594f7p-813, 0x1.e9027b1c5a4abp-808 }, + { 0x1.9c67f441e11b3p-814, 0x1.487c687197597p-808 }, + { 0x1.14e8ebae7496ep-814, 0x1.b942323a72767p-809 }, + { 0x1.73d10c597b774p-815, 0x1.285660efb3e9ap-809 }, + { 0x1.f330b99c7f9e7p-816, 0x1.8df9d62fb9c5ep-810 }, + { 0x1.4f0ef77c81a6fp-816, 0x1.0b34677fe9486p-810 }, + { 0x1.c1baedb5f2e65p-817, 0x1.66c37bb05de1ep-811 }, + { 0x1.2dc9788ad9864p-817, 0x1.e1a30436bcde5p-812 }, + { 0x1.94f913add4907p-818, 0x1.4341c90c553e7p-812 }, + { 0x1.0fafd2c40ba27p-818, 0x1.b1dd0ffc5d04bp-813 }, + { 0x1.6c7df995241d1p-819, 0x1.231f4a6757469p-813 }, + { 0x1.e8f062cc963cep-820, 0x1.86a35930ed5e1p-814 }, + { 0x1.47e5cbff0d92ep-820, 0x1.060dd236f49a3p-814 }, + { 0x1.b7be34be4e18dp-821, 0x1.5f8c25cd122d7p-815 }, + { 0x1.26d5559b935e7p-821, 0x1.d78bca82e9f37p-816 }, + { 0x1.8b4dd6af9c05dp-822, 0x1.3c36d15093021p-816 }, + { 0x1.08f94cfc79158p-822, 0x1.a80c62c44a65bp-817 }, + { 0x1.632ec0e0d009cp-823, 0x1.1c4b11ed6627ap-817 }, + { 0x1.dc0b5f2e40ea4p-824, 0x1.7d261cc2edf72p-818 }, + { 0x1.3efa480ea698bp-824, 0x1.fef096f5252fp-819 }, + { 0x1.ab6a5245de9e5p-825, 0x1.566c107178d1fp-819 }, + { 0x1.1e52cde409267p-825, 0x1.cae9de8f00c0bp-820 }, + { 0x1.7f910d0084829p-826, 0x1.337ae444bd293p-820 }, + { 0x1.00e3012bd4171p-826, 0x1.9bfbcfe9dc1e8p-821 }, + { 0x1.580c66bfc7cf5p-827, 0x1.13f803c0631d9p-821 }, + { 0x1.ccba595fe34b5p-828, 0x1.71ac2109d33c9p-822 }, + { 0x1.347383dcf4a9bp-828, 0x1.ef21caa7d80c3p-823 }, + { 0x1.9cf52785fcd1fp-829, 0x1.4b8b6bbdb7a4fp-823 }, + { 0x1.1466f7a4ba4b3p-829, 0x1.bbf4bcf8ca0c3p-824 }, + { 0x1.71f5b701cb667p-830, 0x1.2934441fdae8bp-824 }, + { 0x1.ef1fef5338f87p-831, 0x1.8de00a5d4cff3p-825 }, + { 0x1.4b46ffc2e70ccp-831, 0x1.0a4a61359d63ap-825 }, + { 0x1.bb3f3e667d5e5p-832, 0x1.64673b39bdd54p-826 }, + { 0x1.287ea78b8278fp-832, 0x1.dcf3acd0cc1f4p-827 }, + { 0x1.8c9c8347a2863p-833, 0x1.3f1926f0c2aa4p-827 }, + { 0x1.093c166d47d9p-833, 0x1.aaecb94ca24e1p-828 }, + { 0x1.62b5957e6b822p-834, 0x1.1d8efbbc88d6cp-828 }, + { 0x1.da4f3c5b8c56fp-835, 0x1.7df554174928cp-829 }, + { 0x1.3d1457a1afdaep-835, 0x1.fed6b4a9440a8p-830 }, + { 0x1.a7e3665ffae25p-836, 0x1.558fae0fed7aap-830 }, + { 0x1.1b4da97b89113p-836, 
0x1.c8b307e047613p-831 }, + { 0x1.7aa46b2ec675cp-837, 0x1.3149a005e5984p-831 }, + { 0x1.fa00e080e536p-838, 0x1.9819329634547p-832 }, + { 0x1.520f92dcad4a2p-838, 0x1.10bba52994e8ep-832 }, + { 0x1.c3a9666328faap-839, 0x1.6c7dd2d93c0f9p-833 }, + { 0x1.2dae795ce73b6p-839, 0x1.e70fd5d6d806dp-834 }, + { 0x1.92f5963d343cfp-840, 0x1.45629dffe1fa7p-834 }, + { 0x1.0d15f439254bep-840, 0x1.b2b2e959996bp-835 }, + { 0x1.675546ac2c967p-841, 0x1.2255364dfcfd7p-835 }, + { 0x1.dfca1ff236f02p-842, 0x1.83c6a3841fccap-836 }, + { 0x1.4046155930cfbp-842, 0x1.02ee197efc99dp-836 }, + { 0x1.ab8846c89a496p-843, 0x1.59bfc8bdbfffep-837 }, + { 0x1.1d5226b496f7ep-843, 0x1.cd9f4c973304p-838 }, + { 0x1.7cc7edd2bedd1p-844, 0x1.3420703d360eap-838 }, + { 0x1.fc1e021531b11p-845, 0x1.9b4a6e4580455p-839 }, + { 0x1.52f9fd29afa7bp-845, 0x1.1276cde31355ep-839 }, + { 0x1.c439018f9e7bp-846, 0x1.6e44a0da72dedp-840 }, + { 0x1.2d9d4a3bfacfap-846, 0x1.e8b82d35e9882p-841 }, + { 0x1.9247c7d6b7109p-847, 0x1.4603c1a2de688p-841 }, + { 0x1.0c3d4d5746632p-847, 0x1.b2e6fa531d555p-842 }, + { 0x1.65add59367765p-848, 0x1.220b241172407p-842 }, + { 0x1.dce1e8301e6efp-849, 0x1.82d28ae825549p-843 }, + { 0x1.3dde18cb97a8dp-849, 0x1.01ea51e3f541cp-843 }, + { 0x1.a7b31ccb0b2f4p-850, 0x1.57e3d8e31e749p-844 }, + { 0x1.1a59798dd7aa2p-850, 0x1.ca77ce984ce61p-845 }, + { 0x1.7843a7981f8e3p-851, 0x1.3192c63185ef2p-845 }, + { 0x1.f55b0f3ffe463p-852, 0x1.974911a73b1a7p-846 }, + { 0x1.4df9fe655b0fbp-852, 0x1.0f64b579273f6p-846 }, + { 0x1.bce68ce6bcfedp-853, 0x1.69a3e1bad13dap-847 }, + { 0x1.284bfe1cdea24p-853, 0x1.e1d6859c11527p-848 }, + { 0x1.8a9c29acbf47dp-854, 0x1.40f425a16dca3p-848 }, + { 0x1.06bd70b72892bp-854, 0x1.ab8633790b1e2p-849 }, + { 0x1.5dd55c1a48477p-855, 0x1.1cb4a43b9229fp-849 }, + { 0x1.d1bd6b173b9f2p-856, 0x1.7b25cc6523c3bp-850 }, + { 0x1.35fc8451ff49ep-856, 0x1.f8db2dc70232bp-851 }, + { 0x1.9c9712232f548p-857, 0x1.5014bc06e7f91p-851 }, + { 0x1.128b47439dcd5p-857, 0x1.bf66ba3b9066cp-852 }, + { 0x1.6d53d2be0a0b6p-858, 0x1.29c2c1dc958dbp-852 }, + { 0x1.e6122171333dfp-859, 0x1.8c4a9d76af90fp-853 }, + { 0x1.435229d0cc681p-859, 0x1.07ae5a7347d0bp-853 }, + { 0x1.ae1371b74ea2dp-860, 0x1.5ed9539dfd0c9p-854 }, + { 0x1.1e01427183001p-860, 0x1.d2c69c7599edcp-855 }, + { 0x1.7c589442700ecp-861, 0x1.3677341a98a13p-855 }, + { 0x1.f9be9e1d7b4e4p-862, 0x1.9cf2c5625685ep-856 }, + { 0x1.5033c96eb757p-862, 0x1.1298aebe8af0fp-856 }, + { 0x1.bef014f36ffa9p-863, 0x1.6d2655c8560ebp-857 }, + { 0x1.290979be09b3bp-863, 0x1.e58166789d0bcp-858 }, + { 0x1.8ac6ba86dcc3cp-864, 0x1.42b9e90b536b6p-858 }, + { 0x1.064e638fb2517p-864, 0x1.acfe7e64002b1p-859 }, + { 0x1.5c884857d8adep-865, 0x1.1d179e12ade6ep-859 }, + { 0x1.cf0beaeb1b319p-866, 0x1.7ae01eb0f55cbp-860 }, + { 0x1.338e29511ffcdp-866, 0x1.f772a9e0423a1p-861 }, + { 0x1.9881a23b2ff9bp-867, 0x1.4e72e15f0f016p-861 }, + { 0x1.0f43798c4f845p-867, 0x1.bc4e2f5a8c9afp-862 }, + { 0x1.6836e63bd7d88p-868, 0x1.27165d875ec78p-862 }, + { 0x1.de466f9c32fdap-869, 0x1.87eb54ae1860dp-863 }, + { 0x1.3d79f883687bfp-869, 0x1.043b38d103ec9p-863 }, + { 0x1.a56d48500b8a3p-870, 0x1.598a7d65e3b67p-864 }, + { 0x1.17ac327f9b5e5p-870, 0x1.cac2d1ee89db1p-865 }, + { 0x1.73278f241bb95p-871, 0x1.308090afcd9f3p-865 }, + { 0x1.ec801820c3f3dp-872, 0x1.942d41e7bf2a3p-866 }, + { 0x1.46b841565ab3ep-872, 0x1.0c34dc595f4bfp-866 }, + { 0x1.b16ea850bfa34p-873, 0x1.63e9cb83e74b2p-867 }, + { 0x1.1f76e44abf0ecp-873, 0x1.d83e5a3ffd7adp-868 }, + { 0x1.7d432d7dd0ca1p-874, 0x1.39428e0fd00c5p-868 }, + { 0x1.f99abec00b682p-875, 0x1.9f8c2eadfb109p-869 }, + { 
0x1.4f35579392d4bp-875, 0x1.13957092e7741p-869 }, + { 0x1.bc6c19eee10e8p-876, 0x1.6d7ad6ac744f9p-870 }, + { 0x1.2692d6adc530fp-876, 0x1.e4a41e3c393c2p-871 }, + { 0x1.8673fad41c337p-877, 0x1.4149a31665d1ep-871 }, + { 0x1.02bd066e6e446p-877, 0x1.a9efbad7c9909p-872 }, + { 0x1.56dece3f159c3p-878, 0x1.1a4d14ca40e6p-872 }, + { 0x1.c64dabfd6babdp-879, 0x1.7628f37011dc7p-873 }, + { 0x1.2cf07ed3ac7cap-879, 0x1.efd93aae49244p-874 }, + { 0x1.8ea5cdb1b77f8p-880, 0x1.4884565714d83p-874 }, + { 0x1.0801f05da3babp-880, 0x1.b341347ab9d2ep-875 }, + { 0x1.5da3ba0723cbcp-881, 0x1.204d0f497ca7dp-875 }, + { 0x1.cefd7b19fc691p-882, 0x1.7de10a24a9be3p-876 }, + { 0x1.3281b7ca3d771p-882, 0x1.f9c4f419d97b9p-877 }, + { 0x1.95c663259c5d8p-883, 0x1.4ee2a6bb63f1dp-877 }, + { 0x1.0c90568fe453bp-883, 0x1.bb6bea4d790c6p-878 }, + { 0x1.6374ef6370a23p-884, 0x1.258802fee3a1bp-878 }, + { 0x1.d668024e6e773p-885, 0x1.8491dcb50d65p-879 }, + { 0x1.3739f6c74a992p-885, 0x1.012888bcf5e1bp-879 }, + { 0x1.9bc5a2748239p-886, 0x1.5456466d99824p-880 }, + { 0x1.105de86fb726ep-886, 0x1.c25d7813e5a28p-881 }, + { 0x1.68453b252f9afp-887, 0x1.29f220ff323bdp-881 }, + { 0x1.dc7c640bf856fp-888, 0x1.8a2c46b36447dp-882 }, + { 0x1.3b0e7a2d8004dp-888, 0x1.04b5178932d9ep-882 }, + { 0x1.a095d99893beap-889, 0x1.58d2d04dcdef9p-883 }, + { 0x1.1361f24d04a1ep-889, 0x1.c8060b8a624d8p-884 }, + { 0x1.6c0994513d45bp-890, 0x1.2d8154e3020f5p-884 }, + { 0x1.e12caa0268707p-891, 0x1.8ea37661d565fp-885 }, + { 0x1.3df6725a60cf5p-891, 0x1.078003d294269p-885 }, + { 0x1.a42bf15180a09p-892, 0x1.5c4df6da1a5fp-886 }, + { 0x1.15957e82800c6p-892, 0x1.cc58a0676d26ep-887 }, + { 0x1.6eb9463d29a0dp-893, 0x1.302d6b1661efp-887 }, + { 0x1.e46dfa81a2018p-894, 0x1.91ed1d851d1ddp-888 }, + { 0x1.3feb236502138p-894, 0x1.0982d94421652p-888 }, + { 0x1.a67f97b02e026p-895, 0x1.5ebfab91b4a2bp-889 }, + { 0x1.16f37032d6085p-895, 0x1.cf4b3235443f5p-890 }, + { 0x1.704e120e656fdp-896, 0x1.31f0304f01ddbp-890 }, + { 0x1.e638c247f445dp-897, 0x1.940198fd0e1c2p-891 }, + { 0x1.40e7ff18c854cp-897, 0x1.0ab8eaa8fae67p-891 }, + { 0x1.a78b6039c7039p-898, 0x1.60223e0067b2cp-892 }, + { 0x1.1778970df4481p-898, 0x1.d0d6e2f89dd66p-893 }, + { 0x1.70c446e7535ccp-899, 0x1.32c589802b4bap-893 }, + { 0x1.e688d1dc06742p-900, 0x1.94dc0e4e3bd62p-894 }, + { 0x1.40eab69ffb357p-900, 0x1.0b1f64079cf15p-894 }, + { 0x1.a74cd8f49285bp-901, 0x1.607271cb1c23p-895 }, + { 0x1.1723bbb37e71p-901, 0x1.d0f815d3e30e4p-896 }, + { 0x1.701ad03f5aba2p-902, 0x1.32ab83cb1b9aap-896 }, + { 0x1.e55d6dd34aeb5p-903, 0x1.947a7e7d08e62p-897 }, + { 0x1.3ff3437e5e592p-903, 0x1.0ab555a059592p-897 }, + { 0x1.a5c493ec4b75bp-904, 0x1.5faf8b45ee11cp-898 }, + { 0x1.15f5a46f2a8c5p-904, 0x1.cfae7d166a387p-899 }, + { 0x1.6e533a1804da5p-905, 0x1.31a25c153692fp-899 }, + { 0x1.e2b951ac76b4bp-906, 0x1.92ddcdd3a585ap-900 }, + { 0x1.3e03e7aaf4a23p-906, 0x1.097bb793410b5p-900 }, + { 0x1.a2f624fa2da41p-907, 0x1.5ddb524f58124p-901 }, + { 0x1.13f112353b2e2p-907, 0x1.ccfd1b6b2b0d1p-902 }, + { 0x1.6b71aaf8395acp-908, 0x1.2fac7e1ac1a55p-902 }, + { 0x1.dea2a52e6f8d6p-909, 0x1.9009c068a7447p-903 }, + { 0x1.3b2124c85eb7dp-909, 0x1.077566199da13p-903 }, + { 0x1.9ee813dcc82f4p-910, 0x1.5afa0b60e30adp-904 }, + { 0x1.111ab5ef7d9cep-910, 0x1.c8ea38207b48cp-905 }, + { 0x1.677cd3ce598a2p-911, 0x1.2cce7b0334e93p-905 }, + { 0x1.d922e485849dfp-912, 0x1.8c04eb792831bp-906 }, + { 0x1.3751aaab95803p-912, 0x1.04a716678c7d9p-906 }, + { 0x1.99a3c2eb312dfp-913, 0x1.571266fb205e7p-907 }, + { 0x1.0d791e54efc95p-913, 0x1.c37f46c8a36cep-908 }, + { 0x1.627dd610c1f2fp-914, 0x1.290ef7aa6784ep-908 }, + 
{ 0x1.d246bba093dddp-915, 0x1.86d89be61c44fp-909 }, + { 0x1.329e3d8fc35e5p-915, 0x1.011744722e8f8p-909 }, + { 0x1.93354aecb0f91p-916, 0x1.522d67c700dd9p-910 }, + { 0x1.09149eae599f4p-916, 0x1.bcc8c2b79e5e6p-911 }, + { 0x1.5c8020a89d6a7p-917, 0x1.247692feaf7c7p-911 }, + { 0x1.ca1dd59404578p-918, 0x1.8090b25f1fb1cp-912 }, + { 0x1.2d1194826d1d9p-918, 0x1.f99c33fa36826p-913 }, + { 0x1.8bab4cd7bc185p-919, 0x1.4c563ff8738edp-913 }, + { 0x1.03f72f0fa181cp-919, 0x1.b4d5ff233ee8bp-914 }, + { 0x1.559144638d7d2p-920, 0x1.1f0fc4fe41aefp-914 }, + { 0x1.c0baa10766979p-921, 0x1.793b75fbd2367p-915 }, + { 0x1.26b830bbc4f33p-921, 0x1.efaa9eeaa4992p-916 }, + { 0x1.8316ba6f8ef74p-922, 0x1.459a26ac43fcfp-916 }, + { 0x1.fc588d5eeb3p-923, 0x1.abb8ece685efep-917 }, + { 0x1.4dc0c0d42f863p-923, 0x1.18e6b704952c1p-917 }, + { 0x1.b6320aea7077ap-924, 0x1.70e95e366ca95p-918 }, + { 0x1.1fa02ebad6485p-924, 0x1.e4700e7fab75ep-919 }, + { 0x1.798a96e59845bp-925, 0x1.3e0826243926dp-919 }, + { 0x1.ef81624855ca5p-926, 0x1.a185d71d9ae78p-920 }, + { 0x1.451fcaaed5e7p-926, 0x1.1209163a43d8ap-920 }, + { 0x1.aa9b30dd7b333p-927, 0x1.67acd56555624p-921 }, + { 0x1.17d9121b4ff43p-927, 0x1.d805487b20ec2p-922 }, + { 0x1.6f1bb0c9eff18p-928, 0x1.35b0e3e76f72ap-922 }, + { 0x1.e184bec96bcc5p-929, 0x1.965317fc3f8ebp-923 }, + { 0x1.3bc10ccdff1d7p-929, 0x1.0a85e11600392p-923 }, + { 0x1.9e0f0cdf83a76p-930, 0x1.5d99f4f4fa7a2p-924 }, + { 0x1.0f738d3253e75p-930, 0x1.ca8538b911cc2p-925 }, + { 0x1.63e056b37b486p-931, 0x1.2ca663e8f6c6ep-925 }, + { 0x1.d2806afda0512p-932, 0x1.8a38c763ae5p-926 }, + { 0x1.31b865207923bp-932, 0x1.026d30f31261ep-926 }, + { 0x1.90a81bef15367p-933, 0x1.52c63cbe5201dp-927 }, + { 0x1.068145905baddp-933, 0x1.bc0c903e2dd51p-928 }, + { 0x1.57f0081c7461bp-934, 0x1.22fbc7eb40c8ep-928 }, + { 0x1.c293abfeb81c1p-935, 0x1.7d5064d5d2e6ap-929 }, + { 0x1.271a9ed146425p-935, 0x1.f3a001a1da12ap-930 }, + { 0x1.8282015bfd093p-936, 0x1.474846e880b8p-930 }, + { 0x1.fa292d1f4b615p-937, 0x1.acb96019278e3p-931 }, + { 0x1.4b6323fa7fafcp-937, 0x1.18c50c637e437p-931 }, + { 0x1.b1ded81f6cf48p-938, 0x1.6fb47e7243b1p-932 }, + { 0x1.1bfd2aff12d23p-938, 0x1.e17fe4af1cdcdp-933 }, + { 0x1.73b9288cf980bp-939, 0x1.3b3779cd081bcp-933 }, + { 0x1.e680a6315c8f9p-940, 0x1.9caab20737c4bp-934 }, + { 0x1.3e52969a46a03p-940, 0x1.0e16c42489121p-934 }, + { 0x1.a082ea93d471fp-941, 0x1.618056ad2fa0dp-935 }, + { 0x1.1075d9566cab2p-941, 0x1.ce9e247afa7efp-936 }, + { 0x1.646a66f6fb197p-942, 0x1.2eabb9557e4c3p-936 }, + { 0x1.d22f0f82317a8p-943, 0x1.8c0020c90fd02p-937 }, + { 0x1.30d7883df3e07p-943, 0x1.0305d4157bdecp-937 }, + { 0x1.8ea1187daf8b3p-944, 0x1.52cf8a69cbdeep-938 }, + { 0x1.049a91d747c02p-944, 0x1.bb1f3a4ce848cp-939 }, + { 0x1.54b29ff375e83p-945, 0x1.21bd19407d3a8p-939 }, + { 0x1.bd5a7cbaf896dp-946, 0x1.7ad97206eb3e9p-940 }, + { 0x1.230b0dec754dap-946, 0x1.ef4e6059f1fe4p-941 }, + { 0x1.7c5a693980a4p-947, 0x1.43bdb9112e65bp-941 }, + { 0x1.f10221f87a1cap-948, 0x1.a7278c0b2c815p-942 }, + { 0x1.44ae6c097e3b8p-948, 0x1.148391a9b5b7p-942 }, + { 0x1.a8288818abb4p-949, 0x1.69563388e87eep-943 }, +}, +}; diff --git a/math/aarch64/v_erfcf_data.c b/math/aarch64/v_erfcf_data.c new file mode 100644 index 000000000000..9f992b4887fb --- /dev/null +++ b/math/aarch64/v_erfcf_data.c @@ -0,0 +1,664 @@ +/* + * Data used in single-precision erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Lookup table used in vector erfcf. 
+ For each possible rounded input r (multiples of 1/64), between + r = 0.0 and r = 10.0625 (645 values): + - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r), + - the second entry __v_erfcf_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore + they are scaled by a large enough value 2^47 (fits in 8 bits). */ +const struct v_erfcf_data __v_erfcf_data = { + .tab = { { 0x1p47, 0x1.20dd76p47 }, + { 0x1.f6f944p46, 0x1.20cb68p47 }, + { 0x1.edf3aap46, 0x1.209546p47 }, + { 0x1.e4f05p46, 0x1.203b26p47 }, + { 0x1.dbf056p46, 0x1.1fbd28p47 }, + { 0x1.d2f4dcp46, 0x1.1f1b7ap47 }, + { 0x1.c9fefep46, 0x1.1e565cp47 }, + { 0x1.c10fd4p46, 0x1.1d6e14p47 }, + { 0x1.b8287ap46, 0x1.1c62fap47 }, + { 0x1.af4ap46, 0x1.1b3572p47 }, + { 0x1.a6757ep46, 0x1.19e5eap47 }, + { 0x1.9dabfcp46, 0x1.1874dep47 }, + { 0x1.94ee88p46, 0x1.16e2d8p47 }, + { 0x1.8c3e24p46, 0x1.153068p47 }, + { 0x1.839bd6p46, 0x1.135e3p47 }, + { 0x1.7b0894p46, 0x1.116cd8p47 }, + { 0x1.728558p46, 0x1.0f5d16p47 }, + { 0x1.6a1312p46, 0x1.0d2fa6p47 }, + { 0x1.61b2acp46, 0x1.0ae55p47 }, + { 0x1.596508p46, 0x1.087ee4p47 }, + { 0x1.512b06p46, 0x1.05fd3ep47 }, + { 0x1.49057ap46, 0x1.03614p47 }, + { 0x1.40f536p46, 0x1.00abdp47 }, + { 0x1.38fbp46, 0x1.fbbbbep46 }, + { 0x1.311796p46, 0x1.f5f0cep46 }, + { 0x1.294bb4p46, 0x1.eff8c4p46 }, + { 0x1.21980ap46, 0x1.e9d5a8p46 }, + { 0x1.19fd3ep46, 0x1.e38988p46 }, + { 0x1.127bf2p46, 0x1.dd167cp46 }, + { 0x1.0b14bcp46, 0x1.d67ea2p46 }, + { 0x1.03c82ap46, 0x1.cfc41ep46 }, + { 0x1.f92d8cp45, 0x1.c8e91cp46 }, + { 0x1.eb0214p45, 0x1.c1efcap46 }, + { 0x1.dd0edap45, 0x1.bada5ap46 }, + { 0x1.cf54b4p45, 0x1.b3aafcp46 }, + { 0x1.c1d46ap45, 0x1.ac63e8p46 }, + { 0x1.b48eaep45, 0x1.a5074ep46 }, + { 0x1.a78428p45, 0x1.9d9762p46 }, + { 0x1.9ab566p45, 0x1.96165p46 }, + { 0x1.8e22eap45, 0x1.8e8646p46 }, + { 0x1.81cd24p45, 0x1.86e96ap46 }, + { 0x1.75b47p45, 0x1.7f41dcp46 }, + { 0x1.69d91ep45, 0x1.7791b8p46 }, + { 0x1.5e3b66p45, 0x1.6fdb12p46 }, + { 0x1.52db78p45, 0x1.681ff2p46 }, + { 0x1.47b96ep45, 0x1.60625cp46 }, + { 0x1.3cd554p45, 0x1.58a446p46 }, + { 0x1.322f26p45, 0x1.50e79ep46 }, + { 0x1.27c6d2p45, 0x1.492e42p46 }, + { 0x1.1d9c34p45, 0x1.417a0cp46 }, + { 0x1.13af1ep45, 0x1.39ccc2p46 }, + { 0x1.09ff5p45, 0x1.32281ep46 }, + { 0x1.008c8p45, 0x1.2a8dcep46 }, + { 0x1.eeaca8p44, 0x1.22ff72p46 }, + { 0x1.dcb8cap44, 0x1.1b7e98p46 }, + { 0x1.cb3c86p44, 0x1.140cc4p46 }, + { 0x1.ba36dap44, 0x1.0cab62p46 }, + { 0x1.a9a6bap44, 0x1.055bd6p46 }, + { 0x1.998afap44, 0x1.fc3ee6p45 }, + { 0x1.89e25ep44, 0x1.edeeeep45 }, + { 0x1.7aab98p44, 0x1.dfca26p45 }, + { 0x1.6be542p44, 0x1.d1d2dp45 }, + { 0x1.5d8decp44, 0x1.c40b08p45 }, + { 0x1.4fa40ep44, 0x1.b674c8p45 }, + { 0x1.422616p44, 0x1.a911fp45 }, + { 0x1.351262p44, 0x1.9be438p45 }, + { 0x1.28674p44, 0x1.8eed36p45 }, + { 0x1.1c22f8p44, 0x1.822e66p45 }, + { 0x1.1043c2p44, 0x1.75a91ap45 }, + { 0x1.04c7cap44, 0x1.695e8cp45 }, + { 0x1.f35a72p43, 0x1.5d4fd4p45 }, + { 0x1.dde456p43, 0x1.517de6p45 }, + { 0x1.c9296cp43, 0x1.45e99cp45 }, + { 0x1.b525d6p43, 0x1.3a93b2p45 }, + { 0x1.a1d5a6p43, 0x1.2f7cc4p45 }, + { 0x1.8f34eap43, 0x1.24a554p45 }, + { 0x1.7d3fa6p43, 0x1.1a0dc6p45 }, + { 0x1.6bf1dcp43, 0x1.0fb662p45 }, + { 0x1.5b4784p43, 0x1.059f5ap45 }, + { 0x1.4b3c98p43, 0x1.f79184p44 }, + { 0x1.3bcd14p43, 0x1.e4653p44 }, + { 0x1.2cf4eep43, 0x1.d1b982p44 }, + { 0x1.1eb024p43, 0x1.bf8e1cp44 }, + { 0x1.10fab8p43, 0x1.ade26cp44 }, + { 0x1.03d0acp43, 0x1.9cb5bep44 }, + { 0x1.ee5c18p42, 0x1.8c0732p44 }, + { 0x1.d61dd6p42, 0x1.7bd5c8p44 
}, + { 0x1.bedec8p42, 0x1.6c2056p44 }, + { 0x1.a8973cp42, 0x1.5ce596p44 }, + { 0x1.933f9p42, 0x1.4e241ep44 }, + { 0x1.7ed03ap42, 0x1.3fda6cp44 }, + { 0x1.6b41ccp42, 0x1.3206dcp44 }, + { 0x1.588cf2p42, 0x1.24a7b8p44 }, + { 0x1.46aa72p42, 0x1.17bb2cp44 }, + { 0x1.359332p42, 0x1.0b3f52p44 }, + { 0x1.254038p42, 0x1.fe646p43 }, + { 0x1.15aaa8p42, 0x1.e72372p43 }, + { 0x1.06cbcap42, 0x1.d0b7ap43 }, + { 0x1.f13a04p41, 0x1.bb1c98p43 }, + { 0x1.d62fbep41, 0x1.a64de6p43 }, + { 0x1.bc6c1ep41, 0x1.92470ap43 }, + { 0x1.a3e2ccp41, 0x1.7f036cp43 }, + { 0x1.8c87b8p41, 0x1.6c7e64p43 }, + { 0x1.764f2p41, 0x1.5ab342p43 }, + { 0x1.612d8ap41, 0x1.499d48p43 }, + { 0x1.4d17cap41, 0x1.3937b2p43 }, + { 0x1.3a03p41, 0x1.297dbap43 }, + { 0x1.27e498p41, 0x1.1a6a96p43 }, + { 0x1.16b24cp41, 0x1.0bf97ep43 }, + { 0x1.066222p41, 0x1.fc4b5ep42 }, + { 0x1.edd4d2p40, 0x1.e1d4dp42 }, + { 0x1.d08382p40, 0x1.c885ep42 }, + { 0x1.b4be2p40, 0x1.b0553p42 }, + { 0x1.9a7316p40, 0x1.99397ap42 }, + { 0x1.81915cp40, 0x1.83298ep42 }, + { 0x1.6a088p40, 0x1.6e1c58p42 }, + { 0x1.53c89ep40, 0x1.5a08e8p42 }, + { 0x1.3ec25ep40, 0x1.46e66cp42 }, + { 0x1.2ae6fap40, 0x1.34ac36p42 }, + { 0x1.18282ep40, 0x1.2351c2p42 }, + { 0x1.067844p40, 0x1.12ceb4p42 }, + { 0x1.eb940ep39, 0x1.031ad6p42 }, + { 0x1.cc2186p39, 0x1.e85c44p41 }, + { 0x1.ae808cp39, 0x1.cc018p41 }, + { 0x1.9299bp39, 0x1.b1160ap41 }, + { 0x1.785674p39, 0x1.978ae8p41 }, + { 0x1.5fa14ap39, 0x1.7f5188p41 }, + { 0x1.486586p39, 0x1.685bb6p41 }, + { 0x1.328f5ep39, 0x1.529b9ep41 }, + { 0x1.1e0be6p39, 0x1.3e03d8p41 }, + { 0x1.0ac8fcp39, 0x1.2a875cp41 }, + { 0x1.f16aaep38, 0x1.181984p41 }, + { 0x1.cf80d4p38, 0x1.06ae14p41 }, + { 0x1.afb4e2p38, 0x1.ec7262p40 }, + { 0x1.91e8bep38, 0x1.cd5ecap40 }, + { 0x1.75ffb4p38, 0x1.b00b38p40 }, + { 0x1.5bde72p38, 0x1.94624ep40 }, + { 0x1.436af4p38, 0x1.7a4f6ap40 }, + { 0x1.2c8c7ap38, 0x1.61beaep40 }, + { 0x1.172b7ap38, 0x1.4a9cf6p40 }, + { 0x1.033198p38, 0x1.34d7dcp40 }, + { 0x1.e11332p37, 0x1.205dacp40 }, + { 0x1.be3ebp37, 0x1.0d1d6ap40 }, + { 0x1.9dbf72p37, 0x1.f60d8ap39 }, + { 0x1.7f714p37, 0x1.d4143ap39 }, + { 0x1.6331cap37, 0x1.b430ecp39 }, + { 0x1.48e09cp37, 0x1.9646f4p39 }, + { 0x1.305ef8p37, 0x1.7a3adep39 }, + { 0x1.198fd6p37, 0x1.5ff276p39 }, + { 0x1.0457c6p37, 0x1.4754acp39 }, + { 0x1.e139bcp36, 0x1.30499cp39 }, + { 0x1.bc8d52p36, 0x1.1aba78p39 }, + { 0x1.9a7c3p36, 0x1.06918cp39 }, + { 0x1.7adadep36, 0x1.e77448p38 }, + { 0x1.5d806ap36, 0x1.c4412cp38 }, + { 0x1.424642p36, 0x1.a36454p38 }, + { 0x1.290826p36, 0x1.84ba3p38 }, + { 0x1.11a3f8p36, 0x1.6821p38 }, + { 0x1.f7f358p35, 0x1.4d78bcp38 }, + { 0x1.cfd652p35, 0x1.34a306p38 }, + { 0x1.aab85ap35, 0x1.1d8318p38 }, + { 0x1.88647p35, 0x1.07fdb4p38 }, + { 0x1.68a8e4p35, 0x1.e7f232p37 }, + { 0x1.4b5726p35, 0x1.c2b9dp37 }, + { 0x1.30439cp35, 0x1.a02436p37 }, + { 0x1.174578p35, 0x1.8005fp37 }, + { 0x1.003692p35, 0x1.6235fcp37 }, + { 0x1.d5e678p34, 0x1.468daep37 }, + { 0x1.aeb442p34, 0x1.2ce898p37 }, + { 0x1.8a9848p34, 0x1.15246ep37 }, + { 0x1.695876p34, 0x1.fe41cep36 }, + { 0x1.4abea2p34, 0x1.d57f52p36 }, + { 0x1.2e984ep34, 0x1.afc85ep36 }, + { 0x1.14b676p34, 0x1.8ce75ep36 }, + { 0x1.f9daap33, 0x1.6caa0ep36 }, + { 0x1.ce283ap33, 0x1.4ee142p36 }, + { 0x1.a609f8p33, 0x1.3360ccp36 }, + { 0x1.81396ap33, 0x1.19ff46p36 }, + { 0x1.5f7524p33, 0x1.0295fp36 }, + { 0x1.40806ep33, 0x1.da011p35 }, + { 0x1.2422eep33, 0x1.b23a5ap35 }, + { 0x1.0a286p33, 0x1.8d986ap35 }, + { 0x1.e4c0bp32, 0x1.6be022p35 }, + { 0x1.b93bf4p32, 0x1.4cda54p35 }, + { 0x1.916f7cp32, 0x1.30539p35 }, + { 0x1.6d0e7p32, 0x1.161be4p35 }, + { 
0x1.4bd1cp32, 0x1.fc0d56p34 }, + { 0x1.2d77bep32, 0x1.cfd4a6p34 }, + { 0x1.11c3bep32, 0x1.a74068p34 }, + { 0x1.f0fb86p31, 0x1.8208bcp34 }, + { 0x1.c2e43ep31, 0x1.5feadap34 }, + { 0x1.98e254p31, 0x1.40a8c2p34 }, + { 0x1.729df6p31, 0x1.2408eap34 }, + { 0x1.4fc63cp31, 0x1.09d5f8p34 }, + { 0x1.3010aap31, 0x1.e3bcf4p33 }, + { 0x1.1338b8p31, 0x1.b7e946p33 }, + { 0x1.f1fecp30, 0x1.8fdc1cp33 }, + { 0x1.c2556ap30, 0x1.6b4702p33 }, + { 0x1.970b06p30, 0x1.49e178p33 }, + { 0x1.6fbddep30, 0x1.2b6876p33 }, + { 0x1.4c144ep30, 0x1.0f9e1cp33 }, + { 0x1.2bbc1ep30, 0x1.ec929ap32 }, + { 0x1.0e69f2p30, 0x1.be6abcp32 }, + { 0x1.e7b188p29, 0x1.94637ep32 }, + { 0x1.b792bcp29, 0x1.6e2368p32 }, + { 0x1.8c03d2p29, 0x1.4b581cp32 }, + { 0x1.649b02p29, 0x1.2bb5ccp32 }, + { 0x1.40f794p29, 0x1.0ef6c4p32 }, + { 0x1.20c13p29, 0x1.e9b5e8p31 }, + { 0x1.03a72ap29, 0x1.ba4f04p31 }, + { 0x1.d2bfc6p28, 0x1.8f4cccp31 }, + { 0x1.a35068p28, 0x1.684c22p31 }, + { 0x1.7885cep28, 0x1.44f21ep31 }, + { 0x1.51f06ap28, 0x1.24eb72p31 }, + { 0x1.2f2aaap28, 0x1.07ebd2p31 }, + { 0x1.0fd816p28, 0x1.db5adp30 }, + { 0x1.e7493p27, 0x1.abe09ep30 }, + { 0x1.b48774p27, 0x1.80f43ap30 }, + { 0x1.86e006p27, 0x1.5a2aep30 }, + { 0x1.5dd4bp27, 0x1.37231p30 }, + { 0x1.38f2e8p27, 0x1.1783cep30 }, + { 0x1.17d2c6p27, 0x1.f5f7d8p29 }, + { 0x1.f42c18p26, 0x1.c282cep29 }, + { 0x1.beceb2p26, 0x1.94219cp29 }, + { 0x1.8ef2aap26, 0x1.6a5972p29 }, + { 0x1.640bf6p26, 0x1.44ba86p29 }, + { 0x1.3d9be6p26, 0x1.22df2ap29 }, + { 0x1.1b2fe4p26, 0x1.046aeap29 }, + { 0x1.f8c0c2p25, 0x1.d21398p28 }, + { 0x1.c19fa8p25, 0x1.a0df1p28 }, + { 0x1.90538cp25, 0x1.74adc8p28 }, + { 0x1.6443fep25, 0x1.4d0232p28 }, + { 0x1.3ce784p25, 0x1.296a7p28 }, + { 0x1.19c232p25, 0x1.097f62p28 }, + { 0x1.f4c8c4p24, 0x1.d9c736p27 }, + { 0x1.bcd30ep24, 0x1.a6852cp27 }, + { 0x1.8aee4cp24, 0x1.789fb8p27 }, + { 0x1.5e77b6p24, 0x1.4f8c96p27 }, + { 0x1.36dcf2p24, 0x1.2acee2p27 }, + { 0x1.139a7cp24, 0x1.09f5dp27 }, + { 0x1.e8747p23, 0x1.d9371ep26 }, + { 0x1.b0a44ap23, 0x1.a4c89ep26 }, + { 0x1.7f064ap23, 0x1.75fa8ep26 }, + { 0x1.52efep23, 0x1.4c37cp26 }, + { 0x1.2bc82ap23, 0x1.26f9ep26 }, + { 0x1.09064p23, 0x1.05c804p26 }, + { 0x1.d45f16p22, 0x1.d06ad6p25 }, + { 0x1.9dacb2p22, 0x1.9bc0ap25 }, + { 0x1.6d3126p22, 0x1.6ce1aap25 }, + { 0x1.423d14p22, 0x1.43302cp25 }, + { 0x1.1c33cep22, 0x1.1e1e86p25 }, + { 0x1.f512dep21, 0x1.fa5b5p24 }, + { 0x1.b9823cp21, 0x1.bfd756p24 }, + { 0x1.84d6fep21, 0x1.8be4f8p24 }, + { 0x1.564a92p21, 0x1.5dcd66p24 }, + { 0x1.2d2c0ap21, 0x1.34ecf8p24 }, + { 0x1.08ddd2p21, 0x1.10b148p24 }, + { 0x1.d1a75p20, 0x1.e12eep23 }, + { 0x1.99218cp20, 0x1.a854eap23 }, + { 0x1.674c6ap20, 0x1.7603bap23 }, + { 0x1.3b62b6p20, 0x1.4980ccp23 }, + { 0x1.14b54p20, 0x1.2225b2p23 }, + { 0x1.e55102p19, 0x1.febc1p22 }, + { 0x1.a964eep19, 0x1.c14b22p22 }, + { 0x1.74b17ap19, 0x1.8b0cfcp22 }, + { 0x1.465daap19, 0x1.5b2fe6p22 }, + { 0x1.1da944p19, 0x1.30f93cp22 }, + { 0x1.f3d41p18, 0x1.0bc30cp22 }, + { 0x1.b512a2p18, 0x1.d5f3a8p21 }, + { 0x1.7e03b2p18, 0x1.9c3518p21 }, + { 0x1.4dbb98p18, 0x1.6961b8p21 }, + { 0x1.236a1ap18, 0x1.3cab14p21 }, + { 0x1.fcae94p17, 0x1.155a0ap21 }, + { 0x1.bbc1ap17, 0x1.e5989p20 }, + { 0x1.82eedcp17, 0x1.a8e406p20 }, + { 0x1.5139a6p17, 0x1.7397c6p20 }, + { 0x1.25c354p17, 0x1.44d26ep20 }, + { 0x1.ff8f84p16, 0x1.1bcca4p20 }, + { 0x1.bd3474p16, 0x1.efac52p19 }, + { 0x1.834586p16, 0x1.b0a68ap19 }, + { 0x1.50b75cp16, 0x1.7974e8p19 }, + { 0x1.249ef2p16, 0x1.4924a8p19 }, + { 0x1.fc5b88p15, 0x1.1edfa4p19 }, + { 0x1.b95ceep15, 0x1.f3d218p18 }, + { 0x1.7f03bap15, 0x1.b334fap18 }, + { 0x1.4c389cp15, 
0x1.7ac2d8p18 }, + { 0x1.2006aep15, 0x1.4979acp18 }, + { 0x1.f32eap14, 0x1.1e767cp18 }, + { 0x1.b05cfep14, 0x1.f1e352p17 }, + { 0x1.764f46p14, 0x1.b0778cp17 }, + { 0x1.43e56cp14, 0x1.77756ep17 }, + { 0x1.18238p14, 0x1.45ce66p17 }, + { 0x1.e45a98p13, 0x1.1a95p17 }, + { 0x1.a284ccp13, 0x1.e9f2p16 }, + { 0x1.697596p13, 0x1.a887bep16 }, + { 0x1.3807acp13, 0x1.6fab64p16 }, + { 0x1.0d3b36p13, 0x1.3e44e4p16 }, + { 0x1.d0624p12, 0x1.135f28p16 }, + { 0x1.904e0cp12, 0x1.dc479ep15 }, + { 0x1.58e72ap12, 0x1.9baed4p15 }, + { 0x1.2906ccp12, 0x1.63ac6cp15 }, + { 0x1.ff58dap11, 0x1.33225ap15 }, + { 0x1.b7f1f4p11, 0x1.0916fp15 }, + { 0x1.7a551p11, 0x1.c960cp14 }, + { 0x1.453142p11, 0x1.8a6174p14 }, + { 0x1.1761f8p11, 0x1.53e4f8p14 }, + { 0x1.dfd296p10, 0x1.24caf2p14 }, + { 0x1.9bd5fp10, 0x1.f830cp13 }, + { 0x1.61501p10, 0x1.b1e5acp13 }, + { 0x1.2ef6p10, 0x1.7538c6p13 }, + { 0x1.03a918p10, 0x1.40dfd8p13 }, + { 0x1.bce26ap9, 0x1.13bc08p13 }, + { 0x1.7cef42p9, 0x1.d9a88p12 }, + { 0x1.46056p9, 0x1.96a0b4p12 }, + { 0x1.16e3cap9, 0x1.5ce9acp12 }, + { 0x1.dcea68p8, 0x1.2b3e54p12 }, + { 0x1.97945ap8, 0x1.0085p12 }, + { 0x1.5c2828p8, 0x1.b7937ep11 }, + { 0x1.29415p8, 0x1.7872dap11 }, + { 0x1.fb58fap7, 0x1.423acp11 }, + { 0x1.b0c1a8p7, 0x1.13af5p11 }, + { 0x1.70f474p7, 0x1.d77f0cp10 }, + { 0x1.3a68a8p7, 0x1.92ff34p10 }, + { 0x1.0bcc6p7, 0x1.5847eep10 }, + { 0x1.c7fa0cp6, 0x1.25f9eep10 }, + { 0x1.8401b6p6, 0x1.f5cc78p9 }, + { 0x1.4a029ap6, 0x1.ac0f6p9 }, + { 0x1.188c46p6, 0x1.6cfa9cp9 }, + { 0x1.dcc4fap5, 0x1.370ab8p9 }, + { 0x1.94ec06p5, 0x1.08f24p9 }, + { 0x1.57bc96p5, 0x1.c324c2p8 }, + { 0x1.23a81ap5, 0x1.7fe904p8 }, + { 0x1.eeb278p4, 0x1.46897ep8 }, + { 0x1.a35794p4, 0x1.159a38p8 }, + { 0x1.634b8p4, 0x1.d7c594p7 }, + { 0x1.2ce2a4p4, 0x1.90ae4ep7 }, + { 0x1.fd5f08p3, 0x1.5422fp7 }, + { 0x1.aef3cep3, 0x1.20998p7 }, + { 0x1.6c6e62p3, 0x1.e98102p6 }, + { 0x1.3407b6p3, 0x1.9eee06p6 }, + { 0x1.043bap3, 0x1.5f8b88p6 }, + { 0x1.b77e5cp2, 0x1.29b294p6 }, + { 0x1.72f0c4p2, 0x1.f7f338p5 }, + { 0x1.38ee18p2, 0x1.aa5772p5 }, + { 0x1.07dd68p2, 0x1.68823ep5 }, + { 0x1.bcc58ep1, 0x1.30b14ep5 }, + { 0x1.76aca4p1, 0x1.01647cp5 }, + { 0x1.3b7912p1, 0x1.b2a87ep4 }, + { 0x1.097f82p1, 0x1.6ed2f2p4 }, + { 0x1.beaa3ep0, 0x1.356cd6p4 }, + { 0x1.778be2p0, 0x1.04e15ep4 }, + { 0x1.3b9984p0, 0x1.b7b04p3 }, + { 0x1.09182cp0, 0x1.725862p3 }, + { 0x1.bd20fcp-1, 0x1.37c92cp3 }, + { 0x1.75892p-1, 0x1.065b96p3 }, + { 0x1.394e7ap-1, 0x1.b950d4p2 }, + { 0x1.06a996p-1, 0x1.72fd94p2 }, + { 0x1.b8328ep-2, 0x1.37b83cp2 }, + { 0x1.70aff4p-2, 0x1.05ca5p2 }, + { 0x1.34a53cp-2, 0x1.b7807ep1 }, + { 0x1.0241dep-2, 0x1.70bebp1 }, + { 0x1.affb9p-3, 0x1.353a6cp1 }, + { 0x1.691c7cp-3, 0x1.0330fp1 }, + { 0x1.2db8cap-3, 0x1.b24a16p0 }, + { 0x1.f7f4f8p-4, 0x1.6ba91ap0 }, + { 0x1.a4ab64p-4, 0x1.305e98p0 }, + { 0x1.5efa4ep-4, 0x1.fd3de2p-1 }, + { 0x1.24b0d8p-4, 0x1.a9cc94p-1 }, + { 0x1.e7eeap-5, 0x1.63daf8p-1 }, + { 0x1.96826ep-5, 0x1.294176p-1 }, + { 0x1.5282d2p-5, 0x1.f05e82p-2 }, + { 0x1.19c05p-5, 0x1.9e39dcp-2 }, + { 0x1.d4ca9cp-6, 0x1.5982p-2 }, + { 0x1.85cfacp-6, 0x1.200c8ap-2 }, + { 0x1.43fb32p-6, 0x1.e00e92p-3 }, + { 0x1.0d2382p-6, 0x1.8fd4ep-3 }, + { 0x1.bef1b2p-7, 0x1.4cd9cp-3 }, + { 0x1.72ede4p-7, 0x1.14f48ap-3 }, + { 0x1.33b1cap-7, 0x1.ccaaeap-4 }, + { 0x1.fe3bdp-8, 0x1.7eef14p-4 }, + { 0x1.a6d7d2p-8, 0x1.3e2964p-4 }, + { 0x1.5e4062p-8, 0x1.083768p-4 }, + { 0x1.21fb7ap-8, 0x1.b69f1p-5 }, + { 0x1.dfefbep-9, 0x1.6be574p-5 }, + { 0x1.8cf816p-9, 0x1.2dc11ap-5 }, + { 0x1.482fa8p-9, 0x1.f4343cp-6 }, + { 0x1.0f30c4p-9, 0x1.9e614ep-6 }, + { 0x1.bff86ep-10, 
0x1.571d34p-6 }, + { 0x1.71d0b6p-10, 0x1.1bf742p-6 }, + { 0x1.3125f6p-10, 0x1.d5cc6cp-7 }, + { 0x1.f755eap-11, 0x1.846e9ep-7 }, + { 0x1.9eebaap-11, 0x1.410048p-7 }, + { 0x1.55df18p-11, 0x1.09258p-7 }, + { 0x1.198c18p-11, 0x1.b5ceb6p-8 }, + { 0x1.cf82ep-12, 0x1.69468p-8 }, + { 0x1.7d5af6p-12, 0x1.29f9e8p-8 }, + { 0x1.399c28p-12, 0x1.eb4b9ep-9 }, + { 0x1.01c65ap-12, 0x1.94d1dep-9 }, + { 0x1.a78e82p-13, 0x1.4d6706p-9 }, + { 0x1.5bcf92p-13, 0x1.127346p-9 }, + { 0x1.1d791cp-13, 0x1.c39fap-10 }, + { 0x1.d463dcp-14, 0x1.73679cp-10 }, + { 0x1.8011fcp-14, 0x1.314916p-10 }, + { 0x1.3ac71cp-14, 0x1.f5a11ap-11 }, + { 0x1.01dcc2p-14, 0x1.9beca8p-11 }, + { 0x1.a6459cp-15, 0x1.52189ap-11 }, + { 0x1.59962ap-15, 0x1.155d48p-11 }, + { 0x1.1ab0e4p-15, 0x1.c6dc8ap-12 }, + { 0x1.ce42dep-16, 0x1.74ca88p-12 }, + { 0x1.79c43p-16, 0x1.31612ap-12 }, + { 0x1.349128p-16, 0x1.f4125ap-13 }, + { 0x1.f7d80ep-17, 0x1.993e82p-13 }, + { 0x1.9b270cp-17, 0x1.4ec006p-13 }, + { 0x1.4f59fap-17, 0x1.11aebp-13 }, + { 0x1.1164acp-17, 0x1.bf4ab2p-14 }, + { 0x1.bd8c96p-18, 0x1.6d561ep-14 }, + { 0x1.6ae172p-18, 0x1.2a406ep-14 }, + { 0x1.276874p-18, 0x1.e6bba6p-15 }, + { 0x1.e0bad2p-19, 0x1.8cf814p-15 }, + { 0x1.86f788p-19, 0x1.4399f8p-15 }, + { 0x1.3dcfaep-19, 0x1.07aa3p-15 }, + { 0x1.023828p-19, 0x1.ad7302p-16 }, + { 0x1.a3666ep-20, 0x1.5d90f4p-16 }, + { 0x1.546e38p-20, 0x1.1c674ep-16 }, + { 0x1.143264p-20, 0x1.ce8ccp-17 }, + { 0x1.bff316p-21, 0x1.77f562p-17 }, + { 0x1.6b13ecp-21, 0x1.316da8p-17 }, + { 0x1.2624f4p-21, 0x1.f0046p-18 }, + { 0x1.dc5de4p-22, 0x1.92920ap-18 }, + { 0x1.818d3ap-22, 0x1.4691b2p-18 }, + { 0x1.37e62p-22, 0x1.08c96ap-18 }, + { 0x1.f8637ep-23, 0x1.ad2d0ap-19 }, + { 0x1.97a3dcp-23, 0x1.5ba462p-19 }, + { 0x1.494a4p-23, 0x1.1975ep-19 }, + { 0x1.09dee4p-23, 0x1.c78892p-20 }, + { 0x1.ad1fap-24, 0x1.7073c4p-20 }, + { 0x1.5a245ep-24, 0x1.29df48p-20 }, + { 0x1.171278p-24, 0x1.e163bep-21 }, + { 0x1.c1c74cp-25, 0x1.84cbbp-21 }, + { 0x1.6a46f4p-25, 0x1.39dbcep-21 }, + { 0x1.23a858p-25, 0x1.fa7b92p-22 }, + { 0x1.d56196p-26, 0x1.9876ap-22 }, + { 0x1.7984b6p-26, 0x1.4940bcp-22 }, + { 0x1.2f7cc4p-26, 0x1.094608p-22 }, + { 0x1.e7b62cp-27, 0x1.ab3e8cp-23 }, + { 0x1.87b15ep-27, 0x1.57e33ep-23 }, + { 0x1.3a6dp-27, 0x1.14a8b6p-23 }, + { 0x1.f88ebap-28, 0x1.bcede6p-24 }, + { 0x1.94a282p-28, 0x1.659918p-24 }, + { 0x1.44580ap-28, 0x1.1f4498p-24 }, + { 0x1.03dbf8p-28, 0x1.cd5086p-25 }, + { 0x1.a03066p-29, 0x1.723974p-25 }, + { 0x1.4d1f2ep-29, 0x1.28f9cap-25 }, + { 0x1.0a814ap-29, 0x1.dc34b6p-26 }, + { 0x1.aa36cap-30, 0x1.7d9dbp-26 }, + { 0x1.54a6b6p-30, 0x1.31aa56p-26 }, + { 0x1.102232p-30, 0x1.e96c26p-27 }, + { 0x1.b2959ep-31, 0x1.87a218p-27 }, + { 0x1.5ad66cp-31, 0x1.393ad2p-27 }, + { 0x1.14ac7ep-31, 0x1.f4ccdap-28 }, + { 0x1.b931b8p-32, 0x1.9026a8p-28 }, + { 0x1.5f9a24p-32, 0x1.3f92eap-28 }, + { 0x1.181154p-32, 0x1.fe3208p-29 }, + { 0x1.bdf55ep-33, 0x1.970fbp-29 }, + { 0x1.62e226p-33, 0x1.449de6p-29 }, + { 0x1.1a4576p-33, 0x1.02be7p-29 }, + { 0x1.c0d0bep-34, 0x1.9c4672p-30 }, + { 0x1.64a386p-34, 0x1.484b1ep-30 }, + { 0x1.1b418cp-34, 0x1.054a9ap-30 }, + { 0x1.c1ba4ap-35, 0x1.9fb994p-31 }, + { 0x1.64d86p-35, 0x1.4a8e4ep-31 }, + { 0x1.1b0242p-35, 0x1.06b4fep-31 }, + { 0x1.c0aee6p-36, 0x1.a15d86p-32 }, + { 0x1.637ffap-36, 0x1.4b5fdep-32 }, + { 0x1.198862p-36, 0x1.06f8dap-32 }, + { 0x1.bdb204p-37, 0x1.a12cc8p-33 }, + { 0x1.609ec2p-37, 0x1.4abd0ap-33 }, + { 0x1.16d8d2p-37, 0x1.06154ap-33 }, + { 0x1.b8cd88p-38, 0x1.9f27fap-34 }, + { 0x1.5c3e42p-38, 0x1.48a7fcp-34 }, + { 0x1.12fc6cp-38, 0x1.040d4ap-34 }, + { 0x1.b2119p-39, 0x1.9b55e8p-35 }, 
+ { 0x1.566cep-39, 0x1.4527acp-35 }, + { 0x1.0dffep-39, 0x1.00e7acp-35 }, + { 0x1.a99426p-40, 0x1.95c358p-36 }, + { 0x1.4f3d92p-40, 0x1.4047cep-36 }, + { 0x1.07f35ep-40, 0x1.f95dcep-37 }, + { 0x1.9f70cp-41, 0x1.8e82cep-37 }, + { 0x1.46c77ap-41, 0x1.3a1882p-37 }, + { 0x1.00ea48p-41, 0x1.eee1d4p-38 }, + { 0x1.93c7acp-42, 0x1.85ac18p-38 }, + { 0x1.3d256ap-42, 0x1.32ae04p-38 }, + { 0x1.f1f59p-43, 0x1.e27d88p-39 }, + { 0x1.86bd6ap-43, 0x1.7b5bdap-39 }, + { 0x1.327554p-43, 0x1.2a2036p-39 }, + { 0x1.e07ab4p-44, 0x1.d458ap-40 }, + { 0x1.7879ecp-44, 0x1.6fb2eap-40 }, + { 0x1.26d7bp-44, 0x1.208a2cp-40 }, + { 0x1.cd98a2p-45, 0x1.c49f8ap-41 }, + { 0x1.6927c2p-45, 0x1.62d5aap-41 }, + { 0x1.1a6ed6p-45, 0x1.16098ep-41 }, + { 0x1.b986acp-46, 0x1.b3828ep-42 }, + { 0x1.58f35ap-46, 0x1.54eb3ep-42 }, + { 0x1.0d5e6p-46, 0x1.0abe0ep-42 }, + { 0x1.a47db6p-47, 0x1.a134d4p-43 }, + { 0x1.480a18p-47, 0x1.461cdap-43 }, + { 0x1.ff94e4p-48, 0x1.fd9182p-44 }, + { 0x1.8eb738p-48, 0x1.8deb62p-44 }, + { 0x1.369994p-48, 0x1.3694e8p-44 }, + { 0x1.e3ae4ap-49, 0x1.e49706p-45 }, + { 0x1.786c3ep-49, 0x1.79dc28p-45 }, + { 0x1.24cec8p-49, 0x1.267e46p-45 }, + { 0x1.c74fc4p-50, 0x1.cad0bp-46 }, + { 0x1.61d46cp-50, 0x1.653d08p-46 }, + { 0x1.12d55cp-50, 0x1.16038cp-46 }, + { 0x1.aabdacp-51, 0x1.b081aap-47 }, + { 0x1.4b252ep-51, 0x1.5042e2p-47 }, + { 0x1.00d6f8p-51, 0x1.054e44p-47 }, + { 0x1.8e38ep-52, 0x1.95eb2cp-48 }, + { 0x1.3490e8p-52, 0x1.3b20c6p-48 }, + { 0x1.ddf56ap-53, 0x1.e90cb6p-49 }, + { 0x1.71fdep-53, 0x1.7b4b76p-49 }, + { 0x1.1e465ap-53, 0x1.26072ap-49 }, + { 0x1.bac92ep-54, 0x1.c7a2ecp-50 }, + { 0x1.56441cp-54, 0x1.60dcfp-50 }, + { 0x1.08700cp-54, 0x1.112346p-50 }, + { 0x1.986a66p-55, 0x1.a6a50ap-51 }, + { 0x1.3b3d56p-55, 0x1.46d572p-51 }, + { 0x1.e667dap-56, 0x1.f93d0ep-52 }, + { 0x1.7712b8p-56, 0x1.86529ep-52 }, + { 0x1.211544p-56, 0x1.2d65aep-52 }, + { 0x1.bd660ap-57, 0x1.d13c32p-53 }, + { 0x1.56f3eep-57, 0x1.66e45ap-53 }, + { 0x1.07f14ap-57, 0x1.14b8b6p-53 }, + { 0x1.96129cp-58, 0x1.aa854cp-54 }, + { 0x1.3837cp-58, 0x1.488b94p-54 }, + { 0x1.dfe0c2p-59, 0x1.f9e772p-55 }, + { 0x1.709b5ap-59, 0x1.85503p-55 }, + { 0x1.1affd2p-59, 0x1.2b7218p-55 }, + { 0x1.b2564p-60, 0x1.cc6bb6p-56 }, + { 0x1.4d23fap-60, 0x1.61cb1ap-56 }, + { 0x1.fecbdp-61, 0x1.0fba0ep-56 }, + { 0x1.8767d8p-61, 0x1.a13072p-57 }, + { 0x1.2bc67ep-61, 0x1.401abcp-57 }, + { 0x1.caf846p-62, 0x1.eafc2cp-58 }, + { 0x1.5f2e7ap-62, 0x1.785cp-58 }, + { 0x1.0c93acp-62, 0x1.205a7ep-58 }, + { 0x1.9a9b06p-63, 0x1.b9a31ap-59 }, + { 0x1.39b7fcp-63, 0x1.520968p-59 }, + { 0x1.df277ap-64, 0x1.029ce6p-59 }, + { 0x1.6dbcdp-64, 0x1.8b81d6p-60 }, + { 0x1.17080ap-64, 0x1.2e48f2p-60 }, + { 0x1.a98e26p-65, 0x1.cdd86cp-61 }, + { 0x1.445a6ap-65, 0x1.60a47ap-61 }, + { 0x1.ee324ep-66, 0x1.0d210cp-61 }, + { 0x1.784e3p-66, 0x1.9a961ep-62 }, + { 0x1.1e65fep-66, 0x1.390b74p-62 }, + { 0x1.b3bb86p-67, 0x1.dd1e52p-63 }, + { 0x1.4b4e36p-67, 0x1.6b6a7ap-63 }, + { 0x1.f790f6p-68, 0x1.14acc2p-63 }, + { 0x1.7e82cep-68, 0x1.a511aap-64 }, + { 0x1.226a7ap-68, 0x1.404114p-64 }, + { 0x1.b8c634p-69, 0x1.e6ea96p-65 }, + { 0x1.4e53acp-69, 0x1.71f97ap-65 }, + { 0x1.faed5cp-70, 0x1.18fb2ep-65 }, + { 0x1.80217ep-70, 0x1.aa947ep-66 }, + { 0x1.22f066p-70, 0x1.43a796p-66 }, + { 0x1.b87f86p-71, 0x1.eae2fp-67 }, + { 0x1.4d4ec8p-71, 0x1.7414e6p-67 }, + { 0x1.f8283ep-72, 0x1.19e474p-67 }, + { 0x1.7d1b22p-72, 0x1.aaeb7ep-68 }, + { 0x1.1ff2dp-72, 0x1.431f66p-68 }, + { 0x1.b2e9e8p-73, 0x1.e8e272p-69 }, + { 0x1.4848dep-73, 0x1.71a91ep-69 }, + { 0x1.ef5b16p-74, 0x1.176014p-69 }, + { 0x1.758b92p-74, 0x1.a6137cp-70 }, + { 
0x1.198d42p-74, 0x1.3ead74p-70 }, + { 0x1.a838bp-75, 0x1.e0fbc2p-71 }, + { 0x1.3f700cp-75, 0x1.6accaep-71 }, + { 0x1.e0d68ep-76, 0x1.118578p-71 }, + { 0x1.69b7f4p-76, 0x1.9c3974p-72 }, + { 0x1.0ffa12p-76, 0x1.367afap-72 }, + { 0x1.98cd1cp-77, 0x1.d377fap-73 }, + { 0x1.33148p-77, 0x1.5fbee6p-73 }, + { 0x1.cd1dbap-78, 0x1.088a8p-73 }, + { 0x1.5a0a9cp-78, 0x1.8db7ccp-74 }, + { 0x1.038ef4p-78, 0x1.2ad2ecp-74 }, + { 0x1.85308ap-79, 0x1.c0d23ep-75 }, + { 0x1.23a3cp-79, 0x1.50e41ap-75 }, + { 0x1.b4de68p-80, 0x1.f980a8p-76 }, + { 0x1.470ce4p-80, 0x1.7b10fep-76 }, + { 0x1.e9700cp-81, 0x1.1c1d98p-76 }, + { 0x1.6e0c9p-81, 0x1.a9b08p-77 }, + { 0x1.11a25ap-81, 0x1.3ebfb4p-77 }, + { 0x1.98e73ap-82, 0x1.dd1d36p-78 }, + { 0x1.315f58p-82, 0x1.64e7fp-78 }, + { 0x1.c7e35cp-83, 0x1.0ada94p-78 }, + { 0x1.542176p-83, 0x1.8ed9e8p-79 }, + { 0x1.fb491ep-84, 0x1.29ecb2p-79 }, + { 0x1.7a1c34p-84, 0x1.bcdb34p-80 }, + { 0x1.19b0f2p-84, 0x1.4bf6cap-80 }, + { 0x1.a383cap-85, 0x1.ef3318p-81 }, + { 0x1.383bf2p-85, 0x1.712bc2p-81 }, + { 0x1.d08cdap-86, 0x1.13151p-81 }, + { 0x1.596adp-86, 0x1.99bf36p-82 }, + { 0x1.00b602p-86, 0x1.3104d6p-82 }, + { 0x1.7d62a2p-87, 0x1.c5e534p-83 }, + { 0x1.1b2abcp-87, 0x1.518db2p-83 }, + { 0x1.a4480ep-88, 0x1.f5d1c6p-84 }, + { 0x1.37be42p-88, 0x1.74d45ap-84 }, + { 0x1.ce3ee4p-89, 0x1.14dc4ap-84 }, + { 0x1.568986p-89, 0x1.9afd0ep-85 }, + { 0x1.fb69c6p-90, 0x1.30e632p-85 }, + { 0x1.77a47ep-90, 0x1.c42b48p-86 }, + { 0x1.15f4ep-90, 0x1.4f1f52p-86 }, + { 0x1.9b25dcp-91, 0x1.f08156p-87 }, + { 0x1.2feeeep-91, 0x1.6f9f62p-87 }, + { 0x1.c122bcp-92, 0x1.100ffap-87 }, + { 0x1.4bb154p-92, 0x1.927ce6p-88 }, + { 0x1.e9ae56p-93, 0x1.2992f4p-88 }, + { 0x1.6948e8p-93, 0x1.b7cccap-89 }, + { 0x1.0a6cd2p-93, 0x1.44d7c4p-89 }, + { 0x1.88c0cap-94, 0x1.dfa22p-90 }, + { 0x1.215988p-94, 0x1.61eb26p-90 }, + { 0x1.aa222ap-95, 0x1.0506e2p-90 }, + { 0x1.39a30ep-95, 0x1.80d828p-91 }, + { 0x1.cd740ep-96, 0x1.1b8f04p-91 }, + { 0x1.534d82p-96, 0x1.a1a7ecp-92 }, + { 0x1.f2bb06p-97, 0x1.336f3p-92 }, + { 0x1.6e5b34p-97, 0x1.c46172p-93 }, + { 0x1.0cfc82p-97, 0x1.4cab82p-93 }, + { 0x1.8acc82p-98, 0x1.e9094cp-94 }, + { 0x1.219686p-98, 0x1.67465p-94 }, + { 0x1.a89fa6p-99, 0x1.07d0b8p-94 }, + { 0x1.372982p-99, 0x1.833ffap-95 }, + { 0x1.c7d094p-100, 0x1.1c147ap-95 }, + { 0x1.4db1c8p-100, 0x1.a096ccp-96 }, + { 0x1.e858d8p-101, 0x1.314decp-96 }, + { 0x1.6529ep-101, 0x1.bf46cep-97 }, + { 0x1.0517bap-101, 0x1.47796ap-97 }, + { 0x1.7d8a8p-102, 0x1.df49a2p-98 }, + { 0x1.16a46p-102, 0x1.5e9198p-98 }, + { 0x1.96ca76p-103, 0x1.004b34p-98 }, + { 0x1.28cb2cp-103, 0x1.768f3ep-99 }, + { 0x1.b0de98p-104, 0x1.1190d2p-99 }, + }, + }; diff --git a/math/aarch64/v_erff_data.c b/math/aarch64/v_erff_data.c new file mode 100644 index 000000000000..8d11d8b6c10b --- /dev/null +++ b/math/aarch64/v_erff_data.c @@ -0,0 +1,532 @@ +/* + * Data for approximation of erff. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Lookup table used in vector erff. + For each possible rounded input r (multiples of 1/128), between + r = 0.0 and r = 4.0 (513 values): + - the first entry __v_erff_data.tab.erf contains the values of erf(r), + - the second entry __v_erff_data.tab.scale contains the values of + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the + algorithm, since lookup is performed only for x >= 1/64-1/512. 
*/ +const struct v_erff_data __v_erff_data = { + .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 }, + { 0x1.20dbf4p-7, 0x1.20d8f2p+0 }, + { 0x1.20d770p-6, 0x1.20cb68p+0 }, + { 0x1.b137e0p-6, 0x1.20b4d8p+0 }, + { 0x1.20c564p-5, 0x1.209546p+0 }, + { 0x1.68e5d4p-5, 0x1.206cb4p+0 }, + { 0x1.b0fafep-5, 0x1.203b26p+0 }, + { 0x1.f902a8p-5, 0x1.2000a0p+0 }, + { 0x1.207d48p-4, 0x1.1fbd28p+0 }, + { 0x1.44703ep-4, 0x1.1f70c4p+0 }, + { 0x1.68591ap-4, 0x1.1f1b7ap+0 }, + { 0x1.8c36bep-4, 0x1.1ebd56p+0 }, + { 0x1.b00812p-4, 0x1.1e565cp+0 }, + { 0x1.d3cbf8p-4, 0x1.1de698p+0 }, + { 0x1.f7815ap-4, 0x1.1d6e14p+0 }, + { 0x1.0d9390p-3, 0x1.1cecdcp+0 }, + { 0x1.1f5e1ap-3, 0x1.1c62fap+0 }, + { 0x1.311fc2p-3, 0x1.1bd07cp+0 }, + { 0x1.42d7fcp-3, 0x1.1b3572p+0 }, + { 0x1.548642p-3, 0x1.1a91e6p+0 }, + { 0x1.662a0cp-3, 0x1.19e5eap+0 }, + { 0x1.77c2d2p-3, 0x1.19318cp+0 }, + { 0x1.895010p-3, 0x1.1874dep+0 }, + { 0x1.9ad142p-3, 0x1.17aff0p+0 }, + { 0x1.ac45e4p-3, 0x1.16e2d8p+0 }, + { 0x1.bdad72p-3, 0x1.160da4p+0 }, + { 0x1.cf076ep-3, 0x1.153068p+0 }, + { 0x1.e05354p-3, 0x1.144b3cp+0 }, + { 0x1.f190aap-3, 0x1.135e30p+0 }, + { 0x1.015f78p-2, 0x1.12695ep+0 }, + { 0x1.09eed6p-2, 0x1.116cd8p+0 }, + { 0x1.127632p-2, 0x1.1068bap+0 }, + { 0x1.1af54ep-2, 0x1.0f5d16p+0 }, + { 0x1.236bf0p-2, 0x1.0e4a08p+0 }, + { 0x1.2bd9dcp-2, 0x1.0d2fa6p+0 }, + { 0x1.343ed6p-2, 0x1.0c0e0ap+0 }, + { 0x1.3c9aa8p-2, 0x1.0ae550p+0 }, + { 0x1.44ed18p-2, 0x1.09b590p+0 }, + { 0x1.4d35f0p-2, 0x1.087ee4p+0 }, + { 0x1.5574f4p-2, 0x1.07416cp+0 }, + { 0x1.5da9f4p-2, 0x1.05fd3ep+0 }, + { 0x1.65d4b8p-2, 0x1.04b27cp+0 }, + { 0x1.6df50ap-2, 0x1.036140p+0 }, + { 0x1.760abap-2, 0x1.0209a6p+0 }, + { 0x1.7e1594p-2, 0x1.00abd0p+0 }, + { 0x1.861566p-2, 0x1.fe8fb0p-1 }, + { 0x1.8e0a02p-2, 0x1.fbbbbep-1 }, + { 0x1.95f336p-2, 0x1.f8dc0ap-1 }, + { 0x1.9dd0d2p-2, 0x1.f5f0cep-1 }, + { 0x1.a5a2acp-2, 0x1.f2fa4cp-1 }, + { 0x1.ad6896p-2, 0x1.eff8c4p-1 }, + { 0x1.b52264p-2, 0x1.ecec78p-1 }, + { 0x1.bccfecp-2, 0x1.e9d5a8p-1 }, + { 0x1.c47104p-2, 0x1.e6b498p-1 }, + { 0x1.cc0584p-2, 0x1.e38988p-1 }, + { 0x1.d38d44p-2, 0x1.e054bep-1 }, + { 0x1.db081cp-2, 0x1.dd167cp-1 }, + { 0x1.e275eap-2, 0x1.d9cf06p-1 }, + { 0x1.e9d68ap-2, 0x1.d67ea2p-1 }, + { 0x1.f129d4p-2, 0x1.d32592p-1 }, + { 0x1.f86faap-2, 0x1.cfc41ep-1 }, + { 0x1.ffa7eap-2, 0x1.cc5a8ap-1 }, + { 0x1.03693ap-1, 0x1.c8e91cp-1 }, + { 0x1.06f794p-1, 0x1.c5701ap-1 }, + { 0x1.0a7ef6p-1, 0x1.c1efcap-1 }, + { 0x1.0dff50p-1, 0x1.be6872p-1 }, + { 0x1.117894p-1, 0x1.bada5ap-1 }, + { 0x1.14eab4p-1, 0x1.b745c6p-1 }, + { 0x1.1855a6p-1, 0x1.b3aafcp-1 }, + { 0x1.1bb95cp-1, 0x1.b00a46p-1 }, + { 0x1.1f15ccp-1, 0x1.ac63e8p-1 }, + { 0x1.226ae8p-1, 0x1.a8b828p-1 }, + { 0x1.25b8a8p-1, 0x1.a5074ep-1 }, + { 0x1.28ff02p-1, 0x1.a1519ep-1 }, + { 0x1.2c3decp-1, 0x1.9d9762p-1 }, + { 0x1.2f755cp-1, 0x1.99d8dap-1 }, + { 0x1.32a54cp-1, 0x1.961650p-1 }, + { 0x1.35cdb4p-1, 0x1.925008p-1 }, + { 0x1.38ee8ap-1, 0x1.8e8646p-1 }, + { 0x1.3c07cap-1, 0x1.8ab950p-1 }, + { 0x1.3f196ep-1, 0x1.86e96ap-1 }, + { 0x1.42236ep-1, 0x1.8316d6p-1 }, + { 0x1.4525c8p-1, 0x1.7f41dcp-1 }, + { 0x1.482074p-1, 0x1.7b6abcp-1 }, + { 0x1.4b1372p-1, 0x1.7791b8p-1 }, + { 0x1.4dfebap-1, 0x1.73b714p-1 }, + { 0x1.50e24cp-1, 0x1.6fdb12p-1 }, + { 0x1.53be26p-1, 0x1.6bfdf0p-1 }, + { 0x1.569244p-1, 0x1.681ff2p-1 }, + { 0x1.595ea6p-1, 0x1.644156p-1 }, + { 0x1.5c2348p-1, 0x1.60625cp-1 }, + { 0x1.5ee02ep-1, 0x1.5c8342p-1 }, + { 0x1.619556p-1, 0x1.58a446p-1 }, + { 0x1.6442c0p-1, 0x1.54c5a6p-1 }, + { 0x1.66e86ep-1, 0x1.50e79ep-1 }, + { 0x1.69865ep-1, 0x1.4d0a68p-1 }, + { 0x1.6c1c98p-1, 0x1.492e42p-1 }, + { 
0x1.6eab18p-1, 0x1.455366p-1 }, + { 0x1.7131e6p-1, 0x1.417a0cp-1 }, + { 0x1.73b102p-1, 0x1.3da26ep-1 }, + { 0x1.762870p-1, 0x1.39ccc2p-1 }, + { 0x1.789836p-1, 0x1.35f940p-1 }, + { 0x1.7b0058p-1, 0x1.32281ep-1 }, + { 0x1.7d60d8p-1, 0x1.2e5992p-1 }, + { 0x1.7fb9c0p-1, 0x1.2a8dcep-1 }, + { 0x1.820b12p-1, 0x1.26c508p-1 }, + { 0x1.8454d6p-1, 0x1.22ff72p-1 }, + { 0x1.869712p-1, 0x1.1f3d3cp-1 }, + { 0x1.88d1cep-1, 0x1.1b7e98p-1 }, + { 0x1.8b050ep-1, 0x1.17c3b6p-1 }, + { 0x1.8d30dep-1, 0x1.140cc4p-1 }, + { 0x1.8f5544p-1, 0x1.1059eep-1 }, + { 0x1.91724ap-1, 0x1.0cab62p-1 }, + { 0x1.9387f6p-1, 0x1.09014cp-1 }, + { 0x1.959652p-1, 0x1.055bd6p-1 }, + { 0x1.979d68p-1, 0x1.01bb2cp-1 }, + { 0x1.999d42p-1, 0x1.fc3ee6p-2 }, + { 0x1.9b95e8p-1, 0x1.f511aap-2 }, + { 0x1.9d8768p-1, 0x1.edeeeep-2 }, + { 0x1.9f71cap-1, 0x1.e6d700p-2 }, + { 0x1.a1551ap-1, 0x1.dfca26p-2 }, + { 0x1.a33162p-1, 0x1.d8c8aap-2 }, + { 0x1.a506b0p-1, 0x1.d1d2d0p-2 }, + { 0x1.a6d50cp-1, 0x1.cae8dap-2 }, + { 0x1.a89c86p-1, 0x1.c40b08p-2 }, + { 0x1.aa5d26p-1, 0x1.bd3998p-2 }, + { 0x1.ac16fcp-1, 0x1.b674c8p-2 }, + { 0x1.adca14p-1, 0x1.afbcd4p-2 }, + { 0x1.af767ap-1, 0x1.a911f0p-2 }, + { 0x1.b11c3cp-1, 0x1.a27456p-2 }, + { 0x1.b2bb68p-1, 0x1.9be438p-2 }, + { 0x1.b4540ap-1, 0x1.9561c8p-2 }, + { 0x1.b5e630p-1, 0x1.8eed36p-2 }, + { 0x1.b771e8p-1, 0x1.8886b2p-2 }, + { 0x1.b8f742p-1, 0x1.822e66p-2 }, + { 0x1.ba764ap-1, 0x1.7be47ap-2 }, + { 0x1.bbef10p-1, 0x1.75a91ap-2 }, + { 0x1.bd61a2p-1, 0x1.6f7c6ap-2 }, + { 0x1.bece0ep-1, 0x1.695e8cp-2 }, + { 0x1.c03464p-1, 0x1.634fa6p-2 }, + { 0x1.c194b2p-1, 0x1.5d4fd4p-2 }, + { 0x1.c2ef08p-1, 0x1.575f34p-2 }, + { 0x1.c44376p-1, 0x1.517de6p-2 }, + { 0x1.c5920ap-1, 0x1.4bac00p-2 }, + { 0x1.c6dad2p-1, 0x1.45e99cp-2 }, + { 0x1.c81de2p-1, 0x1.4036d0p-2 }, + { 0x1.c95b46p-1, 0x1.3a93b2p-2 }, + { 0x1.ca930ep-1, 0x1.350052p-2 }, + { 0x1.cbc54cp-1, 0x1.2f7cc4p-2 }, + { 0x1.ccf20cp-1, 0x1.2a0916p-2 }, + { 0x1.ce1962p-1, 0x1.24a554p-2 }, + { 0x1.cf3b5cp-1, 0x1.1f518ap-2 }, + { 0x1.d0580cp-1, 0x1.1a0dc6p-2 }, + { 0x1.d16f7ep-1, 0x1.14da0ap-2 }, + { 0x1.d281c4p-1, 0x1.0fb662p-2 }, + { 0x1.d38ef0p-1, 0x1.0aa2d0p-2 }, + { 0x1.d49710p-1, 0x1.059f5ap-2 }, + { 0x1.d59a34p-1, 0x1.00ac00p-2 }, + { 0x1.d6986cp-1, 0x1.f79184p-3 }, + { 0x1.d791cap-1, 0x1.edeb40p-3 }, + { 0x1.d8865ep-1, 0x1.e46530p-3 }, + { 0x1.d97636p-1, 0x1.daff4ap-3 }, + { 0x1.da6162p-1, 0x1.d1b982p-3 }, + { 0x1.db47f4p-1, 0x1.c893cep-3 }, + { 0x1.dc29fcp-1, 0x1.bf8e1cp-3 }, + { 0x1.dd0788p-1, 0x1.b6a856p-3 }, + { 0x1.dde0aap-1, 0x1.ade26cp-3 }, + { 0x1.deb570p-1, 0x1.a53c42p-3 }, + { 0x1.df85eap-1, 0x1.9cb5bep-3 }, + { 0x1.e0522ap-1, 0x1.944ec2p-3 }, + { 0x1.e11a3ep-1, 0x1.8c0732p-3 }, + { 0x1.e1de36p-1, 0x1.83deeap-3 }, + { 0x1.e29e22p-1, 0x1.7bd5c8p-3 }, + { 0x1.e35a12p-1, 0x1.73eba4p-3 }, + { 0x1.e41214p-1, 0x1.6c2056p-3 }, + { 0x1.e4c638p-1, 0x1.6473b6p-3 }, + { 0x1.e5768cp-1, 0x1.5ce596p-3 }, + { 0x1.e62322p-1, 0x1.5575c8p-3 }, + { 0x1.e6cc08p-1, 0x1.4e241ep-3 }, + { 0x1.e7714ap-1, 0x1.46f066p-3 }, + { 0x1.e812fcp-1, 0x1.3fda6cp-3 }, + { 0x1.e8b12ap-1, 0x1.38e1fap-3 }, + { 0x1.e94be4p-1, 0x1.3206dcp-3 }, + { 0x1.e9e336p-1, 0x1.2b48dap-3 }, + { 0x1.ea7730p-1, 0x1.24a7b8p-3 }, + { 0x1.eb07e2p-1, 0x1.1e233ep-3 }, + { 0x1.eb9558p-1, 0x1.17bb2cp-3 }, + { 0x1.ec1fa2p-1, 0x1.116f48p-3 }, + { 0x1.eca6ccp-1, 0x1.0b3f52p-3 }, + { 0x1.ed2ae6p-1, 0x1.052b0cp-3 }, + { 0x1.edabfcp-1, 0x1.fe6460p-4 }, + { 0x1.ee2a1ep-1, 0x1.f2a902p-4 }, + { 0x1.eea556p-1, 0x1.e72372p-4 }, + { 0x1.ef1db4p-1, 0x1.dbd32ap-4 }, + { 0x1.ef9344p-1, 0x1.d0b7a0p-4 }, + { 0x1.f00614p-1, 
0x1.c5d04ap-4 }, + { 0x1.f07630p-1, 0x1.bb1c98p-4 }, + { 0x1.f0e3a6p-1, 0x1.b09bfcp-4 }, + { 0x1.f14e82p-1, 0x1.a64de6p-4 }, + { 0x1.f1b6d0p-1, 0x1.9c31c6p-4 }, + { 0x1.f21ca0p-1, 0x1.92470ap-4 }, + { 0x1.f27ff8p-1, 0x1.888d1ep-4 }, + { 0x1.f2e0eap-1, 0x1.7f036cp-4 }, + { 0x1.f33f7ep-1, 0x1.75a960p-4 }, + { 0x1.f39bc2p-1, 0x1.6c7e64p-4 }, + { 0x1.f3f5c2p-1, 0x1.6381e2p-4 }, + { 0x1.f44d88p-1, 0x1.5ab342p-4 }, + { 0x1.f4a31ep-1, 0x1.5211ecp-4 }, + { 0x1.f4f694p-1, 0x1.499d48p-4 }, + { 0x1.f547f2p-1, 0x1.4154bcp-4 }, + { 0x1.f59742p-1, 0x1.3937b2p-4 }, + { 0x1.f5e490p-1, 0x1.31458ep-4 }, + { 0x1.f62fe8p-1, 0x1.297dbap-4 }, + { 0x1.f67952p-1, 0x1.21df9ap-4 }, + { 0x1.f6c0dcp-1, 0x1.1a6a96p-4 }, + { 0x1.f7068cp-1, 0x1.131e14p-4 }, + { 0x1.f74a6ep-1, 0x1.0bf97ep-4 }, + { 0x1.f78c8cp-1, 0x1.04fc3ap-4 }, + { 0x1.f7cceep-1, 0x1.fc4b5ep-5 }, + { 0x1.f80ba2p-1, 0x1.eeea8cp-5 }, + { 0x1.f848acp-1, 0x1.e1d4d0p-5 }, + { 0x1.f8841ap-1, 0x1.d508fap-5 }, + { 0x1.f8bdf2p-1, 0x1.c885e0p-5 }, + { 0x1.f8f63ep-1, 0x1.bc4a54p-5 }, + { 0x1.f92d08p-1, 0x1.b05530p-5 }, + { 0x1.f96256p-1, 0x1.a4a54ap-5 }, + { 0x1.f99634p-1, 0x1.99397ap-5 }, + { 0x1.f9c8a8p-1, 0x1.8e109cp-5 }, + { 0x1.f9f9bap-1, 0x1.83298ep-5 }, + { 0x1.fa2974p-1, 0x1.78832cp-5 }, + { 0x1.fa57dep-1, 0x1.6e1c58p-5 }, + { 0x1.fa84fep-1, 0x1.63f3f6p-5 }, + { 0x1.fab0dep-1, 0x1.5a08e8p-5 }, + { 0x1.fadb84p-1, 0x1.505a18p-5 }, + { 0x1.fb04f6p-1, 0x1.46e66cp-5 }, + { 0x1.fb2d40p-1, 0x1.3dacd2p-5 }, + { 0x1.fb5464p-1, 0x1.34ac36p-5 }, + { 0x1.fb7a6cp-1, 0x1.2be38cp-5 }, + { 0x1.fb9f60p-1, 0x1.2351c2p-5 }, + { 0x1.fbc344p-1, 0x1.1af5d2p-5 }, + { 0x1.fbe61ep-1, 0x1.12ceb4p-5 }, + { 0x1.fc07fap-1, 0x1.0adb60p-5 }, + { 0x1.fc28d8p-1, 0x1.031ad6p-5 }, + { 0x1.fc48c2p-1, 0x1.f7182ap-6 }, + { 0x1.fc67bcp-1, 0x1.e85c44p-6 }, + { 0x1.fc85d0p-1, 0x1.da0006p-6 }, + { 0x1.fca2fep-1, 0x1.cc0180p-6 }, + { 0x1.fcbf52p-1, 0x1.be5ecep-6 }, + { 0x1.fcdaccp-1, 0x1.b1160ap-6 }, + { 0x1.fcf576p-1, 0x1.a4255ap-6 }, + { 0x1.fd0f54p-1, 0x1.978ae8p-6 }, + { 0x1.fd286ap-1, 0x1.8b44e6p-6 }, + { 0x1.fd40bep-1, 0x1.7f5188p-6 }, + { 0x1.fd5856p-1, 0x1.73af0cp-6 }, + { 0x1.fd6f34p-1, 0x1.685bb6p-6 }, + { 0x1.fd8562p-1, 0x1.5d55ccp-6 }, + { 0x1.fd9ae2p-1, 0x1.529b9ep-6 }, + { 0x1.fdafb8p-1, 0x1.482b84p-6 }, + { 0x1.fdc3e8p-1, 0x1.3e03d8p-6 }, + { 0x1.fdd77ap-1, 0x1.3422fep-6 }, + { 0x1.fdea6ep-1, 0x1.2a875cp-6 }, + { 0x1.fdfcccp-1, 0x1.212f62p-6 }, + { 0x1.fe0e96p-1, 0x1.181984p-6 }, + { 0x1.fe1fd0p-1, 0x1.0f443ep-6 }, + { 0x1.fe3080p-1, 0x1.06ae14p-6 }, + { 0x1.fe40a6p-1, 0x1.fcab14p-7 }, + { 0x1.fe504cp-1, 0x1.ec7262p-7 }, + { 0x1.fe5f70p-1, 0x1.dcaf36p-7 }, + { 0x1.fe6e18p-1, 0x1.cd5ecap-7 }, + { 0x1.fe7c46p-1, 0x1.be7e5ap-7 }, + { 0x1.fe8a00p-1, 0x1.b00b38p-7 }, + { 0x1.fe9748p-1, 0x1.a202bep-7 }, + { 0x1.fea422p-1, 0x1.94624ep-7 }, + { 0x1.feb090p-1, 0x1.87275ep-7 }, + { 0x1.febc96p-1, 0x1.7a4f6ap-7 }, + { 0x1.fec836p-1, 0x1.6dd7fep-7 }, + { 0x1.fed374p-1, 0x1.61beaep-7 }, + { 0x1.fede52p-1, 0x1.56011cp-7 }, + { 0x1.fee8d4p-1, 0x1.4a9cf6p-7 }, + { 0x1.fef2fep-1, 0x1.3f8ff6p-7 }, + { 0x1.fefccep-1, 0x1.34d7dcp-7 }, + { 0x1.ff064cp-1, 0x1.2a727ap-7 }, + { 0x1.ff0f76p-1, 0x1.205dacp-7 }, + { 0x1.ff1852p-1, 0x1.169756p-7 }, + { 0x1.ff20e0p-1, 0x1.0d1d6ap-7 }, + { 0x1.ff2924p-1, 0x1.03ede2p-7 }, + { 0x1.ff3120p-1, 0x1.f60d8ap-8 }, + { 0x1.ff38d6p-1, 0x1.e4cc4ap-8 }, + { 0x1.ff4048p-1, 0x1.d4143ap-8 }, + { 0x1.ff4778p-1, 0x1.c3e1a6p-8 }, + { 0x1.ff4e68p-1, 0x1.b430ecp-8 }, + { 0x1.ff551ap-1, 0x1.a4fe84p-8 }, + { 0x1.ff5b90p-1, 0x1.9646f4p-8 }, + { 0x1.ff61ccp-1, 0x1.8806d8p-8 }, + { 
0x1.ff67d0p-1, 0x1.7a3adep-8 }, + { 0x1.ff6d9ep-1, 0x1.6cdfccp-8 }, + { 0x1.ff7338p-1, 0x1.5ff276p-8 }, + { 0x1.ff789ep-1, 0x1.536fc2p-8 }, + { 0x1.ff7dd4p-1, 0x1.4754acp-8 }, + { 0x1.ff82dap-1, 0x1.3b9e40p-8 }, + { 0x1.ff87b2p-1, 0x1.30499cp-8 }, + { 0x1.ff8c5cp-1, 0x1.2553eep-8 }, + { 0x1.ff90dcp-1, 0x1.1aba78p-8 }, + { 0x1.ff9532p-1, 0x1.107a8cp-8 }, + { 0x1.ff9960p-1, 0x1.06918cp-8 }, + { 0x1.ff9d68p-1, 0x1.f9f9d0p-9 }, + { 0x1.ffa14ap-1, 0x1.e77448p-9 }, + { 0x1.ffa506p-1, 0x1.d58da6p-9 }, + { 0x1.ffa8a0p-1, 0x1.c4412cp-9 }, + { 0x1.ffac18p-1, 0x1.b38a3ap-9 }, + { 0x1.ffaf6ep-1, 0x1.a36454p-9 }, + { 0x1.ffb2a6p-1, 0x1.93cb12p-9 }, + { 0x1.ffb5bep-1, 0x1.84ba30p-9 }, + { 0x1.ffb8b8p-1, 0x1.762d84p-9 }, + { 0x1.ffbb98p-1, 0x1.682100p-9 }, + { 0x1.ffbe5ap-1, 0x1.5a90b0p-9 }, + { 0x1.ffc102p-1, 0x1.4d78bcp-9 }, + { 0x1.ffc390p-1, 0x1.40d564p-9 }, + { 0x1.ffc606p-1, 0x1.34a306p-9 }, + { 0x1.ffc862p-1, 0x1.28de12p-9 }, + { 0x1.ffcaa8p-1, 0x1.1d8318p-9 }, + { 0x1.ffccd8p-1, 0x1.128ebap-9 }, + { 0x1.ffcef4p-1, 0x1.07fdb4p-9 }, + { 0x1.ffd0fap-1, 0x1.fb99b8p-10 }, + { 0x1.ffd2eap-1, 0x1.e7f232p-10 }, + { 0x1.ffd4cap-1, 0x1.d4fed8p-10 }, + { 0x1.ffd696p-1, 0x1.c2b9d0p-10 }, + { 0x1.ffd84ep-1, 0x1.b11d70p-10 }, + { 0x1.ffd9f8p-1, 0x1.a02436p-10 }, + { 0x1.ffdb90p-1, 0x1.8fc8c8p-10 }, + { 0x1.ffdd18p-1, 0x1.8005f0p-10 }, + { 0x1.ffde90p-1, 0x1.70d6a4p-10 }, + { 0x1.ffdffap-1, 0x1.6235fcp-10 }, + { 0x1.ffe154p-1, 0x1.541f34p-10 }, + { 0x1.ffe2a2p-1, 0x1.468daep-10 }, + { 0x1.ffe3e2p-1, 0x1.397ceep-10 }, + { 0x1.ffe514p-1, 0x1.2ce898p-10 }, + { 0x1.ffe63cp-1, 0x1.20cc76p-10 }, + { 0x1.ffe756p-1, 0x1.15246ep-10 }, + { 0x1.ffe866p-1, 0x1.09ec86p-10 }, + { 0x1.ffe96ap-1, 0x1.fe41cep-11 }, + { 0x1.ffea64p-1, 0x1.e97ba4p-11 }, + { 0x1.ffeb54p-1, 0x1.d57f52p-11 }, + { 0x1.ffec3ap-1, 0x1.c245d4p-11 }, + { 0x1.ffed16p-1, 0x1.afc85ep-11 }, + { 0x1.ffedeap-1, 0x1.9e0058p-11 }, + { 0x1.ffeeb4p-1, 0x1.8ce75ep-11 }, + { 0x1.ffef76p-1, 0x1.7c7744p-11 }, + { 0x1.fff032p-1, 0x1.6caa0ep-11 }, + { 0x1.fff0e4p-1, 0x1.5d79ecp-11 }, + { 0x1.fff18ep-1, 0x1.4ee142p-11 }, + { 0x1.fff232p-1, 0x1.40daa4p-11 }, + { 0x1.fff2d0p-1, 0x1.3360ccp-11 }, + { 0x1.fff366p-1, 0x1.266ea8p-11 }, + { 0x1.fff3f6p-1, 0x1.19ff46p-11 }, + { 0x1.fff480p-1, 0x1.0e0de8p-11 }, + { 0x1.fff504p-1, 0x1.0295f0p-11 }, + { 0x1.fff582p-1, 0x1.ef25d4p-12 }, + { 0x1.fff5fcp-1, 0x1.da0110p-12 }, + { 0x1.fff670p-1, 0x1.c5b542p-12 }, + { 0x1.fff6dep-1, 0x1.b23a5ap-12 }, + { 0x1.fff74ap-1, 0x1.9f8894p-12 }, + { 0x1.fff7aep-1, 0x1.8d986ap-12 }, + { 0x1.fff810p-1, 0x1.7c629ap-12 }, + { 0x1.fff86cp-1, 0x1.6be022p-12 }, + { 0x1.fff8c6p-1, 0x1.5c0a38p-12 }, + { 0x1.fff91cp-1, 0x1.4cda54p-12 }, + { 0x1.fff96cp-1, 0x1.3e4a24p-12 }, + { 0x1.fff9bap-1, 0x1.305390p-12 }, + { 0x1.fffa04p-1, 0x1.22f0b4p-12 }, + { 0x1.fffa4cp-1, 0x1.161be4p-12 }, + { 0x1.fffa90p-1, 0x1.09cfa4p-12 }, + { 0x1.fffad0p-1, 0x1.fc0d56p-13 }, + { 0x1.fffb0ep-1, 0x1.e577bcp-13 }, + { 0x1.fffb4ap-1, 0x1.cfd4a6p-13 }, + { 0x1.fffb82p-1, 0x1.bb1a96p-13 }, + { 0x1.fffbb8p-1, 0x1.a74068p-13 }, + { 0x1.fffbecp-1, 0x1.943d4ap-13 }, + { 0x1.fffc1ep-1, 0x1.8208bcp-13 }, + { 0x1.fffc4ep-1, 0x1.709a8ep-13 }, + { 0x1.fffc7ap-1, 0x1.5feadap-13 }, + { 0x1.fffca6p-1, 0x1.4ff208p-13 }, + { 0x1.fffccep-1, 0x1.40a8c2p-13 }, + { 0x1.fffcf6p-1, 0x1.3207fcp-13 }, + { 0x1.fffd1ap-1, 0x1.2408eap-13 }, + { 0x1.fffd3ep-1, 0x1.16a502p-13 }, + { 0x1.fffd60p-1, 0x1.09d5f8p-13 }, + { 0x1.fffd80p-1, 0x1.fb2b7ap-14 }, + { 0x1.fffda0p-1, 0x1.e3bcf4p-14 }, + { 0x1.fffdbep-1, 0x1.cd5528p-14 }, + { 0x1.fffddap-1, 0x1.b7e946p-14 
}, + { 0x1.fffdf4p-1, 0x1.a36eecp-14 }, + { 0x1.fffe0ep-1, 0x1.8fdc1cp-14 }, + { 0x1.fffe26p-1, 0x1.7d2738p-14 }, + { 0x1.fffe3ep-1, 0x1.6b4702p-14 }, + { 0x1.fffe54p-1, 0x1.5a329cp-14 }, + { 0x1.fffe68p-1, 0x1.49e178p-14 }, + { 0x1.fffe7ep-1, 0x1.3a4b60p-14 }, + { 0x1.fffe90p-1, 0x1.2b6876p-14 }, + { 0x1.fffea2p-1, 0x1.1d3120p-14 }, + { 0x1.fffeb4p-1, 0x1.0f9e1cp-14 }, + { 0x1.fffec4p-1, 0x1.02a868p-14 }, + { 0x1.fffed4p-1, 0x1.ec929ap-15 }, + { 0x1.fffee4p-1, 0x1.d4f4b4p-15 }, + { 0x1.fffef2p-1, 0x1.be6abcp-15 }, + { 0x1.ffff00p-1, 0x1.a8e8ccp-15 }, + { 0x1.ffff0cp-1, 0x1.94637ep-15 }, + { 0x1.ffff18p-1, 0x1.80cfdcp-15 }, + { 0x1.ffff24p-1, 0x1.6e2368p-15 }, + { 0x1.ffff30p-1, 0x1.5c540cp-15 }, + { 0x1.ffff3ap-1, 0x1.4b581cp-15 }, + { 0x1.ffff44p-1, 0x1.3b2652p-15 }, + { 0x1.ffff4ep-1, 0x1.2bb5ccp-15 }, + { 0x1.ffff56p-1, 0x1.1cfe02p-15 }, + { 0x1.ffff60p-1, 0x1.0ef6c4p-15 }, + { 0x1.ffff68p-1, 0x1.019842p-15 }, + { 0x1.ffff70p-1, 0x1.e9b5e8p-16 }, + { 0x1.ffff78p-1, 0x1.d16f58p-16 }, + { 0x1.ffff7ep-1, 0x1.ba4f04p-16 }, + { 0x1.ffff84p-1, 0x1.a447b8p-16 }, + { 0x1.ffff8cp-1, 0x1.8f4cccp-16 }, + { 0x1.ffff92p-1, 0x1.7b5224p-16 }, + { 0x1.ffff98p-1, 0x1.684c22p-16 }, + { 0x1.ffff9cp-1, 0x1.562facp-16 }, + { 0x1.ffffa2p-1, 0x1.44f21ep-16 }, + { 0x1.ffffa6p-1, 0x1.34894ap-16 }, + { 0x1.ffffacp-1, 0x1.24eb72p-16 }, + { 0x1.ffffb0p-1, 0x1.160f44p-16 }, + { 0x1.ffffb4p-1, 0x1.07ebd2p-16 }, + { 0x1.ffffb8p-1, 0x1.f4f12ep-17 }, + { 0x1.ffffbcp-1, 0x1.db5ad0p-17 }, + { 0x1.ffffc0p-1, 0x1.c304f0p-17 }, + { 0x1.ffffc4p-1, 0x1.abe09ep-17 }, + { 0x1.ffffc6p-1, 0x1.95df98p-17 }, + { 0x1.ffffcap-1, 0x1.80f43ap-17 }, + { 0x1.ffffccp-1, 0x1.6d1178p-17 }, + { 0x1.ffffd0p-1, 0x1.5a2ae0p-17 }, + { 0x1.ffffd2p-1, 0x1.483488p-17 }, + { 0x1.ffffd4p-1, 0x1.372310p-17 }, + { 0x1.ffffd6p-1, 0x1.26eb9ep-17 }, + { 0x1.ffffd8p-1, 0x1.1783cep-17 }, + { 0x1.ffffdcp-1, 0x1.08e1bap-17 }, + { 0x1.ffffdep-1, 0x1.f5f7d8p-18 }, + { 0x1.ffffdep-1, 0x1.db92b6p-18 }, + { 0x1.ffffe0p-1, 0x1.c282cep-18 }, + { 0x1.ffffe2p-1, 0x1.aab7acp-18 }, + { 0x1.ffffe4p-1, 0x1.94219cp-18 }, + { 0x1.ffffe6p-1, 0x1.7eb1a2p-18 }, + { 0x1.ffffe8p-1, 0x1.6a5972p-18 }, + { 0x1.ffffe8p-1, 0x1.570b6ap-18 }, + { 0x1.ffffeap-1, 0x1.44ba86p-18 }, + { 0x1.ffffeap-1, 0x1.335a62p-18 }, + { 0x1.ffffecp-1, 0x1.22df2ap-18 }, + { 0x1.ffffeep-1, 0x1.133d96p-18 }, + { 0x1.ffffeep-1, 0x1.046aeap-18 }, + { 0x1.fffff0p-1, 0x1.ecb9d0p-19 }, + { 0x1.fffff0p-1, 0x1.d21398p-19 }, + { 0x1.fffff2p-1, 0x1.b8d094p-19 }, + { 0x1.fffff2p-1, 0x1.a0df10p-19 }, + { 0x1.fffff2p-1, 0x1.8a2e26p-19 }, + { 0x1.fffff4p-1, 0x1.74adc8p-19 }, + { 0x1.fffff4p-1, 0x1.604ea8p-19 }, + { 0x1.fffff4p-1, 0x1.4d0232p-19 }, + { 0x1.fffff6p-1, 0x1.3aba86p-19 }, + { 0x1.fffff6p-1, 0x1.296a70p-19 }, + { 0x1.fffff6p-1, 0x1.190562p-19 }, + { 0x1.fffff8p-1, 0x1.097f62p-19 }, + { 0x1.fffff8p-1, 0x1.f59a20p-20 }, + { 0x1.fffff8p-1, 0x1.d9c736p-20 }, + { 0x1.fffff8p-1, 0x1.bf716cp-20 }, + { 0x1.fffffap-1, 0x1.a6852cp-20 }, + { 0x1.fffffap-1, 0x1.8eefd8p-20 }, + { 0x1.fffffap-1, 0x1.789fb8p-20 }, + { 0x1.fffffap-1, 0x1.6383f8p-20 }, + { 0x1.fffffap-1, 0x1.4f8c96p-20 }, + { 0x1.fffffap-1, 0x1.3caa62p-20 }, + { 0x1.fffffcp-1, 0x1.2acee2p-20 }, + { 0x1.fffffcp-1, 0x1.19ec60p-20 }, + { 0x1.fffffcp-1, 0x1.09f5d0p-20 }, + { 0x1.fffffcp-1, 0x1.f5bd96p-21 }, + { 0x1.fffffcp-1, 0x1.d9371ep-21 }, + { 0x1.fffffcp-1, 0x1.be41dep-21 }, + { 0x1.fffffcp-1, 0x1.a4c89ep-21 }, + { 0x1.fffffcp-1, 0x1.8cb738p-21 }, + { 0x1.fffffep-1, 0x1.75fa8ep-21 }, + { 0x1.fffffep-1, 0x1.608078p-21 }, + { 0x1.fffffep-1, 0x1.4c37c0p-21 }, 
+ { 0x1.fffffep-1, 0x1.39100ep-21 }, + { 0x1.fffffep-1, 0x1.26f9e0p-21 }, + { 0x1.fffffep-1, 0x1.15e682p-21 }, + { 0x1.fffffep-1, 0x1.05c804p-21 }, + { 0x1.fffffep-1, 0x1.ed2254p-22 }, + { 0x1.fffffep-1, 0x1.d06ad6p-22 }, + { 0x1.fffffep-1, 0x1.b551c8p-22 }, + { 0x1.fffffep-1, 0x1.9bc0a0p-22 }, + { 0x1.fffffep-1, 0x1.83a200p-22 }, + { 0x1.fffffep-1, 0x1.6ce1aap-22 }, + { 0x1.fffffep-1, 0x1.576c72p-22 }, + { 0x1.fffffep-1, 0x1.43302cp-22 }, + { 0x1.fffffep-1, 0x1.301ba2p-22 }, + { 0x1.fffffep-1, 0x1.1e1e86p-22 }, + { 0x1.fffffep-1, 0x1.0d2966p-22 }, + { 0x1.000000p+0, 0x1.fa5b50p-23 }, + { 0x1.000000p+0, 0x1.dc3ae4p-23 }, + { 0x1.000000p+0, 0x1.bfd756p-23 }, + { 0x1.000000p+0, 0x1.a517dap-23 }, + { 0x1.000000p+0, 0x1.8be4f8p-23 }, + { 0x1.000000p+0, 0x1.74287ep-23 }, + { 0x1.000000p+0, 0x1.5dcd66p-23 }, + { 0x1.000000p+0, 0x1.48bfd4p-23 }, + { 0x1.000000p+0, 0x1.34ecf8p-23 }, + { 0x1.000000p+0, 0x1.224310p-23 }, + { 0x1.000000p+0, 0x1.10b148p-23 }, + }, +}; diff --git a/math/aarch64/v_exp2f_1u.c b/math/aarch64/v_exp2f_1u.c deleted file mode 100644 index ba6b02fbb4bc..000000000000 --- a/math/aarch64/v_exp2f_1u.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const float Poly[] = { - /* maxerr: 0.878 ulp. */ - 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) -#define C5 v_f32 (Poly[5]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -static float32x4_t VPCS_ATTR NOINLINE -specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); - float32x4_t s2 = vreinterpretq_f32_u32 (e - b); - uint32x4_t cmp = absn > v_f32 (192.0f); - float32x4_t r1 = s1 * s1; - float32x4_t r0 = poly * s1 * s2; - return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) - | (~cmp & vreinterpretq_u32_f32 (r0))); -} - -float32x4_t VPCS_ATTR -_ZGVnN4v_exp2f_1u (float32x4_t x) -{ - float32x4_t n, r, scale, poly, absn; - uint32x4_t cmp, e; - - /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - float32x4_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = vreinterpretq_u32_f32 (z) << 23; -#else - n = vrndaq_f32 (x); - r = x - n; - e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; -#endif - scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); - absn = vabsq_f32 (n); - cmp = absn > v_f32 (126.0f); - poly = vfmaq_f32 (C1, C0, r); - poly = vfmaq_f32 (C2, poly, r); - poly = vfmaq_f32 (C3, poly, r); - poly = vfmaq_f32 (C4, poly, r); - poly = vfmaq_f32 (C5, poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c index 45f0848cac5b..59db77ac58cc 100644 --- a/math/aarch64/v_exp_data.c +++ b/math/aarch64/v_exp_data.c @@ -1,17 +1,14 @@ /* - * Lookup table for double-precision e^x vector function. 
+ * Scale values for vector exp and exp2 * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "v_math.h" +#include "math_config.h" -# define N (1 << V_EXP_TABLE_BITS) - -/* 2^(j/N), j=0..N. */ +/* 2^(j/N), j=0..N, N=2^7=128. */ const uint64_t __v_exp_data[] = { -# if N == 128 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, @@ -55,92 +52,4 @@ const uint64_t __v_exp_data[] = { 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, -# elif N == 256 - 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, - 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, - 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, - 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, - 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, - 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, - 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, - 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, - 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, - 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, - 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, - 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, - 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, - 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, - 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, - 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, - 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, - 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, - 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, - 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, - 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, - 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, - 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, - 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, - 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, - 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, - 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, - 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, - 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, - 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, - 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, - 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, - 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, - 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, - 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, - 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, - 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, - 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, - 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, - 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, - 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, - 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, - 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, - 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, - 0x3fee9fb23c651a2f, 
0x3fee9f9298593ae5, 0x3fee9f7df9519484, - 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, - 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, - 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, - 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, - 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, - 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, - 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, - 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, - 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, - 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, - 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, - 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, - 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, - 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, - 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, - 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, - 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, - 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, - 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, - 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, - 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, - 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, - 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, - 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, - 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, - 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, - 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, - 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, - 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, - 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, - 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, - 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, - 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, - 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, - 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, - 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, - 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, - 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, - 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, - 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, - 0x3feff9d96b2a23d9, -# endif }; diff --git a/math/aarch64/v_exp_tail_data.c b/math/aarch64/v_exp_tail_data.c new file mode 100644 index 000000000000..5cc58a40b6b7 --- /dev/null +++ b/math/aarch64/v_exp_tail_data.c @@ -0,0 +1,98 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 2^(j/N), j=0..N, N=2^8=256. 
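   Note the entries are not raw bit patterns of 2^(j/N): judging by the
   values (and by the sbits convention spelled out in v_pow_exp_data.c
   later in this diff), asuint64(2^(j/N)) is stored with (j << 52)/N
   already subtracted, so the runtime can restore the exponent with one
   shift-add.  A host-side generator sketch under that assumption
   (last-bit differences are possible if the original table was produced
   at higher precision):

     #include <math.h>
     #include <stdint.h>
     #include <stdio.h>
     #include <string.h>

     int
     main (void)
     {
       for (int j = 0; j < 256; j++)
         {
           double v = exp2 (j / 256.0);
           uint64_t bits;
           memcpy (&bits, &v, sizeof bits);
           bits -= (uint64_t) j << 44;   // (j << 52) / N with N = 256
           printf ("0x%016llx,\n", (unsigned long long) bits);
         }
       return 0;
     }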
*/ +const uint64_t __v_exp_tail_data[] = { + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 
0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9, +}; diff --git a/math/aarch64/v_expf_1u.c b/math/aarch64/v_expf_1u.c deleted file mode 100644 index 43d03fa34efa..000000000000 --- a/math/aarch64/v_expf_1u.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const float Poly[] = { - /* maxerr: 0.36565 +0.5 ulp. */ - 0x1.6a6000p-10f, - 0x1.12718ep-7f, - 0x1.555af0p-5f, - 0x1.555430p-3f, - 0x1.fffff4p-2f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -static float32x4_t VPCS_ATTR NOINLINE -specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); - float32x4_t s2 = vreinterpretq_f32_u32 (e - b); - uint32x4_t cmp = absn > v_f32 (192.0f); - float32x4_t r1 = s1 * s1; - float32x4_t r0 = poly * s1 * s2; - return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) - | (~cmp & vreinterpretq_u32_f32 (r0))); -} - -float32x4_t VPCS_ATTR -_ZGVnN4v_expf_1u (float32x4_t x) -{ - float32x4_t n, r, scale, poly, absn, z; - uint32x4_t cmp, e; - - /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
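   The Ln2hi/Ln2lo pair used below is a Cody-Waite split of ln2: Ln2hi
   (0x1.62e4p-1) keeps only the top bits, so n * Ln2hi is exact for the
   |n| <= 192 range this kernel handles, and Ln2lo = ln2 - Ln2hi restores
   the discarded bits with a second fma.  Scalar sketch of the reduction:

     float n = roundf (x * 0x1.715476p+0f);   // round(x / ln2)
     float r = fmaf (n, -0x1.62e4p-1f, x);    // x - n*Ln2hi, exact
     r = fmaf (n, -0x1.7f7d1cp-20f, r);       // ... - n*Ln2lo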
*/ -#if 1 - z = vfmaq_f32 (Shift, x, InvLn2); - n = z - Shift; - r = vfmaq_f32 (x, n, -Ln2hi); - r = vfmaq_f32 (r, n, -Ln2lo); - e = vreinterpretq_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = vrndaq_f32 (z); - r = vfmaq_f32 (x, n, -Ln2hi); - r = vfmaq_f32 (r, n, -Ln2lo); - e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; -#endif - scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); - absn = vabsq_f32 (n); - cmp = absn > v_f32 (126.0f); - poly = vfmaq_f32 (C1, C0, r); - poly = vfmaq_f32 (C2, poly, r); - poly = vfmaq_f32 (C3, poly, r); - poly = vfmaq_f32 (C4, poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c deleted file mode 100644 index 1d1c1fa62c04..000000000000 --- a/math/aarch64/v_log.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - uint64x2_t min_norm; - uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t ln2; - uint64x2_t sign_exp_mask; -} data = { - /* Worst-case error: 1.17 + 0.5 ulp. - Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), - V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), - V2 (-0x1.554e550bd501ep-3) }, - .ln2 = V2 (0x1.62e42fefa39efp-1), - .min_norm = V2 (0x0010000000000000), - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ - .sign_exp_mask = V2 (0xfff0000000000000) -}; - -#define A(i) d->poly[i] -#define N (1 << V_LOG_TABLE_BITS) -#define IndexMask (N - 1) -#define Off v_u64 (0x3fe6900900000000) - -struct entry -{ - float64x2_t invc; - float64x2_t logc; -}; - -static inline struct entry -lookup (uint64x2_t i) -{ - /* Since N is a power of 2, n % N = n & (N - 1). */ - struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); - float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); - e.invc = vuzp1q_f64 (e0, e1); - e.logc = vuzp2q_f64 (e0, e1); - return e; -} - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, - uint32x2_t cmp) -{ - return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); -} - -float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - float64x2_t z, r, r2, p, y, kd, hi; - uint64x2_t ix, iz, tmp; - uint32x2_t cmp; - int64x2_t k; - struct entry e; - - ix = vreinterpretq_u64_f64 (x); - cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); - - /* x = 2^k z; where z is in range [Off,2*Off) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = vsubq_u64 (ix, Off); - k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ - iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); - z = vreinterpretq_f64_u64 (iz); - e = lookup (tmp); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); - kd = vcvtq_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. 
*/
-  hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
-  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
-  r2 = vmulq_f64 (r, r);
-  y = vfmaq_f64 (A (2), A (3), r);
-  p = vfmaq_f64 (A (0), A (1), r);
-  y = vfmaq_f64 (y, A (4), r2);
-  y = vfmaq_f64 (p, y, r2);
-
-  if (unlikely (v_any_u32h (cmp)))
-    return special_case (x, y, hi, r2, cmp);
-  return vfmaq_f64 (hi, y, r2);
-}
diff --git a/math/aarch64/v_log10_data.c b/math/aarch64/v_log10_data.c
new file mode 100644
index 000000000000..bae2685822f6
--- /dev/null
+++ b/math/aarch64/v_log10_data.c
@@ -0,0 +1,163 @@
+/*
+ * Lookup table for double-precision log10(x) vector function.
+ *
+ * Copyright (c) 2022-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct v_log10_data __v_log10_data = {
+  /* Computed from log's coefficients divided by log(10), then rounded to
+     double precision. */
+  .poly = { -0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4,
+            0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4 },
+  .invln10 = 0x1.bcb7b1526e50ep-2,
+  .log10_2 = 0x1.34413509f79ffp-2,
+  /* Algorithm:
+
+        x = 2^k z
+        log10(x) = k log10(2) + log10(c) + poly(z/c - 1) / log(10)
+
+     where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+     N=128) and log(c) and 1/c for the ith subinterval come from lookup
+     tables:
+
+        table[i].invc = 1/c
+        table[i].log10c = (double)log10(c)
+
+     where c is near the center of the subinterval and is chosen by trying
+     several floating point invc candidates around 1/center and selecting one
+     for which the error in (double)log(c) is minimized (< 0x1p-74), except the
+     subinterval that contains 1 and the previous one got tweaked to avoid
+     cancellation. NB: invc should be optimized to minimize error in
+     (double)log10(c) instead.
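   Given the double-precision log table later in this diff (v_log_data.c),
   the entries below can be reproduced up to final rounding by rescaling,
   since log10(c) = log(c) / log(10).  Hypothetical generator loop:

     // ln(10) ~= 2.302585092994045684
     for (int i = 0; i < 128; i++)
       printf ("{ %a, %a },\n", __v_log_data.table[i].invc,
               __v_log_data.table[i].logc / 2.302585092994045684);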
*/ + .table = { { 0x1.6a133d0dec120p+0, -0x1.345825f221684p-3 }, + { 0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3 }, + { 0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3 }, + { 0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3 }, + { 0x1.623f1d916f323p+0, -0x1.20e7081762193p-3 }, + { 0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3 }, + { 0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3 }, + { 0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3 }, + { 0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3 }, + { 0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3 }, + { 0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3 }, + { 0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4 }, + { 0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4 }, + { 0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4 }, + { 0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4 }, + { 0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4 }, + { 0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4 }, + { 0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4 }, + { 0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4 }, + { 0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4 }, + { 0x1.446f12b278001p+0, -0x1.a56c091954f87p-4 }, + { 0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4 }, + { 0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4 }, + { 0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4 }, + { 0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4 }, + { 0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4 }, + { 0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4 }, + { 0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4 }, + { 0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4 }, + { 0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4 }, + { 0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4 }, + { 0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4 }, + { 0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4 }, + { 0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4 }, + { 0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4 }, + { 0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4 }, + { 0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4 }, + { 0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4 }, + { 0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4 }, + { 0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5 }, + { 0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5 }, + { 0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5 }, + { 0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5 }, + { 0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5 }, + { 0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5 }, + { 0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5 }, + { 0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5 }, + { 0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5 }, + { 0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5 }, + { 0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5 }, + { 0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5 }, + { 0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5 }, + { 0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5 }, + { 0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5 }, + { 0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6 }, + { 0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6 }, + { 0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6 }, + { 0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6 }, + { 0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6 }, + { 0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6 }, + { 0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6 }, + { 0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7 }, + { 0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7 }, + { 0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7 }, + { 0x1.062491aee9904p+0, 
-0x1.517249c15a75cp-7 }, + { 0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8 }, + { 0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8 }, + { 0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9 }, + { 0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9 }, + { 0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8 }, + { 0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7 }, + { 0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7 }, + { 0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6 }, + { 0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6 }, + { 0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6 }, + { 0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6 }, + { 0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6 }, + { 0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5 }, + { 0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5 }, + { 0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5 }, + { 0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5 }, + { 0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5 }, + { 0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5 }, + { 0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5 }, + { 0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5 }, + { 0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4 }, + { 0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4 }, + { 0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4 }, + { 0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4 }, + { 0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4 }, + { 0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4 }, + { 0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4 }, + { 0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4 }, + { 0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4 }, + { 0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4 }, + { 0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4 }, + { 0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4 }, + { 0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4 }, + { 0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4 }, + { 0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4 }, + { 0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4 }, + { 0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4 }, + { 0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4 }, + { 0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4 }, + { 0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4 }, + { 0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4 }, + { 0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3 }, + { 0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3 }, + { 0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3 }, + { 0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3 }, + { 0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3 }, + { 0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3 }, + { 0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3 }, + { 0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3 }, + { 0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3 }, + { 0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3 } } +}; diff --git a/math/aarch64/v_log2_data.c b/math/aarch64/v_log2_data.c new file mode 100644 index 000000000000..fad91d654da8 --- /dev/null +++ b/math/aarch64/v_log2_data.c @@ -0,0 +1,153 @@ +/* + * Coefficients and table entries for vector log2 + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_LOG2_TABLE_BITS) + +const struct v_log2_data __v_log2_data = { + + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. 
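   Concretely: each ln-based coefficient c_k of the plain log polynomial
   becomes c_k * log2(e) here; doing the multiply directly in double would
   risk an extra rounding, hence the extended-precision step.  A long
   double stand-in for the first coefficient (illustrative only):

     // -0x1.ffffffffffff7p-2 is poly[0] of __v_log_data
     double c0 = (double) (-0x1.ffffffffffff7p-2L
                           * 1.442695040888963407359924681001892L);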
*/ + .poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2, + 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 }, + + .invln2 = 0x1.71547652b82fep0, + + /* Derived from tables in v_log_data.c in a similar way as v_log10_data.c. + This means invc is unchanged and log2c was calculated by scaling log(c) by + log2(e) in extended precision and rounding back to double precision. */ + .table = { { 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 }, + { 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 }, + { 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 }, + { 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 }, + { 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 }, + { 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 }, + { 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 }, + { 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 }, + { 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 }, + { 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 }, + { 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 }, + { 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 }, + { 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 }, + { 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 }, + { 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 }, + { 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 }, + { 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 }, + { 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 }, + { 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 }, + { 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 }, + { 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 }, + { 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 }, + { 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 }, + { 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 }, + { 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 }, + { 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 }, + { 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 }, + { 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 }, + { 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.bfc2b38d315f9p-3 }, + { 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 }, + { 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 }, + { 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 }, + { 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 }, + { 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 }, + { 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 }, + { 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 }, + { 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 }, + { 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 }, + { 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 }, + { 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 }, + { 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 }, + { 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 }, + { 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 }, + { 0x1.0ecf062c51c3bp+0, 
-0x1.4c4f31c86d344p-4 }, + { 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 }, + { 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 }, + { 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 }, + { 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 }, + { 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 }, + { 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 }, + { 0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 }, + { 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 }, + { 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 }, + { 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 }, + { 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 }, + { 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 }, + { 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 }, + { 0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 }, + { 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 }, + { 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 }, + { 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 }, + { 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 }, + { 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 }, + { 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 }, + { 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 }, + { 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 }, + { 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 }, + { 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 }, + { 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 }, + { 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 }, + { 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 }, + { 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 }, + { 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 }, + { 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 }, + { 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 }, + { 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 }, + { 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 }, + { 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 }, + { 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 }, + { 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 }, + { 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 }, + { 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 }, + { 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 }, + { 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 }, + { 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.bd119575364c1p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 }, + { 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 }, + { 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 }, + { 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 } } +}; diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c index 82351bb14766..4f0e6e167381 100644 --- a/math/aarch64/v_log_data.c +++ b/math/aarch64/v_log_data.c @@ -1,30 +1,35 @@ /* * Lookup table for double-precision log(x) vector 
function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "v_math.h" - -#define N (1 << V_LOG_TABLE_BITS) +#include "math_config.h" const struct v_log_data __v_log_data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 }, + .ln2 = 0x1.62e42fefa39efp-1, /* Algorithm: x = 2^k z log(x) = k ln2 + log(c) + poly(z/c - 1) - where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, - N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables: + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from two lookup + tables: table[i].invc = 1/c table[i].logc = (double)log(c) - where c is near the center of the subinterval and is chosen by trying several - floating point invc candidates around 1/center and selecting one for which - the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval - that contains 1 and the previous one got tweaked to avoid cancellation. */ + where c is near the center of the subinterval and is chosen by trying + several floating point invc candidates around 1/center and selecting one + for which the error in (double)log(c) is minimized (< 0x1p-74), except the + subinterval that contains 1 and the previous one got tweaked to avoid + cancellation. */ .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c deleted file mode 100644 index 66ebbbcd2b5a..000000000000 --- a/math/aarch64/v_logf.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Single-precision vector log function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - uint32x4_t min_norm; - uint16x8_t special_bound; - float32x4_t poly[7]; - float32x4_t ln2, tiny_bound; - uint32x4_t off, mantissa_mask; -} data = { - /* 3.34 ulp error. */ - .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), - V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), - V4 (-0x1.ffffc8p-2f) }, - .ln2 = V4 (0x1.62e43p-1f), - .tiny_bound = V4 (0x1p-126), - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ - .off = V4 (0x3f2aaaab), /* 0.666667. */ - .mantissa_mask = V4 (0x007fffff) -}; - -#define P(i) d->poly[7 - i] - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, - uint16x4_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); -} - -float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, p, q, r, r2, y; - uint32x4_t u; - uint16x4_t cmp; - - u = vreinterpretq_u32_f32 (x); - cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); - n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. 
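   (After subtracting the off constant, roughly asuint32(2/3), the
   exponent of x relative to 2/3 sits in bits 23 and up as a signed
   quantity, so the arithmetic shift by 23 both extracts n and
   sign-extends it; negative exponents need no separate handling.)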
*/ - u = vandq_u32 (u, d->mantissa_mask); - u = vaddq_u32 (u, d->off); - r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); - - /* y = log(1+r) + n*ln2. */ - r2 = vmulq_f32 (r, r); - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = vfmaq_f32 (P (5), P (6), r); - q = vfmaq_f32 (P (3), P (4), r); - y = vfmaq_f32 (P (1), P (2), r); - p = vfmaq_f32 (p, P (7), r2); - q = vfmaq_f32 (q, p, r2); - y = vfmaq_f32 (y, q, r2); - p = vfmaq_f32 (r, d->ln2, n); - - if (unlikely (v_any_u16h (cmp))) - return special_case (x, y, r2, p, cmp); - return vfmaq_f32 (p, y, r2); -} diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h deleted file mode 100644 index 1dc9916c6fb0..000000000000 --- a/math/aarch64/v_math.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#if !__aarch64__ -# error "Cannot build without AArch64" -#endif - -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) - -#define V_NAME_F1(fun) _ZGVnN4v_##fun##f -#define V_NAME_D1(fun) _ZGVnN2v_##fun -#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f -#define V_NAME_D2(fun) _ZGVnN2vv_##fun - -#include <stdint.h> -#include "../math_config.h" -#include <arm_neon.h> - -/* Shorthand helpers for declaring constants. */ -# define V2(X) { X, X } -# define V4(X) { X, X, X, X } -# define V8(X) { X, X, X, X, X, X, X, X } - -static inline int -v_any_u16h (uint16x4_t x) -{ - return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; -} - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline float32x4_t -v_f32 (float x) -{ - return (float32x4_t) V4 (x); -} -static inline uint32x4_t -v_u32 (uint32_t x) -{ - return (uint32x4_t) V4 (x); -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (uint32x4_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -static inline int -v_any_u32h (uint32x2_t x) -{ - return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; -} -static inline float32x4_t -v_lookup_f32 (const float *tab, uint32x4_t idx) -{ - return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline uint32x4_t -v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) -{ - return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline float32x4_t -v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) -{ - return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline float32x4_t -v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, - float32x4_t y, uint32x4_t p) -{ - return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], - p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], - p[3] ? f (x1[3], x2[3]) : y[3]}; -} - -static inline int -v_lanes64 (void) -{ - return 2; -} -static inline float64x2_t -v_f64 (double x) -{ - return (float64x2_t) V2 (x); -} -static inline uint64x2_t -v_u64 (uint64_t x) -{ - return (uint64x2_t) V2 (x); -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (uint64x2_t x) -{ - /* assume elements in x are either 0 or -1u. 
*/ - return vpaddd_u64 (x) != 0; -} -static inline float64x2_t -v_lookup_f64 (const double *tab, uint64x2_t idx) -{ - return (float64x2_t){tab[idx[0]], tab[idx[1]]}; -} -static inline uint64x2_t -v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) -{ - return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; -} -static inline float64x2_t -v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) -{ - double p1 = p[1]; - double x1 = x[1]; - if (likely (p[0])) - y[0] = f (x[0]); - if (likely (p1)) - y[1] = f (x1); - return y; -} - -#endif diff --git a/math/aarch64/v_pow.c b/math/aarch64/v_pow.c deleted file mode 100644 index 734f1663a283..000000000000 --- a/math/aarch64/v_pow.c +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Double-precision vector pow function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) -{ - float64x2_t z; - for (int lane = 0; lane < v_lanes64 (); lane++) - { - double sx = x[lane]; - double sy = y[lane]; - double sz = pow (sx, sy); - z[lane] = sz; - } - return z; -} diff --git a/math/aarch64/v_pow_exp_data.c b/math/aarch64/v_pow_exp_data.c new file mode 100644 index 000000000000..db615ce94bd7 --- /dev/null +++ b/math/aarch64/v_pow_exp_data.c @@ -0,0 +1,289 @@ +/* + * Shared data between exp, exp2 and pow. + * + * Copyright (c) 2018-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_POW_EXP_TABLE_BITS) + +const struct v_pow_exp_data __v_pow_exp_data = { +// exp polynomial coefficients. +.poly = { +// abs error: 1.43*2^-58 +// ulp error: 0.549 (0.550 without fma) +// if |x| < ln2/512 +0x1.fffffffffffd4p-2, +0x1.5555571d6ef9p-3, +0x1.5555576a5adcep-5, +}, +// N/ln2 +.n_over_ln2 = 0x1.71547652b82fep0 * N, +// ln2/N +.ln2_over_n_hi = 0x1.62e42fefc0000p-9, +.ln2_over_n_lo = -0x1.c610ca86c3899p-45, +// Used for rounding to nearest integer without using intrinsics. 
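// A sketch of the trick, assuming the default round-to-nearest mode and
// |z| well below 2^51 (asuint64 is the bit-cast helper in math_config.h):
//
//   double kd = z + 0x1.8p52;       // z = x * N/ln2
//   uint64_t ki = asuint64 (kd);    // low bits now hold round(z)
//   kd -= 0x1.8p52;                 // kd == (double) round(z)
//
// The extra 0.5 * 2^52 in the constant keeps kd in the [2^52, 2^53)
// binade even for negative z, so the low bits stay a valid two's
// complement encoding of round(z).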
+.shift = 0x1.8p52, +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) +// sbits[k] = asuint64(H[k]) - (k << 52)/N +.sbits = { +0x3ff0000000000000, +0x3feffb1afa5abcbf, +0x3feff63da9fb3335, +0x3feff168143b0281, +0x3fefec9a3e778061, +0x3fefe7d42e11bbcc, +0x3fefe315e86e7f85, +0x3fefde5f72f654b1, +0x3fefd9b0d3158574, +0x3fefd50a0e3c1f89, +0x3fefd06b29ddf6de, +0x3fefcbd42b72a836, +0x3fefc74518759bc8, +0x3fefc2bdf66607e0, +0x3fefbe3ecac6f383, +0x3fefb9c79b1f3919, +0x3fefb5586cf9890f, +0x3fefb0f145e46c85, +0x3fefac922b7247f7, +0x3fefa83b23395dec, +0x3fefa3ec32d3d1a2, +0x3fef9fa55fdfa9c5, +0x3fef9b66affed31b, +0x3fef973028d7233e, +0x3fef9301d0125b51, +0x3fef8edbab5e2ab6, +0x3fef8abdc06c31cc, +0x3fef86a814f204ab, +0x3fef829aaea92de0, +0x3fef7e95934f312e, +0x3fef7a98c8a58e51, +0x3fef76a45471c3c2, +0x3fef72b83c7d517b, +0x3fef6ed48695bbc0, +0x3fef6af9388c8dea, +0x3fef672658375d2f, +0x3fef635beb6fcb75, +0x3fef5f99f8138a1c, +0x3fef5be084045cd4, +0x3fef582f95281c6b, +0x3fef54873168b9aa, +0x3fef50e75eb44027, +0x3fef4d5022fcd91d, +0x3fef49c18438ce4d, +0x3fef463b88628cd6, +0x3fef42be3578a819, +0x3fef3f49917ddc96, +0x3fef3bdda27912d1, +0x3fef387a6e756238, +0x3fef351ffb82140a, +0x3fef31ce4fb2a63f, +0x3fef2e85711ece75, +0x3fef2b4565e27cdd, +0x3fef280e341ddf29, +0x3fef24dfe1f56381, +0x3fef21ba7591bb70, +0x3fef1e9df51fdee1, +0x3fef1b8a66d10f13, +0x3fef187fd0dad990, +0x3fef157e39771b2f, +0x3fef1285a6e4030b, +0x3fef0f961f641589, +0x3fef0cafa93e2f56, +0x3fef09d24abd886b, +0x3fef06fe0a31b715, +0x3fef0432edeeb2fd, +0x3fef0170fc4cd831, +0x3feefeb83ba8ea32, +0x3feefc08b26416ff, +0x3feef96266e3fa2d, +0x3feef6c55f929ff1, +0x3feef431a2de883b, +0x3feef1a7373aa9cb, +0x3feeef26231e754a, +0x3feeecae6d05d866, +0x3feeea401b7140ef, +0x3feee7db34e59ff7, +0x3feee57fbfec6cf4, +0x3feee32dc313a8e5, +0x3feee0e544ede173, +0x3feedea64c123422, +0x3feedc70df1c5175, +0x3feeda4504ac801c, +0x3feed822c367a024, +0x3feed60a21f72e2a, +0x3feed3fb2709468a, +0x3feed1f5d950a897, +0x3feecffa3f84b9d4, +0x3feece086061892d, +0x3feecc2042a7d232, +0x3feeca41ed1d0057, +0x3feec86d668b3237, +0x3feec6a2b5c13cd0, +0x3feec4e1e192aed2, +0x3feec32af0d7d3de, +0x3feec17dea6db7d7, +0x3feebfdad5362a27, +0x3feebe41b817c114, +0x3feebcb299fddd0d, +0x3feebb2d81d8abff, +0x3feeb9b2769d2ca7, +0x3feeb8417f4531ee, +0x3feeb6daa2cf6642, +0x3feeb57de83f4eef, +0x3feeb42b569d4f82, +0x3feeb2e2f4f6ad27, +0x3feeb1a4ca5d920f, +0x3feeb070dde910d2, +0x3feeaf4736b527da, +0x3feeae27dbe2c4cf, +0x3feead12d497c7fd, +0x3feeac0827ff07cc, +0x3feeab07dd485429, +0x3feeaa11fba87a03, +0x3feea9268a5946b7, +0x3feea84590998b93, +0x3feea76f15ad2148, +0x3feea6a320dceb71, +0x3feea5e1b976dc09, +0x3feea52ae6cdf6f4, +0x3feea47eb03a5585, +0x3feea3dd1d1929fd, +0x3feea34634ccc320, +0x3feea2b9febc8fb7, +0x3feea23882552225, +0x3feea1c1c70833f6, +0x3feea155d44ca973, +0x3feea0f4b19e9538, +0x3feea09e667f3bcd, +0x3feea052fa75173e, +0x3feea012750bdabf, +0x3fee9fdcddd47645, +0x3fee9fb23c651a2f, +0x3fee9f9298593ae5, +0x3fee9f7df9519484, +0x3fee9f7466f42e87, +0x3fee9f75e8ec5f74, +0x3fee9f8286ead08a, +0x3fee9f9a48a58174, +0x3fee9fbd35d7cbfd, +0x3fee9feb564267c9, +0x3feea024b1ab6e09, +0x3feea0694fde5d3f, +0x3feea0b938ac1cf6, +0x3feea11473eb0187, +0x3feea17b0976cfdb, +0x3feea1ed0130c132, +0x3feea26a62ff86f0, +0x3feea2f336cf4e62, +0x3feea3878491c491, +0x3feea427543e1a12, +0x3feea4d2add106d9, +0x3feea589994cce13, +0x3feea64c1eb941f7, +0x3feea71a4623c7ad, +0x3feea7f4179f5b21, +0x3feea8d99b4492ed, +0x3feea9cad931a436, +0x3feeaac7d98a6699, +0x3feeabd0a478580f, +0x3feeace5422aa0db, +0x3feeae05bad61778, +0x3feeaf3216b5448c, 
+0x3feeb06a5e0866d9, +0x3feeb1ae99157736, +0x3feeb2fed0282c8a, +0x3feeb45b0b91ffc6, +0x3feeb5c353aa2fe2, +0x3feeb737b0cdc5e5, +0x3feeb8b82b5f98e5, +0x3feeba44cbc8520f, +0x3feebbdd9a7670b3, +0x3feebd829fde4e50, +0x3feebf33e47a22a2, +0x3feec0f170ca07ba, +0x3feec2bb4d53fe0d, +0x3feec49182a3f090, +0x3feec674194bb8d5, +0x3feec86319e32323, +0x3feeca5e8d07f29e, +0x3feecc667b5de565, +0x3feece7aed8eb8bb, +0x3feed09bec4a2d33, +0x3feed2c980460ad8, +0x3feed503b23e255d, +0x3feed74a8af46052, +0x3feed99e1330b358, +0x3feedbfe53c12e59, +0x3feede6b5579fdbf, +0x3feee0e521356eba, +0x3feee36bbfd3f37a, +0x3feee5ff3a3c2774, +0x3feee89f995ad3ad, +0x3feeeb4ce622f2ff, +0x3feeee07298db666, +0x3feef0ce6c9a8952, +0x3feef3a2b84f15fb, +0x3feef68415b749b1, +0x3feef9728de5593a, +0x3feefc6e29f1c52a, +0x3feeff76f2fb5e47, +0x3fef028cf22749e4, +0x3fef05b030a1064a, +0x3fef08e0b79a6f1f, +0x3fef0c1e904bc1d2, +0x3fef0f69c3f3a207, +0x3fef12c25bd71e09, +0x3fef16286141b33d, +0x3fef199bdd85529c, +0x3fef1d1cd9fa652c, +0x3fef20ab5fffd07a, +0x3fef244778fafb22, +0x3fef27f12e57d14b, +0x3fef2ba88988c933, +0x3fef2f6d9406e7b5, +0x3fef33405751c4db, +0x3fef3720dcef9069, +0x3fef3b0f2e6d1675, +0x3fef3f0b555dc3fa, +0x3fef43155b5bab74, +0x3fef472d4a07897c, +0x3fef4b532b08c968, +0x3fef4f87080d89f2, +0x3fef53c8eacaa1d6, +0x3fef5818dcfba487, +0x3fef5c76e862e6d3, +0x3fef60e316c98398, +0x3fef655d71ff6075, +0x3fef69e603db3285, +0x3fef6e7cd63a8315, +0x3fef7321f301b460, +0x3fef77d5641c0658, +0x3fef7c97337b9b5f, +0x3fef81676b197d17, +0x3fef864614f5a129, +0x3fef8b333b16ee12, +0x3fef902ee78b3ff6, +0x3fef953924676d76, +0x3fef9a51fbc74c83, +0x3fef9f7977cdb740, +0x3fefa4afa2a490da, +0x3fefa9f4867cca6e, +0x3fefaf482d8e67f1, +0x3fefb4aaa2188510, +0x3fefba1bee615a27, +0x3fefbf9c1cb6412a, +0x3fefc52b376bba97, +0x3fefcac948dd7274, +0x3fefd0765b6e4540, +0x3fefd632798844f8, +0x3fefdbfdad9cbe14, +0x3fefe1d802243c89, +0x3fefe7c1819e90d8, +0x3fefedba3692d514, +0x3feff3c22b8f71f1, +0x3feff9d96b2a23d9, +}, +}; diff --git a/math/aarch64/v_pow_log_data.c b/math/aarch64/v_pow_log_data.c new file mode 100644 index 000000000000..7df277f74e4f --- /dev/null +++ b/math/aarch64/v_pow_log_data.c @@ -0,0 +1,174 @@ +/* + * Data for the log part of pow. + * + * Copyright (c) 2018-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_POW_LOG_TABLE_BITS) + +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + + where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals + and z falls into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = round(0x1p43*log(c))/0x1p43 + tab[i].logctail = (double)(log(c) - logc) + + where c is chosen near the center of the subinterval such that 1/c has only + a few precision bits so z/c - 1 is exactly representible as double: + + 1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2 + + Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| < + 0x1p-97, the last few bits of logc are rounded away so k*ln2hi + logc has no + rounding error and the interval for z is selected such that near x == 1, + where log(x) + is tiny, large cancellation error is avoided in logc + poly(z/c - 1). */ +const struct v_pow_log_data __v_pow_log_data = { + /* relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. 
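   The logc/logctail split described in the header comment can be
   generated along these lines (sketch for one entry's invc; long double
   stands in for the higher-precision arithmetic, which in practice needs
   more than 80-bit precision to meet the stated 0x1p-97 bound, e.g. MPFR):

     long double lc = logl (1.0L / invc);                      // log(c)
     double logc = (double) (roundl (lc * 0x1p43L) / 0x1p43L); // 43-bit grid
     double logctail = (double) (lc - (long double) logc);     // low part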
*/ + .poly = { -0x1p-1, -0x1.555555555556p-1, 0x1.0000000000006p-1, + 0x1.999999959554ep-1, -0x1.555555529a47ap-1, -0x1.2495b9b4845e9p0, + 0x1.0002b8b263fc3p0, }, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .invc = { 0x1.6a00000000000p+0, 0x1.6800000000000p+0, 0x1.6600000000000p+0, + 0x1.6400000000000p+0, 0x1.6200000000000p+0, 0x1.6000000000000p+0, + 0x1.5e00000000000p+0, 0x1.5c00000000000p+0, 0x1.5a00000000000p+0, + 0x1.5800000000000p+0, 0x1.5600000000000p+0, 0x1.5600000000000p+0, + 0x1.5400000000000p+0, 0x1.5200000000000p+0, 0x1.5000000000000p+0, + 0x1.4e00000000000p+0, 0x1.4c00000000000p+0, 0x1.4a00000000000p+0, + 0x1.4a00000000000p+0, 0x1.4800000000000p+0, 0x1.4600000000000p+0, + 0x1.4400000000000p+0, 0x1.4200000000000p+0, 0x1.4000000000000p+0, + 0x1.4000000000000p+0, 0x1.3e00000000000p+0, 0x1.3c00000000000p+0, + 0x1.3a00000000000p+0, 0x1.3a00000000000p+0, 0x1.3800000000000p+0, + 0x1.3600000000000p+0, 0x1.3400000000000p+0, 0x1.3400000000000p+0, + 0x1.3200000000000p+0, 0x1.3000000000000p+0, 0x1.3000000000000p+0, + 0x1.2e00000000000p+0, 0x1.2c00000000000p+0, 0x1.2c00000000000p+0, + 0x1.2a00000000000p+0, 0x1.2800000000000p+0, 0x1.2600000000000p+0, + 0x1.2600000000000p+0, 0x1.2400000000000p+0, 0x1.2400000000000p+0, + 0x1.2200000000000p+0, 0x1.2000000000000p+0, 0x1.2000000000000p+0, + 0x1.1e00000000000p+0, 0x1.1c00000000000p+0, 0x1.1c00000000000p+0, + 0x1.1a00000000000p+0, 0x1.1a00000000000p+0, 0x1.1800000000000p+0, + 0x1.1600000000000p+0, 0x1.1600000000000p+0, 0x1.1400000000000p+0, + 0x1.1400000000000p+0, 0x1.1200000000000p+0, 0x1.1000000000000p+0, + 0x1.1000000000000p+0, 0x1.0e00000000000p+0, 0x1.0e00000000000p+0, + 0x1.0c00000000000p+0, 0x1.0c00000000000p+0, 0x1.0a00000000000p+0, + 0x1.0a00000000000p+0, 0x1.0800000000000p+0, 0x1.0800000000000p+0, + 0x1.0600000000000p+0, 0x1.0400000000000p+0, 0x1.0400000000000p+0, + 0x1.0200000000000p+0, 0x1.0200000000000p+0, 0x1.0000000000000p+0, + 0x1.0000000000000p+0, 0x1.fc00000000000p-1, 0x1.f800000000000p-1, + 0x1.f400000000000p-1, 0x1.f000000000000p-1, 0x1.ec00000000000p-1, + 0x1.e800000000000p-1, 0x1.e400000000000p-1, 0x1.e200000000000p-1, + 0x1.de00000000000p-1, 0x1.da00000000000p-1, 0x1.d600000000000p-1, + 0x1.d400000000000p-1, 0x1.d000000000000p-1, 0x1.cc00000000000p-1, + 0x1.ca00000000000p-1, 0x1.c600000000000p-1, 0x1.c400000000000p-1, + 0x1.c000000000000p-1, 0x1.be00000000000p-1, 0x1.ba00000000000p-1, + 0x1.b800000000000p-1, 0x1.b400000000000p-1, 0x1.b200000000000p-1, + 0x1.ae00000000000p-1, 0x1.ac00000000000p-1, 0x1.aa00000000000p-1, + 0x1.a600000000000p-1, 0x1.a400000000000p-1, 0x1.a000000000000p-1, + 0x1.9e00000000000p-1, 0x1.9c00000000000p-1, 0x1.9a00000000000p-1, + 0x1.9600000000000p-1, 0x1.9400000000000p-1, 0x1.9200000000000p-1, + 0x1.9000000000000p-1, 0x1.8c00000000000p-1, 0x1.8a00000000000p-1, + 0x1.8800000000000p-1, 0x1.8600000000000p-1, 0x1.8400000000000p-1, + 0x1.8200000000000p-1, 0x1.7e00000000000p-1, 0x1.7c00000000000p-1, + 0x1.7a00000000000p-1, 0x1.7800000000000p-1, 0x1.7600000000000p-1, + 0x1.7400000000000p-1, 0x1.7200000000000p-1, 0x1.7000000000000p-1, + 0x1.6e00000000000p-1, 0x1.6c00000000000p-1, }, + .logc + = { -0x1.62c82f2b9c800p-2, -0x1.5d1bdbf580800p-2, -0x1.5767717455800p-2, + -0x1.51aad872df800p-2, -0x1.4be5f95777800p-2, -0x1.4618bc21c6000p-2, + -0x1.404308686a800p-2, -0x1.3a64c55694800p-2, -0x1.347dd9a988000p-2, + -0x1.2e8e2bae12000p-2, -0x1.2895a13de8800p-2, -0x1.2895a13de8800p-2, + -0x1.22941fbcf7800p-2, -0x1.1c898c1699800p-2, -0x1.1675cababa800p-2, + -0x1.1058bf9ae4800p-2, 
-0x1.0a324e2739000p-2, -0x1.0402594b4d000p-2, + -0x1.0402594b4d000p-2, -0x1.fb9186d5e4000p-3, -0x1.ef0adcbdc6000p-3, + -0x1.e27076e2af000p-3, -0x1.d5c216b4fc000p-3, -0x1.c8ff7c79aa000p-3, + -0x1.c8ff7c79aa000p-3, -0x1.bc286742d9000p-3, -0x1.af3c94e80c000p-3, + -0x1.a23bc1fe2b000p-3, -0x1.a23bc1fe2b000p-3, -0x1.9525a9cf45000p-3, + -0x1.87fa06520d000p-3, -0x1.7ab890210e000p-3, -0x1.7ab890210e000p-3, + -0x1.6d60fe719d000p-3, -0x1.5ff3070a79000p-3, -0x1.5ff3070a79000p-3, + -0x1.526e5e3a1b000p-3, -0x1.44d2b6ccb8000p-3, -0x1.44d2b6ccb8000p-3, + -0x1.371fc201e9000p-3, -0x1.29552f81ff000p-3, -0x1.1b72ad52f6000p-3, + -0x1.1b72ad52f6000p-3, -0x1.0d77e7cd09000p-3, -0x1.0d77e7cd09000p-3, + -0x1.fec9131dbe000p-4, -0x1.e27076e2b0000p-4, -0x1.e27076e2b0000p-4, + -0x1.c5e548f5bc000p-4, -0x1.a926d3a4ae000p-4, -0x1.a926d3a4ae000p-4, + -0x1.8c345d631a000p-4, -0x1.8c345d631a000p-4, -0x1.6f0d28ae56000p-4, + -0x1.51b073f062000p-4, -0x1.51b073f062000p-4, -0x1.341d7961be000p-4, + -0x1.341d7961be000p-4, -0x1.16536eea38000p-4, -0x1.f0a30c0118000p-5, + -0x1.f0a30c0118000p-5, -0x1.b42dd71198000p-5, -0x1.b42dd71198000p-5, + -0x1.77458f632c000p-5, -0x1.77458f632c000p-5, -0x1.39e87b9fec000p-5, + -0x1.39e87b9fec000p-5, -0x1.f829b0e780000p-6, -0x1.f829b0e780000p-6, + -0x1.7b91b07d58000p-6, -0x1.fc0a8b0fc0000p-7, -0x1.fc0a8b0fc0000p-7, + -0x1.fe02a6b100000p-8, -0x1.fe02a6b100000p-8, 0x0.0000000000000p+0, + 0x0.0000000000000p+0, 0x1.0101575890000p-7, 0x1.0205658938000p-6, + 0x1.8492528c90000p-6, 0x1.0415d89e74000p-5, 0x1.466aed42e0000p-5, + 0x1.894aa149fc000p-5, 0x1.ccb73cdddc000p-5, 0x1.eea31c006c000p-5, + 0x1.1973bd1466000p-4, 0x1.3bdf5a7d1e000p-4, 0x1.5e95a4d97a000p-4, + 0x1.700d30aeac000p-4, 0x1.9335e5d594000p-4, 0x1.b6ac88dad6000p-4, + 0x1.c885801bc4000p-4, 0x1.ec739830a2000p-4, 0x1.fe89139dbe000p-4, + 0x1.1178e8227e000p-3, 0x1.1aa2b7e23f000p-3, 0x1.2d1610c868000p-3, + 0x1.365fcb0159000p-3, 0x1.4913d8333b000p-3, 0x1.527e5e4a1b000p-3, + 0x1.6574ebe8c1000p-3, 0x1.6f0128b757000p-3, 0x1.7898d85445000p-3, + 0x1.8beafeb390000p-3, 0x1.95a5adcf70000p-3, 0x1.a93ed3c8ae000p-3, + 0x1.b31d8575bd000p-3, 0x1.bd087383be000p-3, 0x1.c6ffbc6f01000p-3, + 0x1.db13db0d49000p-3, 0x1.e530effe71000p-3, 0x1.ef5ade4dd0000p-3, + 0x1.f991c6cb3b000p-3, 0x1.07138604d5800p-2, 0x1.0c42d67616000p-2, + 0x1.1178e8227e800p-2, 0x1.16b5ccbacf800p-2, 0x1.1bf99635a6800p-2, + 0x1.214456d0eb800p-2, 0x1.2bef07cdc9000p-2, 0x1.314f1e1d36000p-2, + 0x1.36b6776be1000p-2, 0x1.3c25277333000p-2, 0x1.419b423d5e800p-2, + 0x1.4718dc271c800p-2, 0x1.4c9e09e173000p-2, 0x1.522ae0738a000p-2, + 0x1.57bf753c8d000p-2, 0x1.5d5bddf596000p-2, }, + .logctail + = { 0x1.ab42428375680p-48, -0x1.ca508d8e0f720p-46, -0x1.362a4d5b6506dp-45, + -0x1.684e49eb067d5p-49, -0x1.41b6993293ee0p-47, 0x1.3d82f484c84ccp-46, + 0x1.c42f3ed820b3ap-50, 0x1.0b1c686519460p-45, 0x1.5594dd4c58092p-45, + 0x1.67b1e99b72bd8p-45, 0x1.5ca14b6cfb03fp-46, 0x1.5ca14b6cfb03fp-46, + -0x1.65a242853da76p-46, -0x1.fafbc68e75404p-46, 0x1.f1fc63382a8f0p-46, + -0x1.6a8c4fd055a66p-45, -0x1.c6bee7ef4030ep-47, -0x1.036b89ef42d7fp-48, + -0x1.036b89ef42d7fp-48, 0x1.d572aab993c87p-47, 0x1.b26b79c86af24p-45, + -0x1.72f4f543fff10p-46, 0x1.1ba91bbca681bp-45, 0x1.7794f689f8434p-45, + 0x1.7794f689f8434p-45, 0x1.94eb0318bb78fp-46, 0x1.a4e633fcd9066p-52, + -0x1.58c64dc46c1eap-45, -0x1.58c64dc46c1eap-45, -0x1.ad1d904c1d4e3p-45, + 0x1.bbdbf7fdbfa09p-45, 0x1.bdb9072534a58p-45, 0x1.bdb9072534a58p-45, + -0x1.0e46aa3b2e266p-46, -0x1.e9e439f105039p-46, -0x1.e9e439f105039p-46, + -0x1.0de8b90075b8fp-45, 0x1.70cc16135783cp-46, 
0x1.70cc16135783cp-46, + 0x1.178864d27543ap-48, -0x1.48d301771c408p-45, -0x1.e80a41811a396p-45, + -0x1.e80a41811a396p-45, 0x1.a699688e85bf4p-47, 0x1.a699688e85bf4p-47, + -0x1.575545ca333f2p-45, 0x1.a342c2af0003cp-45, 0x1.a342c2af0003cp-45, + -0x1.d0c57585fbe06p-46, 0x1.53935e85baac8p-45, 0x1.53935e85baac8p-45, + 0x1.37c294d2f5668p-46, 0x1.37c294d2f5668p-46, -0x1.69737c93373dap-45, + 0x1.f025b61c65e57p-46, 0x1.f025b61c65e57p-46, 0x1.c5edaccf913dfp-45, + 0x1.c5edaccf913dfp-45, 0x1.47c5e768fa309p-46, 0x1.d599e83368e91p-45, + 0x1.d599e83368e91p-45, 0x1.c827ae5d6704cp-46, 0x1.c827ae5d6704cp-46, + -0x1.cfc4634f2a1eep-45, -0x1.cfc4634f2a1eep-45, 0x1.502b7f526feaap-48, + 0x1.502b7f526feaap-48, -0x1.980267c7e09e4p-45, -0x1.980267c7e09e4p-45, + -0x1.88d5493faa639p-45, -0x1.f1e7cf6d3a69cp-50, -0x1.f1e7cf6d3a69cp-50, + -0x1.9e23f0dda40e4p-46, -0x1.9e23f0dda40e4p-46, 0x0.0000000000000p+0, + 0x0.0000000000000p+0, -0x1.0c76b999d2be8p-46, -0x1.3dc5b06e2f7d2p-45, + -0x1.aa0ba325a0c34p-45, 0x1.111c05cf1d753p-47, -0x1.c167375bdfd28p-45, + -0x1.97995d05a267dp-46, -0x1.a68f247d82807p-46, -0x1.e113e4fc93b7bp-47, + -0x1.5325d560d9e9bp-45, 0x1.cc85ea5db4ed7p-45, -0x1.c69063c5d1d1ep-45, + 0x1.c1e8da99ded32p-49, 0x1.3115c3abd47dap-45, -0x1.390802bf768e5p-46, + 0x1.646d1c65aacd3p-45, -0x1.dc068afe645e0p-45, -0x1.534d64fa10afdp-45, + 0x1.1ef78ce2d07f2p-45, 0x1.ca78e44389934p-45, 0x1.39d6ccb81b4a1p-47, + 0x1.62fa8234b7289p-51, 0x1.5837954fdb678p-45, 0x1.633e8e5697dc7p-45, + 0x1.9cf8b2c3c2e78p-46, -0x1.5118de59c21e1p-45, -0x1.c661070914305p-46, + -0x1.73d54aae92cd1p-47, 0x1.7f22858a0ff6fp-47, -0x1.8724350562169p-45, + -0x1.c358d4eace1aap-47, -0x1.d4bc4595412b6p-45, -0x1.1ec72c5962bd2p-48, + -0x1.aff2af715b035p-45, 0x1.212276041f430p-51, -0x1.a211565bb8e11p-51, + 0x1.bcbecca0cdf30p-46, 0x1.89cdb16ed4e91p-48, 0x1.7188b163ceae9p-45, + -0x1.c210e63a5f01cp-45, 0x1.b9acdf7a51681p-45, 0x1.ca6ed5147bdb7p-45, + 0x1.a87deba46baeap-47, 0x1.a9cfa4a5004f4p-45, -0x1.8e27ad3213cb8p-45, + 0x1.16ecdb0f177c8p-46, 0x1.83b54b606bd5cp-46, 0x1.8e436ec90e09dp-47, + -0x1.f27ce0967d675p-45, -0x1.e20891b0ad8a4p-45, 0x1.ebe708164c759p-45, + 0x1.fadedee5d40efp-46, -0x1.a0b2a08a465dcp-47, }, +}; diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c deleted file mode 100644 index 3a4163ab0558..000000000000 --- a/math/aarch64/v_powf.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Single-precision vector powf function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" - -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Thresh v_u32 (0x7f000000) /* Max - Min. */ -#define MantissaMask v_u32 (0x007fffff) - -#define A data.log2_poly -#define C data.exp2f_poly - -/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
*/ -#define Off v_u32 (0x3f35d000) - -#define V_POWF_LOG2_TABLE_BITS 5 -#define V_EXP2F_TABLE_BITS 5 -#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) -#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) - -static const struct -{ - struct - { - double invc, logc; - } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; - double log2_poly[4]; - uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; - double exp2f_poly[3]; -} data = { - .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, - {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, - {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, - {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, - {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, - {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, - {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, - {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, - {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, - {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, - {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, - {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, - {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, - {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, - {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, - {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, - {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, - {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, - {0x1p+0, 0x0p+0 * Scale}, - {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, - {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, - {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, - {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, - {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, - {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, - {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, - {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, - {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, - {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, - {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, - {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, - {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, - .log2_poly = { /* rel err: 1.5 * 2^-30. */ - -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, - -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, - .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, - 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, - 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, - 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, - 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, - 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, - 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, - 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, - 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, - 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, - 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, - .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
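   The / Scale powers fold a change of variable into the coefficients:
   ylogx is tracked in units of 1/N (N = 32), so the polynomial actually
   approximates 2^(r/N) = 1 + (ln2/N) r + ((ln2/N)^2 / 2) r^2 + ..., i.e.
   the degree-k coefficient of the plain 2^r polynomial gets divided by
   N^k.  (Check: the last entry times Scale is 0x1.62e42ffp-1 ~= ln2.)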
*/ - 0x1.c6af84b912394p-5 / Scale / Scale / Scale, - 0x1.ebfce50fac4f3p-3 / Scale / Scale, - 0x1.62e42ff0c52d6p-1 / Scale}}; - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) -{ - return v_call2_f32 (powf, x, y, ret, cmp); -} - -float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) -{ - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); - uint32x4_t tmp = vsubq_u32 (u, Off); - uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), - Log2IdxMask); - uint32x4_t top = vbicq_u32 (tmp, MantissaMask); - uint32x4_t iz = vsubq_u32 (u, top); - int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), - 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ - - float32x4_t ret; - for (int lane = 0; lane < 4; lane++) - { - /* Use double precision for each lane. */ - double invc = data.log2_tab[i[lane]].invc; - double logc = data.log2_tab[i[lane]].logc; - double z = (double) asfloat (iz[lane]); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ - double r = __builtin_fma (z, invc, -1.0); - double y0 = logc + (double) k[lane]; - - /* Polynomial to approximate log1p(r)/ln2. */ - double logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + y0; - double ylogx = y[lane] * logx; - cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) - >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 - ? 1 - : cmp[lane]; - - /* N*x = k + r with r in [-1/2, 1/2]. */ - double kd = round (ylogx); - uint64_t ki = lround (ylogx); - r = ylogx - kd; - - /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ - uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; - t += ki << (52 - V_EXP2F_TABLE_BITS); - double s = asdouble (t); - double p = C[0]; - p = __builtin_fma (p, r, C[1]); - p = __builtin_fma (p, r, C[2]); - p = __builtin_fma (p, s * r, s); - - ret[lane] = p; - } - if (unlikely (v_any_u32 (cmp))) - return special_case (x, y, ret, cmp); - return ret; -} diff --git a/math/aarch64/v_powf_data.c b/math/aarch64/v_powf_data.c new file mode 100644 index 000000000000..5cf1b8769414 --- /dev/null +++ b/math/aarch64/v_powf_data.c @@ -0,0 +1,89 @@ +/* + * Coefficients for single-precision SVE pow(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct v_powf_data __v_powf_data = { + .invc = { 0x1.6489890582816p+0, + 0x1.5cf19b35e3472p+0, + 0x1.55aac0e956d65p+0, + 0x1.4eb0022977e01p+0, + 0x1.47fcccda1dd1fp+0, + 0x1.418ceabab68c1p+0, + 0x1.3b5c788f1edb3p+0, + 0x1.3567de48e9c9ap+0, + 0x1.2fabc80fd19bap+0, + 0x1.2a25200ce536bp+0, + 0x1.24d108e0152e3p+0, + 0x1.1facd8ab2fbe1p+0, + 0x1.1ab614a03efdfp+0, + 0x1.15ea6d03af9ffp+0, + 0x1.1147b994bb776p+0, + 0x1.0ccbf650593aap+0, + 0x1.0875408477302p+0, + 0x1.0441d42a93328p+0, + 0x1p+0, + 0x1.f1d006c855e86p-1, + 0x1.e28c3341aa301p-1, + 0x1.d4bdf9aa64747p-1, + 0x1.c7b45a24e5803p-1, + 0x1.bb5f5eb2ed60ap-1, + 0x1.afb0bff8fe6b4p-1, + 0x1.a49badf7ab1f5p-1, + 0x1.9a14a111fc4c9p-1, + 0x1.901131f5b2fdcp-1, + 0x1.8687f73f6d865p-1, + 0x1.7d7067eb77986p-1, + 0x1.74c2c1cf97b65p-1, + 0x1.6c77f37cff2a1p-1 + }, + .logc = { -0x1.e960f97b22702p+3, + -0x1.c993406cd4db6p+3, + -0x1.aa711d9a7d0f3p+3, + -0x1.8bf37bacdce9bp+3, + -0x1.6e13b3519946ep+3, + -0x1.50cb8281e4089p+3, + -0x1.341504a237e2bp+3, + -0x1.17eaab624ffbbp+3, + -0x1.f88e708f8c853p+2, + -0x1.c24b6da113914p+2, + -0x1.8d02ee397cb1dp+2, + -0x1.58ac1223408b3p+2, + -0x1.253e6fd190e89p+2, + -0x1.e5641882c12ffp+1, + -0x1.81fea712926f7p+1, + -0x1.203e240de64a3p+1, + -0x1.8029b86a78281p0, + -0x1.85d713190fb9p-1, + 0x0p+0, + 0x1.4c1cc07312997p0, + 0x1.5e1848ccec948p+1, + 0x1.04cfcb7f1196fp+2, + 0x1.582813d463c21p+2, + 0x1.a936fa68760ccp+2, + 0x1.f81bc31d6cc4ep+2, + 0x1.2279a09fae6b1p+3, + 0x1.47ec0b6df5526p+3, + 0x1.6c71762280f1p+3, + 0x1.90155070798dap+3, + 0x1.b2e23b1d3068cp+3, + 0x1.d4e21b0daa86ap+3, + 0x1.f61e2a2f67f3fp+3 + }, + .scale = { 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, +};
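
Editor's note on the hunks above: the deleted v_powf.c computed powf(x, y) as 2^(y*log2(x)), one lane at a time in double precision, with table-driven log2 and exp2 steps; the new v_powf_data.c carries the same tables in a shared layout. The correspondence is direct: .invc matches the old log2_tab invc column, .logc stores the old logc column with the 2^V_EXP2F_TABLE_BITS scale already folded in (e.g. -0x1.e960f97b22702p-2 * 32 = -0x1.e960f97b22702p+3), and .scale is the old exp2f_tab verbatim. The sketch below is a minimal scalar model of one lane of the deleted loop, rewritten against the new table layout, for illustration only. It is not library code: powf_scalar_model and the as_uint/as_float/as_double helpers are hypothetical names, the extern declaration mirrors the struct that math_config.h defines in-tree, and the two early returns stand in for the vector routine's special-case dispatch to scalar powf.

#include <math.h>
#include <stdint.h>
#include <string.h>

#define LOG2_TABLE_BITS 5
#define EXP2F_TABLE_BITS 5
#define SCALE ((double) (1 << EXP2F_TABLE_BITS))
#define OFF 0x3f35d000u

/* Table layout of v_powf_data.c above; in-tree this struct comes from
   math_config.h, it is repeated here so the sketch stands alone.  */
extern const struct v_powf_data
{
  double invc[1 << LOG2_TABLE_BITS];
  double logc[1 << LOG2_TABLE_BITS];
  uint64_t scale[1 << EXP2F_TABLE_BITS];
} __v_powf_data;

static inline uint32_t as_uint (float f) { uint32_t u; memcpy (&u, &f, sizeof u); return u; }
static inline float as_float (uint32_t u) { float f; memcpy (&f, &u, sizeof f); return f; }
static inline double as_double (uint64_t u) { double d; memcpy (&d, &u, sizeof d); return d; }

/* Hypothetical scalar model of one lane of the deleted vector loop.  */
static float
powf_scalar_model (float x, float y)
{
  uint32_t u = as_uint (x);

  /* Zero, subnormal, negative, inf and nan all fail this single unsigned
     range check; the vector routine sends such lanes to scalar powf.  */
  if (u - 0x00800000u >= 0x7f000000u)
    return powf (x, y);

  /* Split x = 2^k * z/c, with z close to c from the reciprocal table.  */
  uint32_t tmp = u - OFF;
  uint32_t i = (tmp >> (23 - LOG2_TABLE_BITS)) & ((1u << LOG2_TABLE_BITS) - 1);
  uint32_t top = tmp & 0xff800000u;
  double z = (double) as_float (u - top);
  /* Arithmetic shift: k already carries the 2^EXP2F_TABLE_BITS scale.  */
  int32_t k = (int32_t) top >> (23 - EXP2F_TABLE_BITS);

  /* log2(x) = log1p(z/c - 1)/ln2 + log2(c) + k, everything scaled by 2^5;
     logc in the new table is stored pre-multiplied by 2^5.  */
  double r = fma (z, __v_powf_data.invc[i], -1.0);
  double y0 = __v_powf_data.logc[i] + (double) k;

  /* Degree-3 polynomial for log1p(r)/ln2, coefficients from the deleted
     v_powf.c (kept pre-scaled there as well).  */
  double logx = -0x1.6ff5daa3b3d7cp-2 * SCALE;
  logx = r * logx + 0x1.ec81d03c01aebp-2 * SCALE;
  logx = r * logx - 0x1.71547bb43f101p-1 * SCALE;
  logx = r * logx + 0x1.7154764a815cbp+0 * SCALE;
  logx = r * logx + y0;

  double ylogx = (double) y * logx;

  /* |y*log2(x)| >= 126 would over/underflow the float result (and nan
     falls through the negated compare): defer to scalar powf, standing in
     for the bit test on ylogx in the deleted loop.  */
  if (!(fabs (ylogx) < 126.0 * SCALE))
    return powf (x, y);

  /* exp2: write ylogx = ki + r2 with r2 in [-1/2, 1/2]; 2^(ki/2^5) comes
     from the scale table, with the integer part folded into the exponent
     via unsigned wraparound, exactly as in the deleted code.  */
  double kd = round (ylogx);
  uint64_t ki = (uint64_t) llround (ylogx);
  double r2 = ylogx - kd;

  uint64_t t = __v_powf_data.scale[ki % (1 << EXP2F_TABLE_BITS)];
  t += ki << (52 - EXP2F_TABLE_BITS);
  double s = as_double (t);

  /* 2^r2 ~ 1 + r2*(C2 + r2*(C1 + r2*C0)), multiplied through by s.  */
  double p = 0x1.c6af84b912394p-5 / (SCALE * SCALE * SCALE);
  p = fma (p, r2, 0x1.ebfce50fac4f3p-3 / (SCALE * SCALE));
  p = fma (p, r2, 0x1.62e42ff0c52d6p-1 / SCALE);
  p = fma (p, s * r2, s);

  return (float) p;
}

Keeping the 2^5 scale inside logc, k and the log2 polynomial is the design point: it makes ylogx directly N*y*log2(x), so the exp2 step recovers the table index and the reduced argument with a single round, instead of an extra multiply per lane.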