From: Dylan Fleming
Date: Tue, 18 Nov 2025 15:34:41 +0000 (+0000)
Subject: AArch64: Remove WANT_SIMD_EXCEPT from aarch64 AdvSIMD math routines
X-Git-Url: https://www.sourceware.org/git/?a=commitdiff_plain;h=fd1d642ef80d81fdd4a22c628183a0bdc0a0c23e;p=glibc.git

AArch64: Remove WANT_SIMD_EXCEPT from aarch64 AdvSIMD math routines

Remove the legacy code that supported WANT_SIMD_EXCEPT, a deprecated
Arm Optimised Routines feature for raising SIMD floating-point
exceptions.

Reviewed-by: Adhemerval Zanella
---

diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c
index 453f780314..7ccde9c81c 100644
--- a/sysdeps/aarch64/fpu/acos_advsimd.c
+++ b/sysdeps/aarch64/fpu/acos_advsimd.c
@@ -38,26 +38,9 @@ static const struct data
   .abs_mask = V2 (0x7fffffffffffffff),
 };
 
-#define AllMask v_u64 (0xffffffffffffffff)
-#define Oneu 0x3ff0000000000000
-#define Small 0x3e50000000000000 /* 2^-53.  */
-
-#if WANT_SIMD_EXCEPT
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
-{
-  return v_call_f64 (acos, x, y, special);
-}
-#endif
-
 /* Double-precision implementation of vector acos(x).
-   For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct rounding.
-   If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following approximation.
-
-   For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+   For |x| in [0, 0.5], use an order 11 polynomial P such that the final
    approximation of asin is an odd polynomial:
 
      acos(x) ~ pi/2 - (x + x^3 P(x^2)).
 
@@ -78,16 +61,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
   const struct data *d = ptr_barrier (&data);
 
   float64x2_t ax = vabsq_f64 (x);
-
-#if WANT_SIMD_EXCEPT
-  /* A single comparison for One, Small and QNaN.  */
-  uint64x2_t special
-      = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
-                   v_u64 (Oneu - Small));
-  if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x, x, AllMask);
-#endif
-
   uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5));
 
   /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
@@ -129,7 +102,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
        = 2 Q(|x|)      , for  0.5 < x < 1.0
       = pi - 2 Q(|x|) , for -1.0 < x < -0.5.  */
   float64x2_t y = vbslq_f64 (d->abs_mask, p, x);
-
   uint64x2_t is_neg = vcltzq_f64 (x);
   float64x2_t off = vreinterpretq_f64_u64 (
       vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi)));
diff --git a/sysdeps/aarch64/fpu/acosf_advsimd.c b/sysdeps/aarch64/fpu/acosf_advsimd.c
index 4bdccf689e..a2183dae80 100644
--- a/sysdeps/aarch64/fpu/acosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acosf_advsimd.c
@@ -35,25 +35,10 @@ static const struct data
 
 #define AbsMask 0x7fffffff
 #define Half 0x3f000000
-#define One 0x3f800000
-#define Small 0x32800000 /* 2^-26.  */
-
-#if WANT_SIMD_EXCEPT
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
-{
-  return v_call_f32 (acosf, x, y, special);
-}
-#endif
 
 /* Single-precision implementation of vector acos(x).
-   For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct rounding.
-   If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following approximation.
-
-   For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+   For |x| in [0, 0.5], use order 4 polynomial P such that the final
    approximation of asin is an odd polynomial:
 
     acos(x) ~ pi/2 - (x + x^3 P(x^2)).
 
@@ -67,7 +52,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) The largest observed error in this region is 1.32 ulps, _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 - want 0x1.feb32ep-1. */ + want 0x1.feb32ep-1. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -75,14 +60,6 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x) uint32x4_t ix = vreinterpretq_u32_f32 (x); uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); -#if WANT_SIMD_EXCEPT - /* A single comparison for One, Small and QNaN. */ - uint32x4_t special - = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); - if (__glibc_unlikely (v_any_u32 (special))) - return special_case (x, x, v_u32 (0xffffffff)); -#endif - float32x4_t ax = vreinterpretq_f32_u32 (ia); uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half)); diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c index d7dab02766..854a172bc6 100644 --- a/sysdeps/aarch64/fpu/acosh_advsimd.c +++ b/sysdeps/aarch64/fpu/acosh_advsimd.c @@ -47,12 +47,6 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) const struct data *d = ptr_barrier (&data); uint64x2_t special = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh); - float64x2_t special_arg = x; - -#if WANT_SIMD_EXCEPT - if (__glibc_unlikely (v_any_u64 (special))) - x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); -#endif float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0)); float64x2_t y = vaddq_f64 (x, v_f64 (1.0)); @@ -61,6 +55,6 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) y = vaddq_f64 (xm1, y); if (__glibc_unlikely (v_any_u64 (special))) - return special_case (special_arg, y, special, &d->log1p_consts); + return special_case (x, y, special, &d->log1p_consts); return log1p_inline (y, &d->log1p_consts); } diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c index 61d41d156c..7c3f590723 100644 --- a/sysdeps/aarch64/fpu/acoshf_advsimd.c +++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c @@ -36,13 +36,8 @@ special_case (float32x4_t x, float32x4_t y, uint16x4_t special, return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); } -/* Vector approximation for single-precision acosh, based on log1p. Maximum - error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it - is 3.00 ULP: - _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4 - want 0x1.ef0a7cp-4. - With exceptions disabled, we can compute u with a shorter dependency chain, - which gives maximum error of 3.22 ULP: +/* Vector approximation for single-precision acosh, based on log1p. + The largest observed error is 3.22 ULP: _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5 want 0x1.fdcdd2p-5. */ @@ -52,19 +47,9 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x) uint32x4_t ix = vreinterpretq_u32_f32 (x); uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh); -#if WANT_SIMD_EXCEPT - /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use - only xm1 to calculate u, as operating on x will trigger invalid for NaN. - Widening sign-extend special predicate in order to mask with it. 
*/ - uint32x4_t p - = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special))); - float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); - float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); -#else float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one)); float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one))); -#endif float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c index f74141c845..a0ede9011c 100644 --- a/sysdeps/aarch64/fpu/asin_advsimd.c +++ b/sysdeps/aarch64/fpu/asin_advsimd.c @@ -37,25 +37,8 @@ static const struct data .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), }; -#define AllMask v_u64 (0xffffffffffffffff) -#define One 0x3ff0000000000000 -#define Small 0x3e50000000000000 /* 2^-12. */ - -#if WANT_SIMD_EXCEPT -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) -{ - return v_call_f64 (asin, x, y, special); -} -#endif - /* Double-precision implementation of vector asin(x). - - For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct - rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the - following approximation. - - For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + For |x| in [0, 0.5], use an order 11 polynomial P such that the final approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). The largest observed error in this region is 1.01 ulps, @@ -72,19 +55,8 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - float64x2_t ax = vabsq_f64 (x); -#if WANT_SIMD_EXCEPT - /* Special values need to be computed with scalar fallbacks so - that appropriate exceptions are raised. */ - uint64x2_t special - = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), - v_u64 (One - Small)); - if (__glibc_unlikely (v_any_u64 (special))) - return special_case (x, x, AllMask); -#endif - uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); /* Evaluate polynomial Q(x) = y + y * z * P(z) with diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c index 013936c2c0..abadca1f19 100644 --- a/sysdeps/aarch64/fpu/asinf_advsimd.c +++ b/sysdeps/aarch64/fpu/asinf_advsimd.c @@ -33,21 +33,10 @@ static const struct data }; #define AbsMask 0x7fffffff -#define One 0x3f800000 -#define Small 0x39800000 /* 2^-12. */ - -#if WANT_SIMD_EXCEPT -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) -{ - return v_call_f32 (asinf, x, y, special); -} -#endif /* Single-precision implementation of vector asin(x). - - For |x| <0.5, use order 4 polynomial P such that the final + For |x| in [0, 0.5), use order 4 polynomial P such that the final approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). The largest observed error in this region is 0.83 ulps, @@ -65,16 +54,6 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) uint32x4_t ix = vreinterpretq_u32_f32 (x); uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); - -#if WANT_SIMD_EXCEPT - /* Special values need to be computed with scalar fallbacks so - that appropriate fp exceptions are raised. 
*/ - uint32x4_t special - = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); - if (__glibc_unlikely (v_any_u32 (special))) - return special_case (x, x, v_u32 (0xffffffff)); -#endif - float32x4_t ax = vreinterpretq_f32_u32 (ia); uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f)); diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c index 1afc072fc3..5b0f2ac05f 100644 --- a/sysdeps/aarch64/fpu/asinh_advsimd.c +++ b/sysdeps/aarch64/fpu/asinh_advsimd.c @@ -23,54 +23,29 @@ const static struct data { uint64x2_t huge_bound, abs_mask, off, mask; -#if WANT_SIMD_EXCEPT - float64x2_t tiny_bound; -#endif float64x2_t lc0, lc2; double lc1, lc3, ln2, lc4; - float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17; double c1, c3, c5, c7, c9, c11, c13, c15; - } data = { -#if WANT_SIMD_EXCEPT - .tiny_bound = V2 (0x1p-26), -#endif /* Even terms of polynomial s.t. asinh(x) is approximated by asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */ - - .c0 = V2 (-0x1.55555555554a7p-3), - .c1 = 0x1.3333333326c7p-4, - .c2 = V2 (-0x1.6db6db68332e6p-5), - .c3 = 0x1.f1c71b26fb40dp-6, - .c4 = V2 (-0x1.6e8b8b654a621p-6), - .c5 = 0x1.1c4daa9e67871p-6, - .c6 = V2 (-0x1.c9871d10885afp-7), - .c7 = 0x1.7a16e8d9d2ecfp-7, - .c8 = V2 (-0x1.3ddca533e9f54p-7), - .c9 = 0x1.0becef748dafcp-7, - .c10 = V2 (-0x1.b90c7099dd397p-8), - .c11 = 0x1.541f2bb1ffe51p-8, - .c12 = V2 (-0x1.d217026a669ecp-9), - .c13 = 0x1.0b5c7977aaf7p-9, - .c14 = V2 (-0x1.e0f37daef9127p-11), - .c15 = 0x1.388b5fe542a6p-12, - .c16 = V2 (-0x1.021a48685e287p-14), - .c17 = V2 (0x1.93d4ba83d34dap-18), - - .lc0 = V2 (-0x1.ffffffffffff7p-2), - .lc1 = 0x1.55555555170d4p-2, - .lc2 = V2 (-0x1.0000000399c27p-2), - .lc3 = 0x1.999b2e90e94cap-3, - .lc4 = -0x1.554e550bd501ep-3, - .ln2 = 0x1.62e42fefa39efp-1, - - .off = V2 (0x3fe6900900000000), - .huge_bound = V2 (0x5fe0000000000000), - .abs_mask = V2 (0x7fffffffffffffff), - .mask = V2 (0xfffULL << 52), + .c0 = V2 (-0x1.55555555554a7p-3), .c1 = 0x1.3333333326c7p-4, + .c2 = V2 (-0x1.6db6db68332e6p-5), .c3 = 0x1.f1c71b26fb40dp-6, + .c4 = V2 (-0x1.6e8b8b654a621p-6), .c5 = 0x1.1c4daa9e67871p-6, + .c6 = V2 (-0x1.c9871d10885afp-7), .c7 = 0x1.7a16e8d9d2ecfp-7, + .c8 = V2 (-0x1.3ddca533e9f54p-7), .c9 = 0x1.0becef748dafcp-7, + .c10 = V2 (-0x1.b90c7099dd397p-8), .c11 = 0x1.541f2bb1ffe51p-8, + .c12 = V2 (-0x1.d217026a669ecp-9), .c13 = 0x1.0b5c7977aaf7p-9, + .c14 = V2 (-0x1.e0f37daef9127p-11), .c15 = 0x1.388b5fe542a6p-12, + .c16 = V2 (-0x1.021a48685e287p-14), .c17 = V2 (0x1.93d4ba83d34dap-18), + .lc0 = V2 (-0x1.ffffffffffff7p-2), .lc1 = 0x1.55555555170d4p-2, + .lc2 = V2 (-0x1.0000000399c27p-2), .lc3 = 0x1.999b2e90e94cap-3, + .lc4 = -0x1.554e550bd501ep-3, .ln2 = 0x1.62e42fefa39efp-1, + .off = V2 (0x3fe6900900000000), .huge_bound = V2 (0x5fe0000000000000), + .abs_mask = V2 (0x7fffffffffffffff), .mask = V2 (0xfffULL << 52), }; static float64x2_t NOINLINE VPCS_ATTR @@ -106,10 +81,9 @@ lookup (uint64x2_t i) } static inline float64x2_t -log_inline (float64x2_t xm, const struct data *d) +log_inline (float64x2_t ax, const struct data *d) { - - uint64x2_t u = vreinterpretq_u64_f64 (xm); + uint64x2_t u = vreinterpretq_u64_f64 (ax); uint64x2_t u_off = vsubq_u64 (u, d->off); int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); @@ -143,10 +117,10 @@ log_inline (float64x2_t xm, const struct data *d) asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise where log(x) is an 
optimized log approximation, and P(x) is a polynomial - shared with the scalar routine. The greatest observed error 2.79 ULP, in - |x| >= 1: - _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1 - want 0x1.ffffd003219ddp-1. */ + shared with the scalar routine. + For |x| >= 1, the greatest observed error is 2.87 ULP. + _ZGVnN2v_asinh(-0x1.177c6017ce58ap+0) got -0x1.e3ba3d5cb1a46p-1 + want -0x1.e3ba3d5cb1a49p-1. */ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -154,47 +128,26 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); -#if WANT_SIMD_EXCEPT - uint64x2_t iax = vreinterpretq_u64_f64 (ax); - uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound)); - uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); - special = vorrq_u64 (special, tiny); -#else uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound)); -#endif /* Option 1: |x| >= 1. - Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). - If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will - overflow, by setting special lanes to 1. These will be fixed later. */ + Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). */ float64x2_t option_1 = v_f64 (0); if (__glibc_likely (v_any_u64 (gt1))) { -#if WANT_SIMD_EXCEPT - float64x2_t xm = v_zerofy_f64 (ax, special); -#else - float64x2_t xm = ax; -#endif option_1 = log_inline ( - vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d); + vaddq_f64 (ax, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), ax, ax))), d); } /* Option 2: |x| < 1. Compute asinh(x) using a polynomial. - If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will - overflow, and tiny lanes, which will underflow, by setting them to 0. They - will be fixed later, either by selecting x or falling back to the scalar - special-case. The largest observed error in this region is 1.47 ULPs: - _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 - want 0x1.c1d6bf874019cp-1. */ + The largest observed error in this region is 1.36 ULPs: + _ZGVnN2v_asinh(0x1.fe1e2aaa8dd54p-1) got 0x1.c1ee60bc0788ap-1 + want 0x1.c1ee60bc0788bp-1. */ float64x2_t option_2 = v_f64 (0); if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1)))) { - -#if WANT_SIMD_EXCEPT - ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); -#endif float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2); /* Order-17 Pairwise Horner scheme. */ float64x2_t c13 = vld1q_f64 (&d->c1); @@ -216,25 +169,18 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) p = vfmaq_f64 (p1213, z2, p); p = vfmaq_f64 (p1011, z2, p); p = vfmaq_f64 (p89, z2, p); - p = vfmaq_f64 (p67, z2, p); p = vfmaq_f64 (p45, z2, p); - p = vfmaq_f64 (p23, z2, p); - p = vfmaq_f64 (p01, z2, p); option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2)); -#if WANT_SIMD_EXCEPT - option_2 = vbslq_f64 (tiny, x, option_2); -#endif } /* Choose the right option for each lane. */ float64x2_t y = vbslq_f64 (gt1, option_1, option_2); if (__glibc_unlikely (v_any_u64 (special))) - { - return special_case (x, y, d->abs_mask, special); - } + return special_case (x, y, d->abs_mask, special); + /* Copy sign. 
*/ return vbslq_f64 (d->abs_mask, y, x); } diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c index cff66e58c1..90b2d78b5e 100644 --- a/sysdeps/aarch64/fpu/asinhf_advsimd.c +++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c @@ -25,16 +25,10 @@ const static struct data struct v_log1pf_data log1pf_consts; float32x4_t one; uint32x4_t big_bound; -#if WANT_SIMD_EXCEPT - uint32x4_t tiny_bound; -#endif } data = { .one = V4 (1), .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ -#if WANT_SIMD_EXCEPT - .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */ -#endif }; static float32x4_t NOINLINE VPCS_ATTR @@ -59,18 +53,6 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x) uint32x4_t iax = vreinterpretq_u32_f32 (ax); uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax); - float32x4_t special_arg = x; - -#if WANT_SIMD_EXCEPT - /* Sidestep tiny and large values to avoid inadvertently triggering - under/overflow. */ - special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound)); - if (__glibc_unlikely (v_any_u32 (special))) - { - ax = v_zerofy_f32 (ax, special); - x = v_zerofy_f32 (x, special); - } -#endif /* asinh(x) = log(x + sqrt(x * x + 1)). For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ @@ -79,7 +61,7 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x) float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)); if (__glibc_unlikely (v_any_u32 (special))) - return special_case (special_arg, sign, y, special, dat); + return special_case (x, sign, y, special, dat); return vreinterpretq_f32_u32 (veorq_u32 ( sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts)))); } diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c index da0d3715df..72a4d03e8a 100644 --- a/sysdeps/aarch64/fpu/atan_advsimd.c +++ b/sysdeps/aarch64/fpu/atan_advsimd.c @@ -64,15 +64,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) uint64x2_t ix = vreinterpretq_u64_f64 (x); uint64x2_t sign = vandq_u64 (ix, SignMask); -#if WANT_SIMD_EXCEPT - uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000)); - uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)), - v_u64 (BigBound - TinyBound)); - /* If any lane is special, fall back to the scalar routine for all lanes. 
*/ - if (__glibc_unlikely (v_any_u64 (special))) - return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1)); -#endif - /* Argument reduction: y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c index 817a47ef3e..c69c617b7d 100644 --- a/sysdeps/aarch64/fpu/atanf_advsimd.c +++ b/sysdeps/aarch64/fpu/atanf_advsimd.c @@ -24,13 +24,6 @@ static const struct data { uint32x4_t sign_mask, pi_over_2; float32x4_t neg_one; -#if WANT_SIMD_EXCEPT - float32x4_t poly[8]; -} data = { - .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3), - V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5), - V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) }, -#else float32x4_t c0, c2, c4, c6; float c1, c3, c5, c7; } data = { @@ -41,23 +34,10 @@ static const struct data .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, -#endif - .pi_over_2 = V4 (0x3fc90fdb), - .neg_one = V4 (-1.0f), + .pi_over_2 = V4 (0x3fc90fdb), .neg_one = V4 (-1.0f), .sign_mask = V4 (0x80000000), }; -#if WANT_SIMD_EXCEPT -#define TinyBound 0x30800000 /* asuint(0x1p-30). */ -#define BigBound 0x4e800000 /* asuint(0x1p30). */ - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) -{ - return v_call_f32 (atanf, x, y, special); -} -#endif - /* Fast implementation of vector atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps: @@ -70,17 +50,6 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) uint32x4_t ix = vreinterpretq_u32_f32 (x); uint32x4_t sign = vandq_u32 (ix, d->sign_mask); -#if WANT_SIMD_EXCEPT - /* Small cases, infs and nans are supported by our approximation technique, - but do not set fenv flags correctly. Only trigger special case if we need - fenv. */ - uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); - uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), - v_u32 (BigBound - TinyBound)); - /* If any lane is special, fall back to the scalar routine for all lanes. */ - if (__glibc_unlikely (v_any_u32 (special))) - return special_case (x, x, v_u32 (-1)); -#endif /* Argument reduction: y := arctan(x) for |x| < 1 y := arctan(-1/x) + pi/2 for x > +1 @@ -97,18 +66,6 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) float32x4_t z2 = vmulq_f32 (z, z); float32x4_t z3 = vmulq_f32 (z, z2); float32x4_t z4 = vmulq_f32 (z2, z2); -#if WANT_SIMD_EXCEPT - - /* Calculate the polynomial approximation. - Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, - a standard implementation using z8 creates spurious underflow - in the very last fma (when z^8 is small enough). - Therefore, we split the last fma into a mul and an fma. */ - float32x4_t y = vfmaq_f32 ( - v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, - vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); - -#else float32x4_t z8 = vmulq_f32 (z4, z4); /* Uses an Estrin scheme for polynomial approximation. */ @@ -123,7 +80,6 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) float32x4_t p47 = vfmaq_f32 (p45, z4, p67); float32x4_t y = vfmaq_f32 (p03, z8, p47); -#endif /* y = shift + z * P(z^2). 
*/ return vfmaq_f32 (vaddq_f32 (shift, z), z3, y); diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c index 7fba3094ca..446decb5f8 100644 --- a/sysdeps/aarch64/fpu/atanh_advsimd.c +++ b/sysdeps/aarch64/fpu/atanh_advsimd.c @@ -52,20 +52,12 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) uint64x2_t ia = vreinterpretq_u64_f64 (ax); uint64x2_t special = vcgeq_u64 (ia, d->one); -#if WANT_SIMD_EXCEPT - ax = v_zerofy_f64 (ax, special); -#endif - float64x2_t y; y = vaddq_f64 (ax, ax); y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax)); if (__glibc_unlikely (v_any_u64 (special))) -#if WANT_SIMD_EXCEPT - return special_case (x, halfsign, y, special, d); -#else return special_case (ax, halfsign, y, special, d); -#endif y = log1p_inline (y, &d->log1p_consts); return vmulq_f64 (y, halfsign); diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c index ef23b89485..93c41fbdc6 100644 --- a/sysdeps/aarch64/fpu/atanhf_advsimd.c +++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c @@ -24,16 +24,9 @@ const static struct data { struct v_log1pf_data log1pf_consts; uint32x4_t one; -#if WANT_SIMD_EXCEPT - uint32x4_t tiny_bound; -#endif } data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000), -#if WANT_SIMD_EXCEPT - /* 0x1p-12, below which atanhf(x) rounds to x. */ - .tiny_bound = V4 (0x39800000), -#endif }; #define AbsMask v_u32 (0x7fffffff) @@ -59,16 +52,7 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) float32x4_t ax = vabsq_f32 (x); uint32x4_t iax = vreinterpretq_u32_f32 (ax); -#if WANT_SIMD_EXCEPT - uint32x4_t special - = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound)); - /* Side-step special cases by setting those lanes to 0, which will trigger no - exceptions. These will be fixed up later. */ - if (__glibc_unlikely (v_any_u32 (special))) - ax = v_zerofy_f32 (ax, special); -#else uint32x4_t special = vcgeq_u32 (iax, d->one); -#endif float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax)); @@ -78,11 +62,7 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) chain. If exceptions are required ax will have been zerofied, so have to pass x. */ if (__glibc_unlikely (v_any_u32 (special))) -#if WANT_SIMD_EXCEPT - return special_case (x, halfsign, y, special); -#else return special_case (ax, halfsign, y, special); -#endif return vmulq_f32 (halfsign, y); } libmvec_hidden_def (V_NAME_F1 (atanh)) diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c index 0b5b7af45e..58c2a1e003 100644 --- a/sysdeps/aarch64/fpu/cos_advsimd.c +++ b/sysdeps/aarch64/fpu/cos_advsimd.c @@ -51,27 +51,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; uint64x2_t odd, cmp; -#if WANT_SIMD_EXCEPT - r = vabsq_f64 (x); - cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), - vreinterpretq_u64_f64 (d->range_val)); - if (__glibc_unlikely (v_any_u64 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - special-case handler later. */ - r = vbslq_f64 (cmp, v_f64 (1.0), r); -#else cmp = vcageq_f64 (x, d->range_val); - r = x; -#endif /* n = rint((|x|+pi/2)/pi) - 0.5. 
*/ - n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi)); + n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), x, d->inv_pi)); odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); n = vsubq_f64 (n, v_f64 (0.5f)); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (x, d->pi_1, n); r = vfmsq_f64 (r, d->pi_2, n); r = vfmsq_f64 (r, d->pi_3, n); diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c index 4d5167995f..39464101f0 100644 --- a/sysdeps/aarch64/fpu/cosf_advsimd.c +++ b/sysdeps/aarch64/fpu/cosf_advsimd.c @@ -52,27 +52,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x) float32x4_t n, r, r2, r3, y; uint32x4_t odd, cmp; -#if WANT_SIMD_EXCEPT - r = vabsq_f32 (x); - cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), - vreinterpretq_u32_f32 (d->range_val)); - if (__glibc_unlikely (v_any_u32 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - special-case handler later. */ - r = vbslq_f32 (cmp, v_f32 (1.0f), r); -#else cmp = vcageq_f32 (x, d->range_val); - r = x; -#endif /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi)); + n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), x, d->inv_pi)); odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); n = vsubq_f32 (n, v_f32 (0.5f)); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (x, d->pi_1, n); r = vfmsq_f32 (r, d->pi_2, n); r = vfmsq_f32 (r, d->pi_3, n); diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c index 35e1562a6e..3c5c5c4ba5 100644 --- a/sysdeps/aarch64/fpu/coshf_advsimd.c +++ b/sysdeps/aarch64/fpu/coshf_advsimd.c @@ -23,70 +23,37 @@ static const struct data { struct v_expf_data expf_consts; - uint32x4_t tiny_bound; float32x4_t bound; -#if WANT_SIMD_EXCEPT - uint32x4_t special_bound; -#endif } data = { .expf_consts = V_EXPF_DATA, - .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ .bound = V4 (0x1.5a92d8p+6), -#if WANT_SIMD_EXCEPT - .special_bound = V4 (0x42ad496c), -#endif }; -#if !WANT_SIMD_EXCEPT static float32x4_t NOINLINE VPCS_ATTR special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t, uint32x4_t special) { return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special); } -#endif /* Single-precision vector cosh, using vector expf. Maximum error is 2.38 ULP: _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4. */ -float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - /* If fp exceptions are to be triggered correctly, fall back to the scalar - variant for all inputs if any input is a special value or above the bound - at which expf overflows. */ - float32x4_t ax = vabsq_f32 (x); - uint32x4_t iax = vreinterpretq_u32_f32 (ax); - uint32x4_t special = vcgeq_u32 (iax, d->special_bound); - if (__glibc_unlikely (v_any_u32 (special))) - return v_call_f32 (coshf, x, x, v_u32 (-1)); - - uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound); - /* If any input is tiny, avoid underflow exception by fixing tiny lanes of - input to 0, which will generate no exceptions. 
*/ - if (__glibc_unlikely (v_any_u32 (tiny))) - ax = v_zerofy_f32 (ax, tiny); - float32x4_t t = v_expf_inline (ax, &d->expf_consts); -#else uint32x4_t special = vcageq_f32 (x, d->bound); float32x4_t t = v_expf_inline (x, &d->expf_consts); -#endif /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ float32x4_t half_t = vmulq_n_f32 (t, 0.5); float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); -#if WANT_SIMD_EXCEPT - if (__glibc_unlikely (v_any_u32 (tiny))) - return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); -#else if (__glibc_unlikely (v_any_u32 (special))) return special_case (x, half_t, half_over_t, special); -#endif return vaddq_f32 (half_t, half_over_t); } diff --git a/sysdeps/aarch64/fpu/cospi_advsimd.c b/sysdeps/aarch64/fpu/cospi_advsimd.c index dcd12c8f89..5d03cca539 100644 --- a/sysdeps/aarch64/fpu/cospi_advsimd.c +++ b/sysdeps/aarch64/fpu/cospi_advsimd.c @@ -51,24 +51,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x) { const struct data *d = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - float64x2_t r = vabsq_f64 (x); - uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x); - - /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd - to avoid them overflowing and throwing exceptions. */ - r = v_zerofy_f64 (r, cmp); - uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63); - -#else - float64x2_t r = x; - uint64x2_t cmp = vcageq_f64 (r, d->range_val); + uint64x2_t cmp = vcageq_f64 (x, d->range_val); uint64x2_t odd - = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63); - -#endif + = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), 63); - r = vsubq_f64 (r, vrndaq_f64 (r)); + float64x2_t r = vsubq_f64 (x, vrndaq_f64 (x)); /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r)); diff --git a/sysdeps/aarch64/fpu/cospif_advsimd.c b/sysdeps/aarch64/fpu/cospif_advsimd.c index a81471f408..155ff0e1b6 100644 --- a/sysdeps/aarch64/fpu/cospif_advsimd.c +++ b/sysdeps/aarch64/fpu/cospif_advsimd.c @@ -42,30 +42,16 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) Maximum Error: 3.17 ULP: _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. */ -float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x) { const struct data *d = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - float32x4_t r = vabsq_f32 (x); - uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x); - - /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd - to avoid them overflowing and throwing exceptions. */ - r = v_zerofy_f32 (r, cmp); - uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31); - -#else - float32x4_t r = x; - uint32x4_t cmp = vcageq_f32 (r, d->range_val); - + uint32x4_t cmp = vcageq_f32 (x, d->range_val); uint32x4_t odd - = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31); - -#endif + = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 31); /* r = x - rint(x). */ - r = vsubq_f32 (r, vrndaq_f32 (r)); + float32x4_t r = vsubq_f32 (x, vrndaq_f32 (x)); /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r)); @@ -82,6 +68,5 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x) /* Reintroduce the sign bit for inputs which round to odd. 
*/ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); } - libmvec_hidden_def (V_NAME_F1 (cospi)) HALF_WIDTH_ALIAS_F1 (cospi) diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c index ec1ad0bcd0..fe7c03ed9d 100644 --- a/sysdeps/aarch64/fpu/erf_advsimd.c +++ b/sysdeps/aarch64/fpu/erf_advsimd.c @@ -23,12 +23,9 @@ static const struct data { float64x2_t third; float64x2_t tenth, two_over_five, two_over_nine; - double two_over_fifteen, two_over_fortyfive; + double two_over_fifteen; float64x2_t max, shift; uint64x2_t max_idx; -#if WANT_SIMD_EXCEPT - float64x2_t tiny_bound, huge_bound, scale_minus_one; -#endif } data = { .max_idx = V2 (768), .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */ @@ -36,14 +33,8 @@ static const struct data .tenth = V2 (-0x1.999999999999ap-4), .two_over_five = V2 (-0x1.999999999999ap-2), .two_over_nine = V2 (-0x1.c71c71c71c71cp-3), - .two_over_fortyfive = 0x1.6c16c16c16c17p-5, .max = V2 (5.9921875), /* 6 - 1/128. */ .shift = V2 (0x1p45), -#if WANT_SIMD_EXCEPT - .huge_bound = V2 (0x1p205), - .tiny_bound = V2 (0x1p-226), - .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */ -#endif }; #define AbsMask 0x7fffffffffffffff @@ -92,22 +83,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) uint64x2_t a_le_max = vcaleq_f64 (x, dat->max); uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max); -#if WANT_SIMD_EXCEPT - /* |x| huge or tiny. */ - uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound); - uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound); - uint64x2_t cmp = vorrq_u64 (cmp1, cmp2); - /* If any lanes are special, mask them with 1 for small x or 8 for large - values and retain a copy of a to allow special case handler to fix special - lanes later. This is only necessary if fenv exceptions are to be triggered - correctly. */ - if (__glibc_unlikely (v_any_u64 (cmp))) - { - a = vbslq_f64 (cmp1, v_f64 (8.0), a); - a = vbslq_f64 (cmp2, v_f64 (1.0), a); - } -#endif - /* Set r to multiple of 1/128 nearest to |x|. */ float64x2_t shift = dat->shift; float64x2_t z = vaddq_f64 (a, shift); @@ -153,16 +128,5 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) y = vbslq_f64 (a_gt_max, v_f64 (1.0), y); /* Copy sign. */ - y = vbslq_f64 (v_u64 (AbsMask), y, x); - -#if WANT_SIMD_EXCEPT - if (__glibc_unlikely (v_any_u64 (cmp2))) - { - /* Neutralise huge values of x before fixing small values. */ - x = vbslq_f64 (cmp1, v_f64 (1.0), x); - /* Fix tiny values that trigger spurious underflow. */ - return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y); - } -#endif - return y; + return vbslq_f64 (v_u64 (AbsMask), y, x); } diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c index 6b0fadb658..2b80e06d9d 100644 --- a/sysdeps/aarch64/fpu/erfc_advsimd.c +++ b/sysdeps/aarch64/fpu/erfc_advsimd.c @@ -27,9 +27,6 @@ static const struct data float64x2_t p20, p40, p41, p51; double p42, p52; double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; -#if WANT_SIMD_EXCEPT - float64x2_t uflow_bound; -#endif } data = { /* Set an offset so the range of the index used for lookup is 3487, and it can be clamped using a saturated add on an offset index. @@ -50,12 +47,8 @@ static const struct data .qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 }, .qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 }, .qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 }, -#if WANT_SIMD_EXCEPT - .uflow_bound = V2 (0x1.a8b12fc6e4892p+4), -#endif }; -#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. 
*/ #define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */ struct entry @@ -77,14 +70,6 @@ lookup (uint64x2_t i) return e; } -#if WANT_SIMD_EXCEPT -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) -{ - return v_call_f64 (erfc, x, y, cmp); -} -#endif - /* Optimized double-precision vector erfc(x). Approximation based on series expansion near x rounded to nearest multiple of 1/128. @@ -116,23 +101,6 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) { const struct data *dat = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */ - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound)); - /* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in - integer domain to avoid raising exceptions in presence of nans. */ - uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x), - vreinterpretq_s64_f64 (dat->uflow_bound)); - cmp = vorrq_u64 (cmp, uflow); - float64x2_t xm = x; - /* If any lanes are special, mask them with 0 and retain a copy of x to allow - special case handler to fix special lanes later. This is only necessary if - fenv exceptions are to be triggered correctly. */ - if (__glibc_unlikely (v_any_u64 (cmp))) - x = v_zerofy_f64 (x, cmp); -#endif - float64x2_t a = vabsq_f64 (x); a = vminq_f64 (a, dat->max); @@ -198,10 +166,5 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) float64x2_t fac = vreinterpretq_f64_u64 ( vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1)); -#if WANT_SIMD_EXCEPT - if (__glibc_unlikely (v_any_u64 (cmp))) - return special_case (xm, vfmaq_f64 (off, fac, y), cmp); -#endif - return vfmaq_f64 (off, fac, y); } diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c index c4690042db..db2d217d7f 100644 --- a/sysdeps/aarch64/fpu/erfcf_advsimd.c +++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c @@ -25,10 +25,6 @@ static const struct data float32x4_t max, shift; float coeffs[4]; float32x4_t third, two_over_five, tenth; -#if WANT_SIMD_EXCEPT - float32x4_t uflow_bound; -#endif - } data = { /* Set an offset so the range of the index used for lookup is 644, and it can be clamped using a saturated add. */ @@ -42,13 +38,8 @@ static const struct data .third = V4 (0x1.555556p-2f), .two_over_five = V4 (-0x1.99999ap-2f), .tenth = V4 (-0x1.99999ap-4f), -#if WANT_SIMD_EXCEPT - .uflow_bound = V4 (0x1.2639cp+3f), -#endif }; -#define TinyBound 0x41000000 /* 0x1p-62f << 1. */ -#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */ #define Off 0xfffffd7b /* 0xffffffff - 644. */ struct entry @@ -76,14 +67,6 @@ lookup (uint32x4_t i) return e; } -#if WANT_SIMD_EXCEPT -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) -{ - return v_call_f32 (erfcf, x, y, cmp); -} -#endif - /* Optimized single-precision vector erfcf(x). Approximation based on series expansion near x rounded to nearest multiple of 1/64. @@ -106,23 +89,6 @@ float32x4_t NOINLINE V_NAME_F1 (erfc) (float32x4_t x) { const struct data *dat = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */ - uint32x4_t ix = vreinterpretq_u32_f32 (x); - uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound)); - /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in - integer domain to avoid raising exceptions in presence of nans. 
*/ - uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x), - vreinterpretq_s32_f32 (dat->uflow_bound)); - cmp = vorrq_u32 (cmp, uflow); - float32x4_t xm = x; - /* If any lanes are special, mask them with 0 and retain a copy of x to allow - special case handler to fix special lanes later. This is only necessary if - fenv exceptions are to be triggered correctly. */ - if (__glibc_unlikely (v_any_u32 (cmp))) - x = v_zerofy_f32 (x, cmp); -#endif - float32x4_t a = vabsq_f32 (x); a = vminq_f32 (a, dat->max); @@ -165,11 +131,6 @@ float32x4_t NOINLINE V_NAME_F1 (erfc) (float32x4_t x) float32x4_t fac = vreinterpretq_f32_u32 ( vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1)); -#if WANT_SIMD_EXCEPT - if (__glibc_unlikely (v_any_u32 (cmp))) - return special_case (xm, vfmaq_f32 (off, fac, y), cmp); -#endif - return vfmaq_f32 (off, fac, y); } libmvec_hidden_def (V_NAME_F1 (erfc)) diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c index 72e49a7269..2ecce8c1c1 100644 --- a/sysdeps/aarch64/fpu/erff_advsimd.c +++ b/sysdeps/aarch64/fpu/erff_advsimd.c @@ -22,17 +22,10 @@ static const struct data { float32x4_t max, shift, third; -#if WANT_SIMD_EXCEPT - float32x4_t tiny_bound, scale_minus_one; -#endif } data = { .max = V4 (3.9375), /* 4 - 8/128. */ .shift = V4 (0x1p16f), .third = V4 (0x1.555556p-2f), /* 1/3. */ -#if WANT_SIMD_EXCEPT - .tiny_bound = V4 (0x1p-62f), - .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */ -#endif }; #define AbsMask 0x7fffffff @@ -75,17 +68,6 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x) { const struct data *dat = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - /* |x| < 2^-62. */ - uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special case handler to fix special lanes later. This is only necessary if - fenv exceptions are to be triggered correctly. */ - if (__glibc_unlikely (v_any_u32 (cmp))) - x = vbslq_f32 (cmp, v_f32 (1), x); -#endif - float32x4_t a = vabsq_f32 (x); uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max); @@ -111,13 +93,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x) y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y); /* Copy sign. */ - y = vbslq_f32 (v_u32 (AbsMask), y, x); - -#if WANT_SIMD_EXCEPT - if (__glibc_unlikely (v_any_u32 (cmp))) - return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y); -#endif - return y; + return vbslq_f32 (v_u32 (AbsMask), y, x); } libmvec_hidden_def (V_NAME_F1 (erf)) HALF_WIDTH_ALIAS_F1 (erf) diff --git a/sysdeps/aarch64/fpu/exp10_advsimd.c b/sysdeps/aarch64/fpu/exp10_advsimd.c index 141778749b..c08daf092a 100644 --- a/sysdeps/aarch64/fpu/exp10_advsimd.c +++ b/sysdeps/aarch64/fpu/exp10_advsimd.c @@ -28,9 +28,7 @@ const static struct data { float64x2_t poly[4]; float64x2_t log10_2, log2_10_hi, log2_10_lo, shift; -#if !WANT_SIMD_EXCEPT float64x2_t special_bound, scale_thresh; -#endif } data = { /* Coefficients generated using Remez algorithm. rel error: 0x1.5ddf8f28p-54 @@ -42,31 +40,13 @@ const static struct data .log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log2(10)/N. */ .log2_10_lo = V2 (-0x1.9dc1da994fd21p-66), .shift = V2 (0x1.8p+52), -#if !WANT_SIMD_EXCEPT .scale_thresh = V2 (ScaleBound), .special_bound = V2 (SpecialBound), -#endif }; #define N (1 << V_EXP_TABLE_BITS) #define IndexMask v_u64 (N - 1) -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). 
*/ -# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */ -# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */ - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine for special lanes. */ - return v_call_f64 (exp10, x, y, cmp); -} - -#else - # define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ /* SpecialBias1 + SpecialBias1 = asuint(1.0). */ # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ @@ -87,8 +67,6 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n, return vbslq_f64 (cmp, r1, r0); } -#endif - /* Fast vector implementation of exp10. Maximum measured error is 1.64 ulp. _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5 @@ -96,19 +74,7 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n, float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - uint64x2_t cmp; -#if WANT_SIMD_EXCEPT - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special_case to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - float64x2_t xm = x; - uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); - cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres); - if (__glibc_unlikely (v_any_u64 (cmp))) - x = vbslq_f64 (cmp, v_f64 (1), x); -#else - cmp = vcageq_f64 (x, d->special_bound); -#endif + uint64x2_t cmp = vcageq_f64 (x, d->special_bound); /* n = round(x/(log10(2)/N)). */ float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2); @@ -135,11 +101,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x) float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); if (__glibc_unlikely (v_any_u64 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f64 (s, y, s), cmp); -#else return special_case (s, y, n, d); -#endif return vfmaq_f64 (s, y, s); } diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c index 47726ede24..4eab670171 100644 --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c @@ -27,9 +27,7 @@ static const struct data float log10_2_high, log10_2_low, c2, c4; float32x4_t inv_log10_2, special_bound; uint32x4_t exponent_bias, special_offset, special_bias; -#if !WANT_SIMD_EXCEPT float32x4_t scale_thresh; -#endif } data = { /* Coefficients generated using Remez algorithm with minimisation of relative error. @@ -49,28 +47,9 @@ static const struct data .exponent_bias = V4 (0x3f800000), .special_offset = V4 (0x82000000), .special_bias = V4 (0x7f000000), -#if !WANT_SIMD_EXCEPT .scale_thresh = V4 (ScaleBound) -#endif }; -#if WANT_SIMD_EXCEPT - -# define SpecialBound 38.0f /* rint(log10(2^127)). */ -# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ -# define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */ -# define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */ - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. 
*/ - return v_call_f32 (exp10f, x, y, cmp); -} - -#else - # define SpecialBound 126.0f static float32x4_t VPCS_ATTR NOINLINE @@ -90,8 +69,6 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, return vbslq_f32 (cmp2, r2, r); } -#endif - /* Fast vector implementation of single-precision exp10. Algorithm is accurate to 2.36 ULP. _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11 @@ -99,17 +76,6 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) { const struct data *d = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - /* asuint(x) - TinyBound >= BigBound - TinyBound. */ - uint32x4_t cmp = vcgeq_u32 ( - vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special case handler to fix special lanes later. This is only necessary if - fenv exceptions are to be triggered correctly. */ - if (__glibc_unlikely (v_any_u32 (cmp))) - x = v_zerofy_f32 (x, cmp); -#endif /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), with poly(r) in [1/sqrt(2), sqrt(2)] and @@ -122,9 +88,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); -#if !WANT_SIMD_EXCEPT uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); -#endif float32x4_t r2 = vmulq_f32 (r, r); float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2); @@ -133,12 +97,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2); if (__glibc_unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); -#else return special_case (poly, n, e, cmp, scale, d); -#endif - return vfmaq_f32 (scale, poly, scale); } libmvec_hidden_def (V_NAME_F1 (exp10)) diff --git a/sysdeps/aarch64/fpu/exp2_advsimd.c b/sysdeps/aarch64/fpu/exp2_advsimd.c index 71affe8f5c..a7c1222c5e 100644 --- a/sysdeps/aarch64/fpu/exp2_advsimd.c +++ b/sysdeps/aarch64/fpu/exp2_advsimd.c @@ -47,19 +47,6 @@ lookup_sbits (uint64x2_t i) __v_exp_data[i[1] & IndexMask] }; } -#if WANT_SIMD_EXCEPT - -# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */ - -/* Call scalar exp2 as a fallback. */ -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special) -{ - return v_call_f64 (exp2, x, y, is_special); -} - -#else - # define SpecialOffset 0x6000000000000000 /* 0x1p513. */ /* SpecialBias1 + SpecialBias1 = asuint(1.0). */ # define SpecialBias1 0x7000000000000000 /* 0x1p769. */ @@ -80,8 +67,6 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n, return vbslq_f64 (cmp, r1, r0); } -#endif - /* Fast vector implementation of exp2. Maximum measured error is 1.65 ulp. _ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 @@ -90,17 +75,7 @@ VPCS_ATTR float64x2_t V_NAME_D1 (exp2) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - uint64x2_t cmp; -#if WANT_SIMD_EXCEPT - uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); - cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres)); - /* Mask special lanes and retain a copy of x for passing to special-case - handler. */ - float64x2_t xc = x; - x = v_zerofy_f64 (x, cmp); -#else - cmp = vcagtq_f64 (x, d->scale_big_bound); -#endif + uint64x2_t cmp = vcagtq_f64 (x, d->scale_big_bound); /* n = round(x/N). 
*/ float64x2_t z = vaddq_f64 (d->shift, x); @@ -121,10 +96,6 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x) y = vmulq_f64 (r, y); if (__glibc_unlikely (v_any_u64 (cmp))) -#if !WANT_SIMD_EXCEPT return special_case (s, y, n, d); -#else - return special_case (xc, vfmaq_f64 (s, s, y), cmp); -#endif return vfmaq_f64 (s, s, y); } diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c index 7cbce0dd69..69863b19c7 100644 --- a/sysdeps/aarch64/fpu/exp2f_advsimd.c +++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c @@ -23,9 +23,7 @@ static const struct data { float32x4_t c1, c3; uint32x4_t exponent_bias, special_offset, special_bias; -#if !WANT_SIMD_EXCEPT float32x4_t scale_thresh, special_bound; -#endif float c0, c2, c4, zero; } data = { /* maxerr: 1.962 ulp. */ @@ -37,28 +35,10 @@ static const struct data .exponent_bias = V4 (0x3f800000), .special_offset = V4 (0x82000000), .special_bias = V4 (0x7f000000), -#if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), -#endif }; -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ -# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ -# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine for special lanes. */ - return v_call_f32 (exp2f, x, y, cmp); -} - -#else - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) @@ -76,24 +56,10 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, return vbslq_f32 (cmp2, r2, r); } -#endif - float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) { const struct data *d = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ - uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special_case to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - if (__glibc_unlikely (v_any_u32 (cmp))) - x = vbslq_f32 (cmp, v_f32 (1), x); -#endif - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = n + r, with r in [-1/2, 1/2]. 
*/ float32x4_t n = vrndaq_f32 (x); @@ -101,9 +67,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); -#if !WANT_SIMD_EXCEPT uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); -#endif float32x4_t c024 = vld1q_f32 (&d->c0); float32x4_t r2 = vmulq_f32 (r, r); @@ -114,11 +78,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) float32x4_t poly = vfmaq_f32 (p, q, r2); if (__glibc_unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); -#else return special_case (poly, n, e, cmp, scale, d); -#endif return vfmaq_f32 (scale, poly, scale); } diff --git a/sysdeps/aarch64/fpu/exp_advsimd.c b/sysdeps/aarch64/fpu/exp_advsimd.c index bd97012237..825c008abc 100644 --- a/sysdeps/aarch64/fpu/exp_advsimd.c +++ b/sysdeps/aarch64/fpu/exp_advsimd.c @@ -26,19 +26,15 @@ const static volatile struct { float64x2_t poly[3]; float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; -#if !WANT_SIMD_EXCEPT float64x2_t special_bound, scale_thresh; -#endif } data = { /* maxerr: 1.88 +0.5 ulp rel error: 1.4337*2^-53 abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3), V2 (0x1.55555da646206p-5) }, -#if !WANT_SIMD_EXCEPT .scale_thresh = V2 (163840.0), /* 1280.0 * N. */ .special_bound = V2 (704.0), -#endif .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */ .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */ .ln2_lo = V2 (0x1.abc9e3b39803f3p-63), @@ -47,23 +43,6 @@ const static volatile struct #define C(i) data.poly[i] #define Tab __v_exp_data - -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ -# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */ -# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */ - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f64 (exp, x, y, cmp); -} - -#else - # define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ /* SpecialBias1 + SpecialBias1 = asuint(1.0). */ # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ @@ -83,25 +62,12 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n) return vbslq_f64 (cmp, r1, r0); } -#endif - float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) { float64x2_t n, r, r2, s, y, z; uint64x2_t cmp, u, e; -#if WANT_SIMD_EXCEPT - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special_case to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - float64x2_t xm = x; - uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); - cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); - if (__glibc_unlikely (v_any_u64 (cmp))) - x = vbslq_f64 (cmp, v_f64 (1), x); -#else cmp = vcagtq_f64 (x, data.special_bound); -#endif /* n = round(x/(ln2/N)). */ z = vfmaq_f64 (data.shift, x, data.inv_ln2); @@ -109,8 +75,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) n = vsubq_f64 (z, data.shift); /* r = x - n*ln2/N. 
*/ - r = x; - r = vfmsq_f64 (r, data.ln2_hi, n); + r = vfmsq_f64 (x, data.ln2_hi, n); r = vfmsq_f64 (r, data.ln2_lo, n); e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); @@ -126,11 +91,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); if (__glibc_unlikely (v_any_u64 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f64 (s, y, s), cmp); -#else return special_case (s, y, n); -#endif return vfmaq_f64 (s, y, s); } diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c index f1e93b8490..c76dfe6517 100644 --- a/sysdeps/aarch64/fpu/expf_advsimd.c +++ b/sysdeps/aarch64/fpu/expf_advsimd.c @@ -24,9 +24,7 @@ static const struct data float32x4_t c1, c3, c4, inv_ln2; float ln2_hi, ln2_lo, c0, c2; uint32x4_t exponent_bias, special_offset, special_bias; -#if !WANT_SIMD_EXCEPT float32x4_t special_bound, scale_thresh; -#endif } data = { /* maxerr: 1.45358 +0.5 ulp. */ .c0 = 0x1.0e4020p-7f, @@ -40,30 +38,12 @@ static const struct data .exponent_bias = V4 (0x3f800000), .special_offset = V4 (0x82000000), .special_bias = V4 (0x7f000000), -#if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), -#endif }; #define C(i) d->poly[i] -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ -# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ -# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f32 (expf, x, y, cmp); -} - -#else - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) @@ -82,27 +62,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, return vbslq_f32 (cmp2, r2, r); } -#endif - float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) { const struct data *d = ptr_barrier (&data); float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); -#if WANT_SIMD_EXCEPT - /* asuint(x) - TinyBound >= BigBound - TinyBound. */ - uint32x4_t cmp = vcgeq_u32 ( - vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), - TinyBound), - SpecialBound); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special case handler to fix special lanes later. This is only necessary if - fenv exceptions are to be triggered correctly. */ - if (__glibc_unlikely (v_any_u32 (cmp))) - x = vbslq_f32 (cmp, v_f32 (1), x); -#endif - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/
   float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
@@ -111,9 +75,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
   uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
   float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
 
-#if !WANT_SIMD_EXCEPT
   uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
-#endif
 
   float32x4_t r2 = vmulq_f32 (r, r);
   float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
@@ -123,11 +85,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
   float32x4_t poly = vfmaq_f32 (p, q, r2);
 
   if (__glibc_unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
-    return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
-#else
     return special_case (poly, n, e, cmp, scale, d);
-#endif
 
   return vfmaq_f32 (scale, poly, scale);
 }
diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
index 01e7292bd5..eefa7abdd2 100644
--- a/sysdeps/aarch64/fpu/expm1_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
@@ -23,55 +23,31 @@ static const struct data
 {
   struct v_expm1_data d;
-#if WANT_SIMD_EXCEPT
-  uint64x2_t thresh, tiny_bound;
-#else
   float64x2_t oflow_bound;
-#endif
 } data = {
   .d = V_EXPM1_DATA,
-#if WANT_SIMD_EXCEPT
-  /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
-     compare.  */
-  .thresh = V2 (0x78c56fa6d34b552),
-  /* asuint64(0x1p-51) << 1.  */
-  .tiny_bound = V2 (0x3cc0000000000000 << 1),
-#else
   /* Value above which expm1(x) should overflow. Absolute value of the
      underflow bound is greater than this, so it catches both cases - there is
      a small window where fallbacks are triggered unnecessarily.  */
   .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
-#endif
 };
 
 static float64x2_t VPCS_ATTR NOINLINE
 special_case (float64x2_t x, uint64x2_t special, const struct data *d)
 {
-  return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
-		     special);
+  return v_call_f64 (expm1, x, expm1_inline (x, &d->d), special);
 }
 
 /* Double-precision vector exp(x) - 1 function.
    The maximum observed error is 2.05 ULP:
-   _ZGVnN2v_expm1(0x1.634902eaff3adp-2) got 0x1.a8b636e2a9388p-2
-					want 0x1.a8b636e2a9386p-2.
+   _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2
+					want 0x1.a8897eef87b32p-2.  */
 float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
 
-#if WANT_SIMD_EXCEPT
-  uint64x2_t ix = vreinterpretq_u64_f64 (x);
-  /* If fp exceptions are to be triggered correctly, fall back to scalar for
-     |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
-     shift-left by 1, and compare with thresh which was left-shifted offline -
-     this is effectively an absolute compare.  */
-  uint64x2_t special
-      = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
-#else
   /* Large input, NaNs and Infs.  */
   uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
-#endif
 
   if (__glibc_unlikely (v_any_u64 (special)))
     return special_case (x, special, d);
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index a5712cf24b..37141c80bb 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -23,33 +23,19 @@ static const struct data
 {
   struct v_expm1f_data d;
-#if WANT_SIMD_EXCEPT
-  uint32x4_t thresh;
-#else
   float32x4_t oflow_bound;
-#endif
 } data = {
   .d = V_EXPM1F_DATA,
-#if !WANT_SIMD_EXCEPT
   /* Value above which expm1f(x) should overflow.
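
With the pre-masking gone, special_case can hand the original x straight
to both the inline path and the scalar fallback. The fallback follows the
usual v_call shape, roughly as below (a sketch, not the actual glibc
helper):

#include <arm_neon.h>

/* Per-lane scalar fallback: lanes flagged in mask are recomputed by the
   scalar routine, which also sets errno and fp status as required; the
   other lanes keep the fast-path result y.  */
static float64x2_t
lane_fallback_sketch (double (*f) (double), float64x2_t x, float64x2_t y,
		      uint64x2_t mask)
{
  if (vgetq_lane_u64 (mask, 0))
    y = vsetq_lane_f64 (f (vgetq_lane_f64 (x, 0)), y, 0);
  if (vgetq_lane_u64 (mask, 1))
    y = vsetq_lane_f64 (f (vgetq_lane_f64 (x, 1)), y, 1);
  return y;
}
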
Absolute value of the underflow bound is greater than this, so it catches both cases - there is a small window where fallbacks are triggered unnecessarily. */ .oflow_bound = V4 (0x1.5ebc4p+6), -#else - /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute - compare. */ - .thresh = V4 (0x1d5ebc40), -#endif }; -/* asuint(0x1p-23), shifted by 1 for abs compare. */ -#define TinyBound v_u32 (0x34000000 << 1) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t x, uint32x4_t special, const struct data *d) { - return v_call_f32 ( - expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special); + return v_call_f32 (expm1f, x, expm1f_inline (x, &d->d), special); } /* Single-precision vector exp(x) - 1 function. @@ -60,18 +46,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) { const struct data *d = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - uint32x4_t ix = vreinterpretq_u32_f32 (x); - /* If fp exceptions are to be triggered correctly, fall back to scalar for - |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for - shift-left by 1, and compare with thresh which was left-shifted offline - - this is effectively an absolute compare. */ - uint32x4_t special - = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); -#else /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); -#endif if (__glibc_unlikely (v_any_u32 (special))) return special_case (x, special, d); diff --git a/sysdeps/aarch64/fpu/finite_pow.h b/sysdeps/aarch64/fpu/finite_pow.h index bbe6d2dbd8..9fd535e77d 100644 --- a/sysdeps/aarch64/fpu/finite_pow.h +++ b/sysdeps/aarch64/fpu/finite_pow.h @@ -127,27 +127,6 @@ special_case (double tmp, uint64_t sbits, uint64_t ki) /* Note: sbits is signed scale. */ scale = asdouble (sbits); y = scale + scale * tmp; -#if WANT_SIMD_EXCEPT - if (fabs (y) < 1.0) - { - /* Round y to the right precision before scaling it into the subnormal - range to avoid double rounding that can cause 0.5+E/2 ulp error where - E is the worst-case ulp error outside the subnormal range. So this - is only useful if the goal is better than 1 ulp worst-case error. */ - double hi, lo, one = 1.0; - if (y < 0.0) - one = -1.0; - lo = scale - y + scale * tmp; - hi = one + y; - lo = one - hi + y + lo; - y = (hi + lo) - one; - /* Fix the sign of 0. */ - if (y == 0.0) - y = asdouble (sbits & 0x8000000000000000); - /* The underflow exception needs to be signaled explicitly. */ - force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); - } -#endif y = 0x1p-1022 * y; return y; } @@ -170,13 +149,8 @@ exp_inline (double x, double xtail, uint32_t sign_bias) { /* Note: inf and nan are already handled. */ /* Skip errno handling. */ -#if WANT_SIMD_EXCEPT - return asuint64 (x) >> 63 ? __math_uflow (sign_bias) - : __math_oflow (sign_bias); -#else double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY; return sign_bias ? -res_uoflow : res_uoflow; -#endif } /* Large x is special cased below. */ abstop = 0; @@ -221,11 +195,7 @@ exp_nosignbias (double x, double xtail) return 1.0; /* Note: inf and nan are already handled. */ if (abstop >= top12 (1024.0)) -#if WANT_SIMD_EXCEPT - return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0); -#else return asuint64 (x) >> 63 ? 0.0 : INFINITY; -#endif /* Large x is special cased below. 
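
The abstop tests in this file classify a double by its top bits. A
self-contained sketch of that trick, mirroring the top12 helper defined
earlier in finite_pow.h (the 1024.0 bound is the one visible above):

#include <stdint.h>
#include <string.h>

/* Top 12 bits of the representation: sign and biased exponent.  */
static inline uint32_t
top12_sketch (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return (uint32_t) (u >> 52);
}

/* exp certainly overflows or underflows double once |x| >= 1024, so a
   single integer compare on the exponent field rules that out.  */
static inline int
exp_arg_is_huge_sketch (double x)
{
  uint32_t abstop = top12_sketch (x) & 0x7ff;
  return abstop >= top12_sketch (1024.0);
}
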
*/ abstop = 0; } @@ -319,10 +289,6 @@ pow_scalar_special_case (double x, double y) x2 = -x2; sign_bias = 1; } -#if WANT_SIMD_EXCEPT - if (2 * ix == 0 && iy >> 63) - return __math_divzero (sign_bias); -#endif return iy >> 63 ? 1 / x2 : x2; } /* Here x and y are non-zero finite. */ @@ -331,11 +297,7 @@ pow_scalar_special_case (double x, double y) /* Finite x < 0. */ int yint = checkint (iy); if (yint == 0) -#if WANT_SIMD_EXCEPT - return __math_invalid (x); -#else return __builtin_nan (""); -#endif if (yint == 1) sign_bias = SignBias; ix &= 0x7fffffffffffffff; @@ -349,12 +311,7 @@ pow_scalar_special_case (double x, double y) /* |y| < 2^-65, x^y ~= 1 + y*log(x). */ if ((topy & 0x7ff) < SmallPowY) return 1.0; -#if WANT_SIMD_EXCEPT - return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0) - : __math_uflow (0); -#else return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0; -#endif } if (topx == 0) { diff --git a/sysdeps/aarch64/fpu/hypot_advsimd.c b/sysdeps/aarch64/fpu/hypot_advsimd.c index 9af5402e4a..9f802d1e53 100644 --- a/sysdeps/aarch64/fpu/hypot_advsimd.c +++ b/sysdeps/aarch64/fpu/hypot_advsimd.c @@ -19,24 +19,14 @@ #include "v_math.h" -#if WANT_SIMD_EXCEPT -static const struct data -{ - uint64x2_t tiny_bound, thres; -} data = { - .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */ - .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */ -}; -#else static const struct data { uint64x2_t tiny_bound; uint32x4_t thres; } data = { .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */ - .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ + .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ }; -#endif static float64x2_t VPCS_ATTR NOINLINE special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum, @@ -50,35 +40,6 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum, _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222) got 0x1.6a1b19400964ep-204 want 0x1.6a1b19400964dp-204. */ -#if WANT_SIMD_EXCEPT - -float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) -{ - const struct data *d = ptr_barrier (&data); - - float64x2_t ax = vabsq_f64 (x); - float64x2_t ay = vabsq_f64 (y); - - uint64x2_t ix = vreinterpretq_u64_f64 (ax); - uint64x2_t iy = vreinterpretq_u64_f64 (ay); - - /* Extreme values, NaNs, and infinities should be handled by the scalar - fallback for correct flag handling. */ - uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres); - uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres); - ax = v_zerofy_f64 (ax, specialx); - ay = v_zerofy_f64 (ay, specialy); - uint32x2_t special = vaddhn_u64 (specialx, specialy); - - float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay); - - if (__glibc_unlikely (v_any_u32h (special))) - return special_case (x, y, sqsum, special); - - return vsqrtq_f64 (sqsum); -} -#else - float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) { const struct data *d = ptr_barrier (&data); @@ -94,4 +55,3 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) return vsqrtq_f64 (sqsum); } -#endif diff --git a/sysdeps/aarch64/fpu/hypotf_advsimd.c b/sysdeps/aarch64/fpu/hypotf_advsimd.c index e2206f0e7c..feeba891b9 100644 --- a/sysdeps/aarch64/fpu/hypotf_advsimd.c +++ b/sysdeps/aarch64/fpu/hypotf_advsimd.c @@ -19,15 +19,6 @@ #include "v_math.h" -#if WANT_SIMD_EXCEPT -static const struct data -{ - uint32x4_t tiny_bound, thres; -} data = { - .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). 
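
The fast path that remains in hypot is one fused sum of squares followed
by a square root; its scalar equivalent is simply:

#include <math.h>

/* y*y rounds once; the fma then folds x*x and the addition into a
   single rounding, which is what keeps the observed error near 1 ULP.  */
static double
hypot_fast_sketch (double x, double y)
{
  return sqrt (fma (x, x, y * y));
}

Lanes whose exponents sit near either end of the range still defer to
scalar hypot, which rescales instead of squaring directly.
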
*/ - .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ -}; -#else static const struct data { uint32x4_t tiny_bound; @@ -36,7 +27,6 @@ static const struct data .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */ .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */ }; -#endif static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, @@ -49,36 +39,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, Maximum error observed is 1.21 ULP: _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13 want 0x1.6a41dp-13. */ -#if WANT_SIMD_EXCEPT - -float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) -{ - const struct data *d = ptr_barrier (&data); - - float32x4_t ax = vabsq_f32 (x); - float32x4_t ay = vabsq_f32 (y); - - uint32x4_t ix = vreinterpretq_u32_f32 (ax); - uint32x4_t iy = vreinterpretq_u32_f32 (ay); - - /* Extreme values, NaNs, and infinities should be handled by the scalar - fallback for correct flag handling. */ - uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres); - uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres); - ax = v_zerofy_f32 (ax, specialx); - ay = v_zerofy_f32 (ay, specialy); - uint16x4_t special = vaddhn_u32 (specialx, specialy); - - float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay); - - if (__glibc_unlikely (v_any_u16h (special))) - return special_case (x, y, sqsum, special); - - return vsqrtq_f32 (sqsum); -} -#else - -float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) { const struct data *d = ptr_barrier (&data); @@ -93,6 +54,5 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) return vsqrtq_f32 (sqsum); } -#endif libmvec_hidden_def (V_NAME_F2 (hypot)) HALF_WIDTH_ALIAS_F2(hypot) diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c index 44474be5e2..9bf11ffb2a 100644 --- a/sysdeps/aarch64/fpu/log1p_advsimd.c +++ b/sysdeps/aarch64/fpu/log1p_advsimd.c @@ -35,8 +35,7 @@ special_case (float64x2_t x, uint64x2_t cmp, const struct data *d) { /* Side-step special lanes so fenv exceptions are not triggered inadvertently. */ - float64x2_t x_nospecial = v_zerofy_f64 (x, cmp); - return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp); + return v_call_f64 (log1p, x, log1p_inline (x, &d->d), cmp); } /* Vector log1p approximation using polynomial on reduced interval. Routine is diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c index 887c504777..79e33e4aae 100644 --- a/sysdeps/aarch64/fpu/log1pf_advsimd.c +++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c @@ -20,52 +20,6 @@ #include "v_math.h" #include "v_log1pf_inline.h" -#if WANT_SIMD_EXCEPT - -const static struct data -{ - uint32x4_t minus_one, thresh; - struct v_log1pf_data d; -} data = { - .d = V_LOG1PF_CONSTANTS_TABLE, - .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */ - .minus_one = V4 (0xbf800000), -}; - -/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ -# define TinyBound v_u32 (0x34000000) - -static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, uint32x4_t cmp, const struct data *d) -{ - /* Side-step special lanes so fenv exceptions are not triggered - inadvertently. 
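
The blocks deleted above all detected special lanes the same way: bias the
absolute bit pattern downwards, then one unsigned compare flags both ends
of the range plus Inf and NaN. The idiom in isolation, with hypothetical
single-precision bounds:

#include <arm_neon.h>

static uint32x4_t
special_lanes_sketch (float32x4_t x)
{
  /* Hypothetical bounds: tiny = asuint (0x1p-23), big = asuint (Inf).  */
  const uint32x4_t tiny = vdupq_n_u32 (0x34000000);
  const uint32x4_t thresh = vdupq_n_u32 (0x7f800000 - 0x34000000);
  uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
  /* ia - tiny wraps around for |x| < tiny, so one compare catches
     |x| < tiny, |x| >= Inf and NaN together.  */
  return vcgeq_u32 (vsubq_u32 (ia, tiny), thresh);
}
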
*/ - float32x4_t x_nospecial = v_zerofy_f32 (x, cmp); - return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp); -} - -/* Vector log1pf approximation using polynomial on reduced interval. Worst-case - error is 1.69 ULP: - _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3 - want 0x1.cfcbdcp-3. */ -VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - uint32x4_t ix = vreinterpretq_u32_f32 (x); - uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - - uint32x4_t special_cases - = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh), - vcgeq_u32 (ix, d->minus_one)); - - if (__glibc_unlikely (v_any_u32 (special_cases))) - return special_case (x, special_cases, d); - - return log1pf_inline (x, &d->d); -} - -#else - const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE; static float32x4_t NOINLINE VPCS_ATTR @@ -78,7 +32,7 @@ special_case (float32x4_t x, uint32x4_t cmp) error is 1.63 ULP: _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3 want 0x1.fdcb16p-3. */ -VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x) { uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)), vcaleq_f32 (x, v_f32 (0x1p127f))); @@ -88,9 +42,6 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) return log1pf_inline (x, ptr_barrier (&data)); } - -#endif - libmvec_hidden_def (V_NAME_F1 (log1p)) HALF_WIDTH_ALIAS_F1 (log1p) strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1)) diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c index e5bb4f0306..e1b3b20f31 100644 --- a/sysdeps/aarch64/fpu/pow_advsimd.c +++ b/sysdeps/aarch64/fpu/pow_advsimd.c @@ -208,23 +208,13 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) uint64x2_t viy = vreinterpretq_u64_f64 (y); uint64x2_t iay = vandq_u64 (viy, d->inf); - /* Special cases of x or y. */ -#if WANT_SIMD_EXCEPT - /* Small or large. */ - uint64x2_t vtopx = vshrq_n_u64 (vix, 52); - uint64x2_t vabstopy = vshrq_n_u64 (iay, 52); - uint64x2_t specialx - = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX); - uint64x2_t specialy - = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY); -#else - /* The case y==0 does not trigger a special case, since in this case it is + /* Special cases of x or y. + The case y==0 does not trigger a special case, since in this case it is necessary to fix the result only if x is a signalling nan, which already triggers a special case. We test y==0 directly in the scalar fallback. */ uint64x2_t iax = vandq_u64 (vix, d->inf); uint64x2_t specialx = vcgeq_u64 (iax, d->inf); uint64x2_t specialy = vcgeq_u64 (iay, d->inf); -#endif uint64x2_t special = vorrq_u64 (specialx, specialy); /* Fallback to scalar on all lanes if any lane is inf or nan. */ if (__glibc_unlikely (v_any_u64 (special))) diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c index 4e5118d338..e8a59a4d39 100644 --- a/sysdeps/aarch64/fpu/sin_advsimd.c +++ b/sysdeps/aarch64/fpu/sin_advsimd.c @@ -36,13 +36,6 @@ static const struct data .pi_3 = V2 (0x1.c1cd129024e09p-106), }; -#if WANT_SIMD_EXCEPT -/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */ -# define TinyBound v_u64 (0x3020000000000000) -/* RangeVal - TinyBound. 
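
The new pow special-lane test keeps only the exponent field and compares
it against Inf. As a standalone sketch:

#include <arm_neon.h>

/* Lanes where v is +/-Inf or NaN: masking with asuint64(Inf) keeps just
   the exponent bits, which are all ones exactly for Inf and NaN.  */
static uint64x2_t
inf_or_nan_sketch (float64x2_t v)
{
  const uint64x2_t inf = vdupq_n_u64 (0x7ff0000000000000);
  uint64x2_t iax = vandq_u64 (vreinterpretq_u64_f64 (v), inf);
  return vcgeq_u64 (iax, inf);
}
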
*/ -# define Thresh v_u64 (0x1160000000000000) -#endif - #define C(i) d->poly[i] static float64x2_t VPCS_ATTR NOINLINE @@ -67,24 +60,14 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; uint64x2_t odd, cmp; -#if WANT_SIMD_EXCEPT - /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be - triggered correctly, set any special lanes to 1 (which is neutral w.r.t. - fenv). These lanes will be fixed by special-case handler later. */ - uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); - cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); - r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp)); -#else - r = x; cmp = vcageq_f64 (x, d->range_val); -#endif /* n = rint(|x|/pi). */ - n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi)); + n = vrndaq_f64 (vmulq_f64 (x, d->inv_pi)); odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (x, d->pi_1, n); r = vfmsq_f64 (r, d->pi_2, n); r = vfmsq_f64 (r, d->pi_3, n); diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c index 4cda651410..25442badfb 100644 --- a/sysdeps/aarch64/fpu/sinf_advsimd.c +++ b/sysdeps/aarch64/fpu/sinf_advsimd.c @@ -36,13 +36,6 @@ static const struct data .range_val = V4 (0x1p20f) }; -#if WANT_SIMD_EXCEPT -/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */ -# define TinyBound v_u32 (0x22000000) -/* RangeVal - TinyBound. */ -# define Thresh v_u32 (0x27800000) -#endif - #define C(i) d->poly[i] static float32x4_t VPCS_ATTR NOINLINE @@ -58,25 +51,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x) const struct data *d = ptr_barrier (&data); float32x4_t n, r, r2, y; uint32x4_t odd, cmp; - -#if WANT_SIMD_EXCEPT - uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - special-case handler later. */ - r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp)); -#else - r = x; cmp = vcageq_f32 (x, d->range_val); -#endif /* n = rint(|x|/pi). */ - n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi)); + n = vrndaq_f32 (vmulq_f32 (x, d->inv_pi)); odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (x, d->pi_1, n); r = vfmsq_f32 (r, d->pi_2, n); r = vfmsq_f32 (r, d->pi_3, n); diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c index 41d1910d44..dc4a637924 100644 --- a/sysdeps/aarch64/fpu/sinhf_advsimd.c +++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c @@ -23,22 +23,11 @@ static const struct data { struct v_expm1f_data expm1f_consts; -#if WANT_SIMD_EXCEPT - uint32x4_t tiny_bound, thresh; -#else float32x4_t oflow_bound; -#endif } data = { .expm1f_consts = V_EXPM1F_DATA, -#if WANT_SIMD_EXCEPT - /* 0x1.6a09e8p-32, below which expm1f underflows. */ - .tiny_bound = V4 (0x2fb504f4), - /* asuint(oflow_bound) - asuint(tiny_bound). */ - .thresh = V4 (0x12fbbbb3), -#else /* 0x1.61814ep+6, above which expm1f helper overflows. 
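
With the masking removed, sin's range reduction now runs directly on x. A
scalar sketch of the three-part reduction (pi_3 is the table value shown
above; pi_1, pi_2 and 1/pi are its standard companions, assumed here):

#include <math.h>
#include <stdint.h>

static double
sin_reduce_sketch (double x, uint64_t *sign_flip)
{
  const double pi_1 = 0x1.921fb54442d18p+1;   /* Assumed high part.  */
  const double pi_2 = 0x1.1a62633145c06p-53;  /* Assumed mid part.  */
  const double pi_3 = 0x1.c1cd129024e09p-106; /* As in the data table.  */
  double n = round (x * 0x1.45f306dc9c883p-2); /* n = rint(x/pi).  */
  /* Odd n lands in a negated half-period; record that as a sign mask
     to XOR into the final polynomial result.  */
  *sign_flip = (uint64_t) (int64_t) n << 63;
  double r = fma (-n, pi_1, x); /* Subtract n*pi one part at a time.  */
  r = fma (-n, pi_2, r);
  r = fma (-n, pi_3, r);
  return r; /* x - n*pi, in [-pi/2, pi/2].  */
}
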
*/
   .oflow_bound = V4 (0x1.61814ep+6),
-#endif
 };
 
 static float32x4_t NOINLINE VPCS_ATTR
@@ -62,13 +51,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
   float32x4_t halfsign = vreinterpretq_f32_u32 (
       vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
 
-#if WANT_SIMD_EXCEPT
-  uint32x4_t special = vcgeq_u32 (
-      vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
-  ax = v_zerofy_f32 (ax, special);
-#else
   uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
-#endif
 
   /* Up to the point that expm1f overflows, we can use it to calculate sinhf
      using a slight rearrangement of the definition of sinh. This allows us
diff --git a/sysdeps/aarch64/fpu/sinpi_advsimd.c b/sysdeps/aarch64/fpu/sinpi_advsimd.c
index 6965644833..1521d4e9ff 100644
--- a/sysdeps/aarch64/fpu/sinpi_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinpi_advsimd.c
@@ -33,20 +33,6 @@ static const struct data
 	    V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
 };
 
-#if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64).  */
-/* asuint64(0x1p64) - TinyBound.  */
-# define Thresh v_u64 (0x07f0000000000000)
-
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
-{
-  /* Fall back to scalar code.  */
-  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
-  return v_call_f64 (sinpi, x, y, cmp);
-}
-#endif
-
 /* Approximation for vector double-precision sinpi(x).
    Maximum Error 3.05 ULP:
   _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1
@@ -54,34 +40,17 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
 float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
-
-#if WANT_SIMD_EXCEPT
-  uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
-  uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
-
-  /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
-     to avoid them under/overflowing and throwing exceptions.  */
-  float64x2_t r = v_zerofy_f64 (x, cmp);
-#else
-  float64x2_t r = x;
-#endif
-
   /* If r is odd, the sign of the result should be inverted.  */
   uint64x2_t odd
-      = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+      = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), 63);
 
   /* r = x - rint(x). Range reduction to -1/2 .. 1/2.  */
-  r = vsubq_f64 (r, vrndaq_f64 (r));
+  float64x2_t r = vsubq_f64 (x, vrndaq_f64 (x));
 
   /* y = sin(r).  */
   float64x2_t r2 = vmulq_f64 (r, r);
   float64x2_t r4 = vmulq_f64 (r2, r2);
   float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
 
-#if WANT_SIMD_EXCEPT
-  if (__glibc_unlikely (v_any_u64 (cmp)))
-    return special_case (x, y, odd, cmp);
-#endif
-
   return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
 }
diff --git a/sysdeps/aarch64/fpu/sinpif_advsimd.c b/sysdeps/aarch64/fpu/sinpif_advsimd.c
index 2e77aaa810..a2aa5fe5ba 100644
--- a/sysdeps/aarch64/fpu/sinpif_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinpif_advsimd.c
@@ -29,19 +29,6 @@ static const struct data
 	    V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
 };
 
-#if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f).  */
-# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound.  */
-
-static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
-{
-  /* Fall back to scalar code.
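
The rearrangement the sinhf comment refers to, written out as a scalar
sketch:

#include <math.h>

static float
sinhf_core_sketch (float x)
{
  float halfsign = copysignf (0.5f, x);
  float t = expm1f (fabsf (x)); /* t = e^|x| - 1.  */
  /* e^|x| - e^-|x| = t + t / (t + 1), since e^-|x| = 1 / (t + 1);
     halve it and restore the sign of x.  Valid below the overflow
     bound checked above.  */
  return (t + t / (t + 1.0f)) * halfsign;
}
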
*/ - y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); - return v_call_f32 (sinpif, x, y, cmp); -} -#endif - /* Approximation for vector single-precision sinpi(x) Maximum Error 3.03 ULP: _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1 @@ -50,36 +37,19 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x) { const struct data *d = ptr_barrier (&data); -#if WANT_SIMD_EXCEPT - uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); - uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); - - /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0 - to avoid them under/overflowing and throwing exceptions. */ - float32x4_t r = v_zerofy_f32 (x, cmp); -#else - float32x4_t r = x; -#endif - /* If r is odd, the sign of the result should be inverted. */ uint32x4_t odd - = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31); + = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 31); /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */ - r = vsubq_f32 (r, vrndaq_f32 (r)); + float32x4_t r = vsubq_f32 (x, vrndaq_f32 (x)); /* Pairwise Horner approximation for y = sin(r * pi). */ float32x4_t r2 = vmulq_f32 (r, r); float32x4_t r4 = vmulq_f32 (r2, r2); float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); -#if WANT_SIMD_EXCEPT - if (__glibc_unlikely (v_any_u32 (cmp))) - return special_case (x, y, odd, cmp); -#endif - return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); } - libmvec_hidden_def (V_NAME_F1 (sinpi)) HALF_WIDTH_ALIAS_F1 (sinpi) diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c index a8adf16a09..fd56ee2e84 100644 --- a/sysdeps/aarch64/fpu/tanf_advsimd.c +++ b/sysdeps/aarch64/fpu/tanf_advsimd.c @@ -25,9 +25,7 @@ static const struct data float32x4_t poly[6]; float pi_consts[4]; float32x4_t shift; -#if !WANT_SIMD_EXCEPT float32x4_t range_val; -#endif } data = { /* Coefficients generated using FPMinimax. */ .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f), @@ -36,15 +34,9 @@ static const struct data .pi_consts = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f }, .shift = V4 (0x1.8p+23f), -#if !WANT_SIMD_EXCEPT .range_val = V4 (0x1p15f), -#endif }; -#define RangeVal v_u32 (0x47000000) /* asuint32(0x1p15f). */ -#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f). */ -#define Thresh v_u32 (0x16000000) /* asuint32(RangeVal) - TinyBound. */ - /* Special cases (fall back to scalar calls). */ static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) @@ -57,15 +49,6 @@ static inline float32x4_t eval_poly (float32x4_t z, const struct data *d) { float32x4_t z2 = vmulq_f32 (z, z); -#if WANT_SIMD_EXCEPT - /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. - If fp exceptions are to be triggered correctly, - sidestep this by fixing such lanes to 0. */ - uint32x4_t will_uflow - = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound); - if (__glibc_unlikely (v_any_u32 (will_uflow))) - z2 = vbslq_f32 (will_uflow, v_f32 (0), z2); -#endif float32x4_t z4 = vmulq_f32 (z2, z2); return v_estrin_5_f32 (z, z2, z4, d->poly); } @@ -79,20 +62,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x) const struct data *d = ptr_barrier (&data); float32x4_t special_arg = x; - /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast - regression. 
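
Both sinpi variants now reduce directly on x. The scalar shape of that
reduction and of the odd-integer sign flip, with sin_poly standing in for
the pairwise-Horner polynomial:

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
sinpi_sketch (double x, double (*sin_poly) (double))
{
  double n = round (x); /* To nearest, ties away, like vcvtaq/vrndaq.  */
  /* sin(pi*(r + n)) = +/-sin(pi*r): odd n flips the sign, so move the
     low bit of n up into the sign-bit position.  */
  uint64_t sign_flip = (uint64_t) (int64_t) n << 63;
  double y = sin_poly (x - n); /* Approximates sin(pi*r) on [-1/2, 1/2].  */
  uint64_t iy;
  memcpy (&iy, &y, sizeof iy);
  iy ^= sign_flip; /* XOR the sign in, as the veorq above does.  */
  memcpy (&y, &iy, sizeof y);
  return y;
}
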
*/ -#if WANT_SIMD_EXCEPT - uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); - /* If fp exceptions are to be triggered correctly, also special-case tiny - input, as this will load to overflow later. Fix any special lanes to 1 to - prevent any exceptions being triggered. */ - uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh); - if (__glibc_unlikely (v_any_u32 (special))) - x = vbslq_f32 (special, v_f32 (1.0f), x); -#else - /* Otherwise, special-case large and special values. */ + /* Special-case large and special values. */ uint32x4_t special = vcageq_f32 (x, d->range_val); -#endif /* n = rint(x/(pi/2)). */ float32x4_t pi_consts = vld1q_f32 (d->pi_consts); diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c index 40acefd752..0adae883d4 100644 --- a/sysdeps/aarch64/fpu/tanh_advsimd.c +++ b/sysdeps/aarch64/fpu/tanh_advsimd.c @@ -48,21 +48,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); - float64x2_t u = x; - /* Trigger special-cases for tiny, boring and infinity/NaN. */ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh); -#if WANT_SIMD_EXCEPT - /* To trigger fp exceptions correctly, set special lanes to a neutral value. - They will be fixed up later by the special-case handler. */ - if (__glibc_unlikely (v_any_u64 (special))) - u = v_zerofy_f64 (u, special); -#endif - - u = vaddq_f64 (u, u); /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - float64x2_t q = expm1_inline (u, &d->d); + float64x2_t twox = vaddq_f64 (x, x); + float64x2_t q = expm1_inline (twox, &d->d); float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0)); if (__glibc_unlikely (v_any_u64 (special))) diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c index f61a456bca..4f5d185777 100644 --- a/sysdeps/aarch64/fpu/tanhf_advsimd.c +++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c @@ -57,17 +57,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 ( sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias))); -#if WANT_SIMD_EXCEPT - /* If fp exceptions are to be triggered properly, set all special and boring - lanes to 0, which will trigger no exceptions, and fix them up later. */ - uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound), - vcltq_u32 (iax, v_u32 (0x34000000))); - x = v_zerofy_f32 (x, is_boring); - if (__glibc_unlikely (v_any_u32 (special))) - x = v_zerofy_f32 (x, special); -#else uint32x4_t special = vcgtq_u32 (iax, d->large_bound); -#endif /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts); diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h index d38f8fb3c5..66d33724dc 100644 --- a/sysdeps/aarch64/fpu/v_log1p_inline.h +++ b/sysdeps/aarch64/fpu/v_log1p_inline.h @@ -113,7 +113,7 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d) /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is that the approximation is solely the polynomial. */ uint64x2_t k0 = vceqzq_f64 (k); - cm = v_zerofy_f64 (cm, k0); + cm = vbslq_f64 (k0, v_f64 (0), cm); f = vbslq_f64 (k0, x, f); #endif diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h index 35d1b56a65..8deb22e1fc 100644 --- a/sysdeps/aarch64/fpu/v_math.h +++ b/sysdeps/aarch64/fpu/v_math.h @@ -108,11 +108,6 @@ v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, p[2] ? 
f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3] }; } -static inline float32x4_t -v_zerofy_f32 (float32x4_t x, uint32x4_t mask) -{ - return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask)); -} static inline float64x2_t v_f64 (double x) @@ -172,10 +167,5 @@ v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2, return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1] }; } -static inline float64x2_t -v_zerofy_f64 (float64x2_t x, uint64x2_t mask) -{ - return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask)); -} #endif diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h index 3af48525a9..2add10fd76 100644 --- a/sysdeps/aarch64/fpu/vecmath_config.h +++ b/sysdeps/aarch64/fpu/vecmath_config.h @@ -21,11 +21,6 @@ #include -/* Deprecated config option from Arm Optimized Routines which ensures - fp exceptions are correctly triggered. This is not intended to be - supported in GLIBC, however we keep it for ease of development. */ -#define WANT_SIMD_EXCEPT 0 - /* Return ptr but hide its value from the compiler so accesses through it cannot be optimized based on the contents. */ #define ptr_barrier(ptr) \
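
The ptr_barrier definition itself is cut off in this hunk. The usual
shape of such a barrier, shown here only as the generic GNU C idiom and
not as the elided glibc text, is a no-op asm that makes the pointer
opaque:

/* Generic sketch: the empty asm forces the compiler to treat __p as
   unknown, so loads through the returned pointer cannot be
   constant-folded from the initializer of the pointed-to data.  */
#define ptr_barrier_sketch(p)                                                 \
  ({                                                                          \
    __typeof (p) __p = (p);                                                   \
    __asm__ ("" : "+r" (__p));                                                \
    __p;                                                                      \
  })

Every routine in this patch applies the barrier the same way, as
const struct data *d = ptr_barrier (&data); before reading coefficients.
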
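
Finally, the identity used by the tanh routines earlier in this patch, in
scalar form (a sketch):

#include <math.h>

/* tanh(x) = (e^2x - 1) / (e^2x + 1); with q = expm1(2x) this becomes
   q / (q + 2), and expm1 avoids the cancellation in e^2x - 1 for
   small x.  */
static double
tanh_core_sketch (double x)
{
  double q = expm1 (2.0 * x);
  return q / (q + 2.0);
}
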