mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-23 01:33:36 +08:00
AArch64: Remove SVE erf and erfc tables
By using a combination of mask-and-add instead of the shift-based index calculation the routines can share the same table as other variants with no performance degradation. The tables change name because of other changes in downstream AOR. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
6d477b8de8
commit
2d82d781a5
@ -41,8 +41,6 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
|
||||
v_log10_data \
|
||||
erf_data \
|
||||
erff_data \
|
||||
sv_erf_data \
|
||||
sv_erff_data \
|
||||
v_exp_tail_data \
|
||||
erfc_data \
|
||||
erfcf_data \
|
||||
|
@ -58,8 +58,8 @@ static inline struct entry
|
||||
lookup (uint64x2_t i)
|
||||
{
|
||||
struct entry e;
|
||||
float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
|
||||
e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
|
||||
e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
|
||||
e.erf = vuzp1q_f64 (e1, e2);
|
||||
e.scale = vuzp2q_f64 (e1, e2);
|
||||
return e;
|
||||
|
@ -19,14 +19,14 @@
|
||||
|
||||
#include "vecmath_config.h"
|
||||
|
||||
/* Lookup table used in erf.
|
||||
/* Lookup table used in vector erf.
|
||||
For each possible rounded input r (multiples of 1/128), between
|
||||
r = 0.0 and r = 6.0 (769 values):
|
||||
- the first entry __erff_data.tab.erf contains the values of erf(r),
|
||||
- the second entry __erff_data.tab.scale contains the values of
|
||||
- the first entry __v_erff_data.tab.erf contains the values of erf(r),
|
||||
- the second entry __v_erff_data.tab.scale contains the values of
|
||||
2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
|
||||
algorithm, since lookup is performed only for x >= 1/64-1/512. */
|
||||
const struct erf_data __erf_data = {
|
||||
const struct v_erf_data __v_erf_data = {
|
||||
.tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 },
|
||||
{ 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 },
|
||||
{ 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 },
|
||||
|
@ -67,14 +67,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
|
||||
svfloat64_t a = svabs_x (pg, x);
|
||||
svfloat64_t shift = sv_f64 (dat->shift);
|
||||
svfloat64_t z = svadd_x (pg, a, shift);
|
||||
svuint64_t i
|
||||
= svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift));
|
||||
svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff);
|
||||
i = svadd_x (pg, i, i);
|
||||
|
||||
/* Lookup without shortcut for small values but with predicate to avoid
|
||||
segfault for large values and NaNs. */
|
||||
svfloat64_t r = svsub_x (pg, z, shift);
|
||||
svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i);
|
||||
svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i);
|
||||
svfloat64_t erfr
|
||||
= svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i);
|
||||
svfloat64_t scale
|
||||
= svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i);
|
||||
|
||||
/* erf(x) ~ erf(r) + scale * d * poly (r, d). */
|
||||
svfloat64_t d = svsub_x (pg, a, r);
|
||||
|
@ -69,9 +69,9 @@ lookup (uint64x2_t i)
|
||||
{
|
||||
struct entry e;
|
||||
float64x2_t e1
|
||||
= vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
|
||||
= vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
|
||||
float64x2_t e2
|
||||
= vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
|
||||
= vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
|
||||
e.erfc = vuzp1q_f64 (e1, e2);
|
||||
e.scale = vuzp2q_f64 (e1, e2);
|
||||
return e;
|
||||
|
@ -19,14 +19,14 @@
|
||||
|
||||
#include "vecmath_config.h"
|
||||
|
||||
/* Lookup table used in erfc.
|
||||
/* Lookup table used in vector erfc.
|
||||
For each possible rounded input r (multiples of 1/128), between
|
||||
r = 0.0 and r = ~27.0 (3488 values):
|
||||
- the first entry __erfc_data.tab.erfc contains the values of erfc(r),
|
||||
- the second entry __erfc_data.tab.scale contains the values of
|
||||
- the first entry __v_erfc_data.tab.erfc contains the values of erfc(r),
|
||||
- the second entry __v_erfc_data.tab.scale contains the values of
|
||||
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
|
||||
they are scaled by a large enough value 2^128 (fits in 8bit). */
|
||||
const struct erfc_data __erfc_data = {
|
||||
const struct v_erfc_data __v_erfc_data = {
|
||||
.tab = { { 0x1p128, 0x1.20dd750429b6dp128 },
|
||||
{ 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 },
|
||||
{ 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 },
|
||||
|
@ -104,7 +104,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
|
||||
|
||||
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
|
||||
i = svadd_x (pg, i, i);
|
||||
const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr;
|
||||
const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr;
|
||||
svfloat64_t erfcr = svld1_gather_index (pg, p, i);
|
||||
svfloat64_t scale = svld1_gather_index (pg, p + 1, i);
|
||||
|
||||
|
@ -62,13 +62,13 @@ lookup (uint32x4_t i)
|
||||
{
|
||||
struct entry e;
|
||||
float32x2_t t0
|
||||
= vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
|
||||
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
|
||||
float32x2_t t1
|
||||
= vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
|
||||
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
|
||||
float32x2_t t2
|
||||
= vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
|
||||
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
|
||||
float32x2_t t3
|
||||
= vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
|
||||
= vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
|
||||
float32x4_t e1 = vcombine_f32 (t0, t1);
|
||||
float32x4_t e2 = vcombine_f32 (t2, t3);
|
||||
e.erfc = vuzp1q_f32 (e1, e2);
|
||||
|
@ -19,14 +19,14 @@
|
||||
|
||||
#include "vecmath_config.h"
|
||||
|
||||
/* Lookup table used in erfcf.
|
||||
/* Lookup table used in vector erfcf.
|
||||
For each possible rounded input r (multiples of 1/64), between
|
||||
r = 0.0 and r = 10.0625 (645 values):
|
||||
- the first entry __erfcf_data.tab.erfc contains the values of erfc(r),
|
||||
- the second entry __erfcf_data.tab.scale contains the values of
|
||||
- the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r),
|
||||
- the second entry __v_erfcf_data.tab.scale contains the values of
|
||||
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
|
||||
they are scaled by a large enough value 2^47 (fits in 8 bits). */
|
||||
const struct erfcf_data __erfcf_data = {
|
||||
const struct v_erfcf_data __v_erfcf_data = {
|
||||
.tab = { { 0x1p47, 0x1.20dd76p47 },
|
||||
{ 0x1.f6f944p46, 0x1.20cb68p47 },
|
||||
{ 0x1.edf3aap46, 0x1.209546p47 },
|
||||
|
@ -77,7 +77,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
|
||||
|
||||
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
|
||||
i = svmul_x (pg, i, 2);
|
||||
const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr;
|
||||
const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
|
||||
svfloat32_t erfcr = svld1_gather_index (pg, p, i);
|
||||
svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
|
||||
|
||||
|
@ -47,10 +47,10 @@ static inline struct entry
|
||||
lookup (uint32x4_t i)
|
||||
{
|
||||
struct entry e;
|
||||
float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
|
||||
float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
|
||||
float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
|
||||
float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
|
||||
float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
|
||||
float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
|
||||
float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
|
||||
float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
|
||||
float32x4_t e1 = vcombine_f32 (t0, t1);
|
||||
float32x4_t e2 = vcombine_f32 (t2, t3);
|
||||
e.erf = vuzp1q_f32 (e1, e2);
|
||||
|
@ -19,14 +19,14 @@
|
||||
|
||||
#include "vecmath_config.h"
|
||||
|
||||
/* Lookup table used in erff.
|
||||
/* Lookup table used in vector erff.
|
||||
For each possible rounded input r (multiples of 1/128), between
|
||||
r = 0.0 and r = 4.0 (513 values):
|
||||
- the first entry __erff_data.tab.erf contains the values of erf(r),
|
||||
- the second entry __erff_data.tab.scale contains the values of
|
||||
- the first entry __v_erff_data.tab.erf contains the values of erf(r),
|
||||
- the second entry __v_erff_data.tab.scale contains the values of
|
||||
2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
|
||||
algorithm, since lookup is performed only for x >= 1/64-1/512. */
|
||||
const struct erff_data __erff_data = {
|
||||
const struct v_erff_data __v_erff_data = {
|
||||
.tab = { { 0x0.000000p+0, 0x1.20dd76p+0 },
|
||||
{ 0x1.20dbf4p-7, 0x1.20d8f2p+0 },
|
||||
{ 0x1.20d770p-6, 0x1.20cb68p+0 },
|
||||
|
@ -62,18 +62,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)
|
||||
|
||||
svfloat32_t shift = sv_f32 (dat->shift);
|
||||
svfloat32_t z = svadd_x (pg, a, shift);
|
||||
svuint32_t i
|
||||
= svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift));
|
||||
|
||||
/* Saturate lookup index. */
|
||||
i = svsel (a_ge_max, sv_u32 (512), i);
|
||||
svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff);
|
||||
i = svadd_x (pg, i, i);
|
||||
|
||||
/* r and erf(r) set to 0 for |x| below min. */
|
||||
svfloat32_t r = svsub_z (a_gt_min, z, shift);
|
||||
svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i);
|
||||
svfloat32_t erfr
|
||||
= svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i);
|
||||
|
||||
/* scale set to 2/sqrt(pi) for |x| below min. */
|
||||
svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i);
|
||||
svfloat32_t scale
|
||||
= svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i);
|
||||
scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));
|
||||
|
||||
/* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -75,49 +75,37 @@ extern const struct v_log10_data
|
||||
} table[1 << V_LOG10_TABLE_BITS];
|
||||
} __v_log10_data attribute_hidden;
|
||||
|
||||
extern const struct erff_data
|
||||
extern const struct v_erff_data
|
||||
{
|
||||
struct
|
||||
{
|
||||
float erf, scale;
|
||||
} tab[513];
|
||||
} __erff_data attribute_hidden;
|
||||
} __v_erff_data attribute_hidden;
|
||||
|
||||
extern const struct sv_erff_data
|
||||
{
|
||||
float erf[513];
|
||||
float scale[513];
|
||||
} __sv_erff_data attribute_hidden;
|
||||
|
||||
extern const struct erf_data
|
||||
extern const struct v_erf_data
|
||||
{
|
||||
struct
|
||||
{
|
||||
double erf, scale;
|
||||
} tab[769];
|
||||
} __erf_data attribute_hidden;
|
||||
} __v_erf_data attribute_hidden;
|
||||
|
||||
extern const struct sv_erf_data
|
||||
{
|
||||
double erf[769];
|
||||
double scale[769];
|
||||
} __sv_erf_data attribute_hidden;
|
||||
|
||||
extern const struct erfc_data
|
||||
extern const struct v_erfc_data
|
||||
{
|
||||
struct
|
||||
{
|
||||
double erfc, scale;
|
||||
} tab[3488];
|
||||
} __erfc_data attribute_hidden;
|
||||
} __v_erfc_data attribute_hidden;
|
||||
|
||||
extern const struct erfcf_data
|
||||
extern const struct v_erfcf_data
|
||||
{
|
||||
struct
|
||||
{
|
||||
float erfc, scale;
|
||||
} tab[645];
|
||||
} __erfcf_data attribute_hidden;
|
||||
} __v_erfcf_data attribute_hidden;
|
||||
|
||||
/* Some data for AdvSIMD and SVE pow's internal exp and log. */
|
||||
#define V_POW_EXP_TABLE_BITS 8
|
||||
|
Loading…
Reference in New Issue
Block a user