AArch64: Remove SVE erf and erfc tables

By using a mask-and-add combination instead of the shift-based
index calculation, the routines can share the same table as other
variants with no performance degradation.

The tables change name because of other changes in downstream AOR.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
Joe Ramsay 2024-11-01 15:48:54 +00:00 committed by Wilco Dijkstra
parent 6d477b8de8
commit 2d82d781a5
16 changed files with 50 additions and 2691 deletions

View File

@@ -41,8 +41,6 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
v_log10_data \ v_log10_data \
erf_data \ erf_data \
erff_data \ erff_data \
sv_erf_data \
sv_erff_data \
v_exp_tail_data \ v_exp_tail_data \
erfc_data \ erfc_data \
erfcf_data \ erfcf_data \

View File

@@ -58,8 +58,8 @@ static inline struct entry
lookup (uint64x2_t i) lookup (uint64x2_t i)
{ {
struct entry e; struct entry e;
float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf), float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf); e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
e.erf = vuzp1q_f64 (e1, e2); e.erf = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2);
return e; return e;

View File

@@ -19,14 +19,14 @@
#include "vecmath_config.h" #include "vecmath_config.h"
/* Lookup table used in erf. /* Lookup table used in vector erf.
For each possible rounded input r (multiples of 1/128), between For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = 6.0 (769 values): r = 0.0 and r = 6.0 (769 values):
- the first entry __erff_data.tab.erf contains the values of erf(r), - the first entry __v_erff_data.tab.erf contains the values of erf(r),
- the second entry __erff_data.tab.scale contains the values of - the second entry __v_erff_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
algorithm, since lookup is performed only for x >= 1/64-1/512. */ algorithm, since lookup is performed only for x >= 1/64-1/512. */
const struct erf_data __erf_data = { const struct v_erf_data __v_erf_data = {
.tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 }, .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 },
{ 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 }, { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 },
{ 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 }, { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 },

View File

@@ -67,14 +67,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
svfloat64_t a = svabs_x (pg, x); svfloat64_t a = svabs_x (pg, x);
svfloat64_t shift = sv_f64 (dat->shift); svfloat64_t shift = sv_f64 (dat->shift);
svfloat64_t z = svadd_x (pg, a, shift); svfloat64_t z = svadd_x (pg, a, shift);
svuint64_t i svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff);
= svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift)); i = svadd_x (pg, i, i);
/* Lookup without shortcut for small values but with predicate to avoid /* Lookup without shortcut for small values but with predicate to avoid
segfault for large values and NaNs. */ segfault for large values and NaNs. */
svfloat64_t r = svsub_x (pg, z, shift); svfloat64_t r = svsub_x (pg, z, shift);
svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i); svfloat64_t erfr
svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i); = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i);
svfloat64_t scale
= svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i);
/* erf(x) ~ erf(r) + scale * d * poly (r, d). */ /* erf(x) ~ erf(r) + scale * d * poly (r, d). */
svfloat64_t d = svsub_x (pg, a, r); svfloat64_t d = svsub_x (pg, a, r);

View File

@@ -69,9 +69,9 @@ lookup (uint64x2_t i)
{ {
struct entry e; struct entry e;
float64x2_t e1 float64x2_t e1
= vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
float64x2_t e2 float64x2_t e2
= vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
e.erfc = vuzp1q_f64 (e1, e2); e.erfc = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2);
return e; return e;

View File

@@ -19,14 +19,14 @@
#include "vecmath_config.h" #include "vecmath_config.h"
/* Lookup table used in erfc. /* Lookup table used in vector erfc.
For each possible rounded input r (multiples of 1/128), between For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = ~27.0 (3488 values): r = 0.0 and r = ~27.0 (3488 values):
- the first entry __erfc_data.tab.erfc contains the values of erfc(r), - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r),
- the second entry __erfc_data.tab.scale contains the values of - the second entry __v_erfc_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
they are scaled by a large enough value 2^128 (fits in 8bit). */ they are scaled by a large enough value 2^128 (fits in 8bit). */
const struct erfc_data __erfc_data = { const struct v_erfc_data __v_erfc_data = {
.tab = { { 0x1p128, 0x1.20dd750429b6dp128 }, .tab = { { 0x1p128, 0x1.20dd750429b6dp128 },
{ 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 }, { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 },
{ 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 }, { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 },

View File

@@ -104,7 +104,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
i = svadd_x (pg, i, i); i = svadd_x (pg, i, i);
const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr; const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr;
svfloat64_t erfcr = svld1_gather_index (pg, p, i); svfloat64_t erfcr = svld1_gather_index (pg, p, i);
svfloat64_t scale = svld1_gather_index (pg, p + 1, i); svfloat64_t scale = svld1_gather_index (pg, p + 1, i);

View File

@@ -62,13 +62,13 @@ lookup (uint32x4_t i)
{ {
struct entry e; struct entry e;
float32x2_t t0 float32x2_t t0
= vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
float32x2_t t1 float32x2_t t1
= vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
float32x2_t t2 float32x2_t t2
= vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
float32x2_t t3 float32x2_t t3
= vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
float32x4_t e1 = vcombine_f32 (t0, t1); float32x4_t e1 = vcombine_f32 (t0, t1);
float32x4_t e2 = vcombine_f32 (t2, t3); float32x4_t e2 = vcombine_f32 (t2, t3);
e.erfc = vuzp1q_f32 (e1, e2); e.erfc = vuzp1q_f32 (e1, e2);

View File

@@ -19,14 +19,14 @@
#include "vecmath_config.h" #include "vecmath_config.h"
/* Lookup table used in erfcf. /* Lookup table used in vector erfcf.
For each possible rounded input r (multiples of 1/64), between For each possible rounded input r (multiples of 1/64), between
r = 0.0 and r = 10.0625 (645 values): r = 0.0 and r = 10.0625 (645 values):
- the first entry __erfcf_data.tab.erfc contains the values of erfc(r), - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r),
- the second entry __erfcf_data.tab.scale contains the values of - the second entry __v_erfcf_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
they are scaled by a large enough value 2^47 (fits in 8 bits). */ they are scaled by a large enough value 2^47 (fits in 8 bits). */
const struct erfcf_data __erfcf_data = { const struct v_erfcf_data __v_erfcf_data = {
.tab = { { 0x1p47, 0x1.20dd76p47 }, .tab = { { 0x1p47, 0x1.20dd76p47 },
{ 0x1.f6f944p46, 0x1.20cb68p47 }, { 0x1.f6f944p46, 0x1.20cb68p47 },
{ 0x1.edf3aap46, 0x1.209546p47 }, { 0x1.edf3aap46, 0x1.209546p47 },

View File

@@ -77,7 +77,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
i = svmul_x (pg, i, 2); i = svmul_x (pg, i, 2);
const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr; const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
svfloat32_t erfcr = svld1_gather_index (pg, p, i); svfloat32_t erfcr = svld1_gather_index (pg, p, i);
svfloat32_t scale = svld1_gather_index (pg, p + 1, i); svfloat32_t scale = svld1_gather_index (pg, p + 1, i);

View File

@@ -47,10 +47,10 @@ static inline struct entry
lookup (uint32x4_t i) lookup (uint32x4_t i)
{ {
struct entry e; struct entry e;
float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf); float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf); float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf); float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf); float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
float32x4_t e1 = vcombine_f32 (t0, t1); float32x4_t e1 = vcombine_f32 (t0, t1);
float32x4_t e2 = vcombine_f32 (t2, t3); float32x4_t e2 = vcombine_f32 (t2, t3);
e.erf = vuzp1q_f32 (e1, e2); e.erf = vuzp1q_f32 (e1, e2);

View File

@@ -19,14 +19,14 @@
#include "vecmath_config.h" #include "vecmath_config.h"
/* Lookup table used in erff. /* Lookup table used in vector erff.
For each possible rounded input r (multiples of 1/128), between For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = 4.0 (513 values): r = 0.0 and r = 4.0 (513 values):
- the first entry __erff_data.tab.erf contains the values of erf(r), - the first entry __v_erff_data.tab.erf contains the values of erf(r),
- the second entry __erff_data.tab.scale contains the values of - the second entry __v_erff_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
algorithm, since lookup is performed only for x >= 1/64-1/512. */ algorithm, since lookup is performed only for x >= 1/64-1/512. */
const struct erff_data __erff_data = { const struct v_erff_data __v_erff_data = {
.tab = { { 0x0.000000p+0, 0x1.20dd76p+0 }, .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 },
{ 0x1.20dbf4p-7, 0x1.20d8f2p+0 }, { 0x1.20dbf4p-7, 0x1.20d8f2p+0 },
{ 0x1.20d770p-6, 0x1.20cb68p+0 }, { 0x1.20d770p-6, 0x1.20cb68p+0 },

View File

@@ -62,18 +62,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)
svfloat32_t shift = sv_f32 (dat->shift); svfloat32_t shift = sv_f32 (dat->shift);
svfloat32_t z = svadd_x (pg, a, shift); svfloat32_t z = svadd_x (pg, a, shift);
svuint32_t i svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff);
= svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift)); i = svadd_x (pg, i, i);
/* Saturate lookup index. */
i = svsel (a_ge_max, sv_u32 (512), i);
/* r and erf(r) set to 0 for |x| below min. */ /* r and erf(r) set to 0 for |x| below min. */
svfloat32_t r = svsub_z (a_gt_min, z, shift); svfloat32_t r = svsub_z (a_gt_min, z, shift);
svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i); svfloat32_t erfr
= svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i);
/* scale set to 2/sqrt(pi) for |x| below min. */ /* scale set to 2/sqrt(pi) for |x| below min. */
svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i); svfloat32_t scale
= svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i);
scale = svsel (a_gt_min, scale, sv_f32 (dat->scale)); scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));
/* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */ /* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */

File diff suppressed because it is too large. [Load Diff]

File diff suppressed because it is too large. [Load Diff]

View File

@@ -75,49 +75,37 @@ extern const struct v_log10_data
} table[1 << V_LOG10_TABLE_BITS]; } table[1 << V_LOG10_TABLE_BITS];
} __v_log10_data attribute_hidden; } __v_log10_data attribute_hidden;
extern const struct erff_data extern const struct v_erff_data
{ {
struct struct
{ {
float erf, scale; float erf, scale;
} tab[513]; } tab[513];
} __erff_data attribute_hidden; } __v_erff_data attribute_hidden;
extern const struct sv_erff_data extern const struct v_erf_data
{
float erf[513];
float scale[513];
} __sv_erff_data attribute_hidden;
extern const struct erf_data
{ {
struct struct
{ {
double erf, scale; double erf, scale;
} tab[769]; } tab[769];
} __erf_data attribute_hidden; } __v_erf_data attribute_hidden;
extern const struct sv_erf_data extern const struct v_erfc_data
{
double erf[769];
double scale[769];
} __sv_erf_data attribute_hidden;
extern const struct erfc_data
{ {
struct struct
{ {
double erfc, scale; double erfc, scale;
} tab[3488]; } tab[3488];
} __erfc_data attribute_hidden; } __v_erfc_data attribute_hidden;
extern const struct erfcf_data extern const struct v_erfcf_data
{ {
struct struct
{ {
float erfc, scale; float erfc, scale;
} tab[645]; } tab[645];
} __erfcf_data attribute_hidden; } __v_erfcf_data attribute_hidden;
/* Some data for AdvSIMD and SVE pow's internal exp and log. */ /* Some data for AdvSIMD and SVE pow's internal exp and log. */
#define V_POW_EXP_TABLE_BITS 8 #define V_POW_EXP_TABLE_BITS 8