x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h

No bug.

This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks, the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium-sized copies (a sketch of the
dispatch this threshold controls follows the results below).

On Skylake with ERMS:

Size,   Align1, Align2, dst>src, (rep movsb) / (vec copy)
4096,   0,      0,      0,      0.975
4096,   0,      0,      1,      0.953
4096,   12,     0,      0,      0.969
4096,   12,     0,      1,      0.872
4096,   44,     0,      0,      0.979
4096,   44,     0,      1,      0.83
4096,   0,      12,     0,      1.006
4096,   0,      12,     1,      0.989
4096,   0,      44,     0,      0.739
4096,   0,      44,     1,      0.942
4096,   12,     12,     0,      1.009
4096,   12,     12,     1,      0.973
4096,   44,     44,     0,      0.791
4096,   44,     44,     1,      0.961
4096,   2048,   0,      0,      0.978
4096,   2048,   0,      1,      0.951
4096,   2060,   0,      0,      0.986
4096,   2060,   0,      1,      0.963
4096,   2048,   12,     0,      0.971
4096,   2048,   12,     1,      0.941
4096,   2060,   12,     0,      0.977
4096,   2060,   12,     1,      0.949
8192,   0,      0,      0,      0.85
8192,   0,      0,      1,      0.845
8192,   13,     0,      0,      0.937
8192,   13,     0,      1,      0.939
8192,   45,     0,      0,      0.932
8192,   45,     0,      1,      0.927
8192,   0,      13,     0,      0.621
8192,   0,      13,     1,      0.62
8192,   0,      45,     0,      0.53
8192,   0,      45,     1,      0.516
8192,   13,     13,     0,      0.664
8192,   13,     13,     1,      0.659
8192,   45,     45,     0,      0.593
8192,   45,     45,     1,      0.575
8192,   2048,   0,      0,      0.854
8192,   2048,   0,      1,      0.834
8192,   2061,   0,      0,      0.863
8192,   2061,   0,      1,      0.857
8192,   2048,   13,     0,      0.63
8192,   2048,   13,     1,      0.629
8192,   2061,   13,     0,      0.627
8192,   2061,   13,     1,      0.62

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
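
For context, the threshold doubled here is the crossover at which glibc's
memcpy/memmove stops using its vector copy loop and switches to REP MOVSB.
The following is a minimal, self-contained sketch of that dispatch, not
glibc's implementation (the real logic is the assembly in
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S); copy_sketch and
REP_MOVSB_THRESHOLD are illustrative names, and x86-64 with GNU-style
inline asm is assumed:

#include <stddef.h>

/* Doubled default for VEC_SIZE == 32 (AVX): 4096 * (32 / 16) == 8192.  */
#define REP_MOVSB_THRESHOLD (4096 * (32 / 16))

static void *
copy_sketch (void *dst, const void *src, size_t n)
{
  if (n >= REP_MOVSB_THRESHOLD)
    {
      /* Large copies: the microcoded string move amortizes its setup
         cost.  DF is clear per the psABI, so this copies forward.  */
      void *ret = dst;
      __asm__ volatile ("rep movsb"
                        : "+D" (dst), "+S" (src), "+c" (n) : : "memory");
      return ret;
    }
  /* Medium and small copies: the vector loop (a plain byte loop stands
     in here for the 4x VEC, 4k-aliasing-aware loop) avoids the REP
     MOVSB startup overhead.  */
  char *d = dst;
  const char *s = src;
  while (n-- > 0)
    *d++ = *s++;
  return dst;
}
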
2 files changed, 20 insertions(+), 14 deletions(-)

--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
+     threshold is 2048 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                     AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
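
Concretely, the new formula gives these defaults (the VEC_SIZE == 16 /
SSE2 path, not shown in this hunk, keeps the old 2048-byte value, per
the updated comment):

  AVX512 (VEC_SIZE == 64): 4096 * (64 / 16) = 16384 bytes (16 KB)
  AVX    (VEC_SIZE == 32): 4096 * (32 / 16) =  8192 bytes (8 KB)
  SSE2   (VEC_SIZE == 16): 2048 * (16 / 16) =  2048 bytes (2 KB)

The run-time minimums of 8 * VEC_SIZE are unchanged: 512 bytes for the
AVX512 path and 256 bytes for the AVX path.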

--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
     }
     x86_rep_movsb_threshold {
       type: SIZE_T
-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
-      # isn't faster on short data.  The memcpy micro benchmark in glibc
-      # shows that 2KB is the approximate value above which REP MOVSB
-      # becomes faster than SSE2 optimization on processors with Enhanced
-      # REP MOVSB.  Since larger register size can move more data with a
-      # single load and store, the threshold is higher with larger register
-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
-      # times of vector size and the default value is 2048 * (vector size
-      # / 16), the default value and the minimum value must be updated at
-      # run-time.  NB: Don't set the default value since we can't tell if
-      # the tunable value is set by user or not [BZ #27069].
+      # Since there is overhead to set up REP MOVSB operation, REP
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
+      # in glibc shows that 2KB is the approximate value above which
+      # REP MOVSB becomes faster than SSE2 optimization on processors
+      # with Enhanced REP MOVSB.  Since larger register size can move
+      # more data with a single load and store, the threshold is
+      # higher with larger register size.  Micro benchmarks show AVX
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
+      # threshold is universally set at 2112 bytes.  Note: Since the
+      # REP MOVSB threshold must be greater than 8 times of vector
+      # size and the default value is 4096 * (vector size / 16), the
+      # default value and the minimum value must be updated at
+      # run-time.  NB: Don't set the default value since we can't tell
+      # if the tunable value is set by user or not [BZ #27069].
       minval: 1
     }
     x86_rep_stosb_threshold {
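
Because the default is computed at startup rather than hard-coded here
(see the BZ #27069 note above), the new crossovers can still be
overridden per process through the existing tunables mechanism, for
example (./app is a placeholder for the target program):

  GLIBC_TUNABLES=glibc.cpu.x86_rep_movsb_threshold=1048576 ./app

The value remains subject to the run-time minimum of 8 times the vector
size noted in the comment above.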