aarch64: Remove non-temporal load/stores from oryon-1's memcpy

The hardware architects have a new recommendation not to use
non-temporal loads/stores for memcpy, so this patch removes that path.
In my measurements there was also no difference in memcpy speed with
or without the non-temporal loads/stores.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
This commit is contained in:
Andrew Pinski 2024-11-14 19:03:19 -08:00 committed by Adhemerval Zanella
parent 3051f3495c
commit eb5eeb4740

View File

@@ -160,46 +160,6 @@ L(copy96):
.p2align 6
L(copy_long):
/* Large-copy entry point.

   Copies whose count is at least 32768 take the ldnp/stnp path that
   follows; anything smaller branches to L(copy_long_without_nontemp).

   Rationale recorded by the original author: on oryon1 cores, large
   memcpys are helped by using ldnp/stnp.  This loop is identical to
   the one below it but uses ldnp/stnp instructions.  For copies of
   fewer than 32768 bytes the ldnp/stnp instructions do not help and
   cause a slowdown, so the ldnp/stnp loop is used only for the
   largest sizes.

   NOTE(review): the accompanying commit message says the hardware
   architects now recommend against non-temporal loads/stores for
   memcpy and that no speed difference was measured -- confirm whether
   this path should still exist.  */
cmp count, #32768
/* Unsigned "lower": count < 32768 uses the regular ldp/stp path.  */
b.lo L(copy_long_without_nontemp)
/* tmp1 = low four bits of dstin, i.e. its offset within a 16-byte
   block; dst = dstin rounded down to a 16-byte boundary.  */
and tmp1, dstin, 15
bic dst, dstin, 15
/* Fetch the first register pair from the unadjusted src before src is
   moved back below.  */
ldnp D_l, D_h, [src]
/* Move src back by the same offset that was stripped from dst, so
   equal offsets from src and dst address corresponding data, and grow
   count by that offset to compensate.  */
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldnp A_l, A_h, [src, 16]
/* Store the first pair at the original (possibly unaligned) dstin.  */
stnp D_l, D_h, [dstin]
/* Preload the remaining pairs for the first loop iteration from
   src + 32, src + 48 and src + 64, then advance src by 64.  */
ldnp B_l, B_h, [src, 32]
ldnp C_l, C_h, [src, 48]
ldnp D_l, D_h, [src, 64]
add src, src, #64
/* The flags set here are not read in this path: the next conditional
   branch (b.hi at the loop bottom) follows the loop's own subs.  */
subs count, count, 128 + 16 /* Test and readjust count. */
L(nontemp_loop64):
/* The target 1: is the very next line, so execution continues at the
   same instruction whether or not bit 6 of src is clear; this test
   has no effect on control flow.  */
tbz src, #6, 1f
1:
/* Each iteration stores the four pairs loaded previously to
   dst + 16 .. dst + 64 and, interleaved with those stores, loads the
   next four pairs from src + 16 .. src + 64.  */
stnp A_l, A_h, [dst, 16]
ldnp A_l, A_h, [src, 16]
stnp B_l, B_h, [dst, 32]
ldnp B_l, B_h, [src, 32]
stnp C_l, C_h, [dst, 48]
ldnp C_l, C_h, [src, 48]
stnp D_l, D_h, [dst, 64]
ldnp D_l, D_h, [src, 64]
/* Advance both pointers by 64 and decrease count by 64.  */
add src, src, #64
add dst, dst, #64
subs count, count, 64
/* Repeat while the subtraction above leaves count unsigned-higher
   than zero.  */
b.hi L(nontemp_loop64)
/* A..D still hold loaded pairs that have not been stored; control
   passes to L(last64), which is defined outside this excerpt
   (presumably it writes them and the tail -- verify there).  */
b L(last64)
L(copy_long_without_nontemp):
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]