aarch64: Remove non-temporal load/stores from oryon-1's memcpy

The hardware architects now recommend against using non-temporal load/stores for memcpy, so this patch removes that path (the ldnp/stnp loop previously taken for copies of 32768 bytes or more). I also found no difference in memcpy speed with or without the non-temporal load/stores.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
2024-11-26 19:23:34 +08:00 · 2024-11-14 19:03:19 -08:00 · 2024-11-14 19:03:19 -08:00 · eb5eeb4740
commit eb5eeb4740
parent 3051f3495c
1 changed files with 0 additions and 40 deletions
--- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
@@ -160,46 +160,6 @@ L(copy96):
 	.p2align 6
 L(copy_long):

-	/* On oryon1 cores, large memcpy's are helped by using ldnp/stnp.
-	   This loop is identical to the one below it but using ldnp/stnp
-	   instructions.  For loops that are less than 32768 bytes,
-	   the ldnp/stnp instructions will not help and will cause a slow
-	   down so only use the ldnp/stnp loop for the largest sizes.  */
-
-	cmp	count, #32768
-	b.lo	L(copy_long_without_nontemp)
-	and	tmp1, dstin, 15
-	bic	dst, dstin, 15
-	ldnp	D_l, D_h, [src]
-	sub	src, src, tmp1
-	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldnp	A_l, A_h, [src, 16]
-	stnp	D_l, D_h, [dstin]
-	ldnp	B_l, B_h, [src, 32]
-	ldnp	C_l, C_h, [src, 48]
-	ldnp	D_l, D_h, [src, 64]
-	add	src, src, #64
-	subs	count, count, 128 + 16	/* Test and readjust count.  */
-
-L(nontemp_loop64):
-	tbz	src, #6, 1f
-1:
-	stnp	A_l, A_h, [dst, 16]
-	ldnp	A_l, A_h, [src, 16]
-	stnp	B_l, B_h, [dst, 32]
-	ldnp	B_l, B_h, [src, 32]
-	stnp	C_l, C_h, [dst, 48]
-	ldnp	C_l, C_h, [src, 48]
-	stnp	D_l, D_h, [dst, 64]
-	ldnp	D_l, D_h, [src, 64]
-	add	src, src, #64
-	add	dst, dst, #64
-	subs	count, count, 64
-	b.hi	L(nontemp_loop64)
-	b	L(last64)
-
-L(copy_long_without_nontemp):
-
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src]