AArch64: Optimize memcmp

Rewrite memcmp to improve performance. On small and medium inputs performance
is 10-20% better. Large inputs use a SIMD loop processing 64 bytes per
iteration, which is 30-50% faster depending on the size.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
Author: Wilco Dijkstra
Date:   2021-12-02 18:30:55 +00:00
Parent: d120fb9941
Commit: b51eb35c57
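
For orientation (not part of the commit): the "SIMD loop processing 64 bytes per
iteration" mentioned above is the new L(loop64) in the diff below. It XORs four
16-byte blocks from each source and folds the results with pairwise byte maxima
(UMAXP), so a single 64-bit scalar is zero exactly when all 64 bytes match. A
minimal C sketch of that idea using NEON intrinsics follows; the function name is
made up, and the real loop additionally software-pipelines the loads and keeps
src2 16-byte aligned.

/* Illustrative sketch only, not from the commit.  */
#include <arm_neon.h>
#include <stdint.h>

static inline uint64_t
diff_summary_64 (const unsigned char *p, const unsigned char *q)
{
  /* XOR the four 16-byte blocks; any nonzero byte marks a difference.  */
  uint8x16_t d0 = veorq_u8 (vld1q_u8 (p), vld1q_u8 (q));
  uint8x16_t d1 = veorq_u8 (vld1q_u8 (p + 16), vld1q_u8 (q + 16));
  uint8x16_t d2 = veorq_u8 (vld1q_u8 (p + 32), vld1q_u8 (q + 32));
  uint8x16_t d3 = veorq_u8 (vld1q_u8 (p + 48), vld1q_u8 (q + 48));

  /* Fold with pairwise maxima (UMAXP); after three reductions each byte of
     the low 64 bits summarizes one 8-byte chunk of the 64 bytes compared.  */
  uint8x16_t m = vpmaxq_u8 (vpmaxq_u8 (d0, d1), vpmaxq_u8 (d2, d3));
  m = vpmaxq_u8 (m, m);

  /* Zero iff all 64 bytes are equal (the assembly checks this via
     fmov tmp, d0 and loops while it is zero).  */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (m), 0);
}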

--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,105 +22,79 @@
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  */
 /* Parameters and result.  */
-#define src1		x0
-#define src2		x1
-#define limit		x2
-#define result		w0
+#define src1	x0
+#define src2	x1
+#define limit	x2
+#define result	w0
 /* Internal variables.  */
-#define data1		x3
-#define data1w		w3
-#define data1h		x4
-#define data2		x5
-#define data2w		w5
-#define data2h		x6
-#define tmp1		x7
-#define tmp2		x8
+#define data1	x3
+#define data1w	w3
+#define data2	x4
+#define data2w	w4
+#define data3	x5
+#define data3w	w5
+#define data4	x6
+#define data4w	w6
+#define tmp	x6
+#define src1end	x7
+#define src2end	x8
-ENTRY_ALIGN (memcmp, 6)
+ENTRY (memcmp)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
-	subs	limit, limit, 16
+	cmp	limit, 16
 	b.lo	L(less16)
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
+	ldp	data1, data3, [src1]
+	ldp	data2, data4, [src2]
 	ccmp	data1, data2, 0, ne
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
-	subs	limit, limit, 16
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	cmp	limit, 32
 	b.ls	L(last_bytes)
-	cmp	limit, 112
-	b.lo	L(loop16)
+	cmp	limit, 160
+	b.hs	L(loop_align)
+	sub	limit, limit, 32
-	and	tmp1, src1, 15
-	add	limit, limit, tmp1
-	sub	src1, src1, tmp1
-	sub	src2, src2, tmp1
-	subs	limit, limit, 48
-	/* Compare 128 up bytes using aligned access.  */
 	.p2align 4
-L(loop64):
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
+L(loop32):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
 	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	cmp	limit, 16
+	b.ls	L(last_bytes)
-	ldp	data1, data1h, [src1, 16]
-	ldp	data2, data2h, [src2, 16]
+	ldp	data1, data3, [src1, 32]
+	ldp	data2, data4, [src2, 32]
 	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	add	src1, src1, 32
+	add	src2, src2, 32
+L(last64):
+	subs	limit, limit, 32
+	b.hi	L(loop32)
-	ldp	data1, data1h, [src1, 32]
-	ldp	data2, data2h, [src2, 32]
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-	ldp	data1, data1h, [src1, 48]
-	ldp	data2, data2h, [src2, 48]
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-	subs	limit, limit, 64
-	add	src1, src1, 64
-	add	src2, src2, 64
-	b.pl	L(loop64)
-	adds	limit, limit, 48
-	b.lo	L(last_bytes)
-L(loop16):
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-	subs	limit, limit, 16
-	b.hi	L(loop16)
 	/* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
-	add	src1, src1, limit
-	add	src2, src2, limit
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
+	ldp	data1, data3, [src1end, -16]
+	ldp	data2, data4, [src2end, -16]
+L(return2):
+	cmp	data1, data2
+	csel	data1, data1, data3, ne
+	csel	data2, data2, data4, ne
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
-L(return64):
-	cmp	data1, data2
-	csel	data1, data1, data1h, ne
-	csel	data2, data2, data2h, ne
 L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
@@ -133,45 +107,98 @@ L(return):
 	.p2align 4
 L(less16):
-	adds	limit, limit, 8
-	b.lo	L(less8)	//lo:<
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	tbz	limit, 3, L(less8)
 	ldr	data1, [src1]
 	ldr	data2, [src2]
-	/* equal 8 optimized */
-	ccmp	data1, data2, 0, ne
-	b.ne	L(return)
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	b	L(return)
+	ldr	data3, [src1end, -8]
+	ldr	data4, [src2end, -8]
+	b	L(return2)
 	.p2align 4
 L(less8):
-	adds	limit, limit, 4
-	b.lo	L(less4)
+	tbz	limit, 2, L(less4)
 	ldr	data1w, [src1]
 	ldr	data2w, [src2]
-	ccmp	data1w, data2w, 0, ne
+	ldr	data3w, [src1end, -4]
+	ldr	data4w, [src2end, -4]
+	b	L(return2)
+L(less4):
+	tbz	limit, 1, L(less2)
+	ldrh	data1w, [src1]
+	ldrh	data2w, [src2]
+	cmp	data1w, data2w
 	b.ne	L(return)
-	ldr	data1w, [src1, limit]
-	ldr	data2w, [src2, limit]
-	b	L(return)
+L(less2):
+	mov	result, 0
+	tbz	limit, 0, L(return_zero)
+	ldrb	data1w, [src1end, -1]
+	ldrb	data2w, [src2end, -1]
+	sub	result, data1w, data2w
+L(return_zero):
+	ret
+L(loop_align):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	/* Align src2 and adjust src1, src2 and limit.  */
+	and	tmp, src2, 15
+	sub	tmp, tmp, 16
+	sub	src2, src2, tmp
+	add	limit, limit, tmp
+	sub	src1, src1, tmp
+	sub	limit, limit, 64 + 16
 	.p2align 4
-L(less4):
-	adds	limit, limit, 4
-	b.eq	L(ret_0)
+L(loop64):
+	ldr	q0, [src1, 16]
+	ldr	q1, [src2, 16]
+	subs	limit, limit, 64
+	ldr	q2, [src1, 32]
+	ldr	q3, [src2, 32]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v1.16b, v2.16b, v3.16b
+	ldr	q2, [src1, 48]
+	ldr	q3, [src2, 48]
+	umaxp	v0.16b, v0.16b, v1.16b
+	ldr	q4, [src1, 64]!
+	ldr	q5, [src2, 64]!
+	eor	v1.16b, v2.16b, v3.16b
+	eor	v2.16b, v4.16b, v5.16b
+	umaxp	v1.16b, v1.16b, v2.16b
+	umaxp	v0.16b, v0.16b, v1.16b
+	umaxp	v0.16b, v0.16b, v0.16b
+	fmov	tmp, d0
+	ccmp	tmp, 0, 0, hi
+	b.eq	L(loop64)
-L(byte_loop):
-	ldrb	data1w, [src1], 1
-	ldrb	data2w, [src2], 1
-	subs	limit, limit, 1
-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	L(byte_loop)
-	sub	result, data1w, data2w
-	ret
-L(ret_0):
-	mov	result, 0
+	/* If equal, process last 1-64 bytes using scalar loop.  */
+	add	limit, limit, 64 + 16
+	cbz	tmp, L(last64)
+	/* Determine the 8-byte aligned offset of the first difference.  */
+#ifdef __AARCH64EB__
+	rev16	tmp, tmp
+#endif
+	rev	tmp, tmp
+	clz	tmp, tmp
+	bic	tmp, tmp, 7
+	sub	tmp, tmp, 48
+	ldr	data1, [src1, tmp]
+	ldr	data2, [src2, tmp]
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	mov	result, 1
+	cmp	data1, data2
+	cneg	result, result, lo
 	ret
 END (memcmp)
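
For orientation (not part of the commit): when the 64-bit summary produced by
the UMAXP reductions in L(loop64) is nonzero, the tail code locates the first
differing 8-byte chunk with the rev/clz/bic sequence before reloading and
comparing the two words. A C equivalent of that step, assuming little-endian
and a hypothetical function name:

/* Illustrative sketch only, not from the commit.  Summary must be nonzero;
   byte i is nonzero iff the i-th 8-byte chunk of the 64 bytes differed.  */
#include <stdint.h>

static inline int
first_diff_chunk_offset (uint64_t summary)
{
  /* Byte-reverse so the chunk at the lowest address becomes the most
     significant byte, count leading zero bits, then round down to a
     multiple of 8: the result is 8 * (index of the first nonzero byte).  */
  uint64_t r = __builtin_bswap64 (summary);
  return __builtin_clzll (r) & ~7;
}

The assembly then subtracts 48 from this offset because src1 and src2 have
already been advanced 48 bytes past the first of the four blocks just compared,
reloads the two 8-byte words at that offset, byte-reverses them on little-endian
and compares them unsigned to produce the 1/-1 result.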