x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S

This commit doesn't affect libc.so.6, it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
This commit is contained in:
Noah Goldstein 2022-07-12 12:29:07 -07:00
parent 72a48ec0f7
commit 64479f11b7
2 changed files with 138 additions and 142 deletions

View File

@ -17,14 +17,141 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# define __wcschr __wcschr_sse2
# undef weak_alias
# define weak_alias(__wcschr, wcschr)
# undef libc_hidden_def
# define libc_hidden_def(__wcschr)
# undef libc_hidden_weak
# define libc_hidden_weak(wcschr)
# ifndef WCSCHR
# define WCSCHR __wcschr_sse2
# endif
#endif
#include "../wcschr.S"
#include <sysdep.h>
.text
/* WCSCHR: scan a string of 32-bit elements for the first element equal
   to a given value, stopping at the first zero element.

   In:    %rdi = address to start scanning from
          %rsi = value to search for (only the low 32 bits are used)
   Out:   %rax = address of the first matching element, or 0 if a zero
          element is reached first.  If the searched value is itself
          zero, the address of the terminating zero element is returned.
   Writes: %rax, %rcx, %rdx, %rdi, %xmm0, %xmm1, %xmm2, flags.
   NOTE(review): register roles look like the SysV AMD64 argument
   order (wcschr (const wchar_t *s, wchar_t c)) -- confirm.

   Register roles throughout:
     %xmm1 = search value broadcast to all four 32-bit lanes
     %xmm2 = zero before each compare; afterwards the "lane == 0" mask
     %xmm0 = loaded data, then the "lane == search value" mask
     %rax  = byte mask of matches, %rdx = byte mask of zero lanes.  */
ENTRY (WCSCHR)
movd %rsi, %xmm1
pxor %xmm2, %xmm2
mov %rdi, %rcx
/* Two self-interleaves replicate lane 0 of %xmm1 into all four lanes.  */
punpckldq %xmm1, %xmm1
punpckldq %xmm1, %xmm1
/* %rcx = offset of the start address inside its 64-byte block.  An
   offset above 48 means a 16-byte load from %rdi would run past the
   end of that block (presumably a cache line, per the label name), so
   take the aligned-load path instead.  */
and $63, %rcx
cmp $48, %rcx
ja L(cross_cache)
/* First 16 bytes, loaded unaligned straight from the start address.  */
movdqu (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
/* Nonzero if any lane matched or was zero.  %rax keeps the match mask;
   %rdx is overwritten and gets recomputed at L(matches).  */
or %rax, %rdx
jnz L(matches)
/* Nothing found, so %xmm2 is still all-zero.  Round the advanced
   pointer down to a 16-byte boundary; the next aligned block may
   overlap bytes that were already examined above.  */
and $-16, %rdi
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
jmp L(loop)
L(cross_cache):
/* %rcx = misalignment within 16 bytes; %rdi = enclosing aligned block.
   The aligned load also picks up %rcx bytes located before the start
   address, which must be ignored.  */
and $15, %rcx
and $-16, %rdi
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
/* Shift out the mask bits that belong to bytes before the start, so
   bit 0 of each mask now corresponds to the start address.  */
sar %cl, %rdx
sar %cl, %rax
test %rax, %rax
je L(unaligned_no_match)
/* %rax = byte offset of the first match relative to the start.  */
bsf %rax, %rax
test %rdx, %rdx
je L(unaligned_match)
/* Both a match and a zero lane are present: the match only counts if
   it is not located after the first zero lane (unsigned compare).  */
bsf %rdx, %rdx
cmp %rdx, %rax
ja L(return_null)
L(unaligned_match):
/* Result = aligned base + misalignment + offset of the match.  */
add %rdi, %rax
add %rcx, %rax
ret
.p2align 4
L(unaligned_no_match):
/* No match in this block: a zero lane here means the value is absent.  */
test %rdx, %rdx
jne L(return_null)
/* %xmm2 may still have lanes set for the ignored bytes before the
   start address, so clear it before entering the loop.  */
pxor %xmm2, %xmm2
add $16, %rdi
.p2align 4
/* Loop start on aligned string. */
/* Four identical 16-byte steps per iteration.  Each step advances %rdi
   before branching, so L(matches) sees %rdi 16 bytes past the block
   that produced the masks.  %xmm2 is all-zero at the start of every
   step because a step only falls through when its zero mask was 0.  */
L(loop):
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
jmp L(loop)
.p2align 4
/* Reached with %rax = match mask, %xmm2 = zero-lane mask and %rdi
   already advanced 16 bytes past the block being examined.  */
L(matches):
/* Recompute the zero-lane mask; %rdx was overwritten by the OR.  */
pmovmskb %xmm2, %rdx
/* No match bit set means only a zero lane brought us here.  */
test %rax, %rax
jz L(return_null)
bsf %rax, %rax
test %rdx, %rdx
je L(match)
/* A match located after the first zero lane does not count.  Equal
   offsets (the searched value is zero) still fall through to the
   match.  */
bsf %rdx, %rcx
cmp %rcx, %rax
ja L(return_null)
L(match):
/* Undo the advance, then add the byte offset of the match.  */
sub $16, %rdi
add %rdi, %rax
ret
.p2align 4
L(return_null):
/* Not found: return 0.  */
xor %rax, %rax
ret
END (WCSCHR)

View File

@ -16,140 +16,9 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
.text
/* __wcschr: scan a string of 32-bit elements for the first element
   equal to a given value, stopping at the first zero element.

   In:    %rdi = address to start scanning from
          %rsi = value to search for (only the low 32 bits are used)
   Out:   %rax = address of the first matching element, or 0 if a zero
          element is reached first.  If the searched value is itself
          zero, the address of the terminating zero element is returned.
   Writes: %rax, %rcx, %rdx, %rdi, %xmm0, %xmm1, %xmm2, flags.
   NOTE(review): register roles look like the SysV AMD64 argument
   order (wcschr (const wchar_t *s, wchar_t c)) -- confirm.

   Register roles throughout:
     %xmm1 = search value broadcast to all four 32-bit lanes
     %xmm2 = zero before each compare; afterwards the "lane == 0" mask
     %xmm0 = loaded data, then the "lane == search value" mask
     %rax  = byte mask of matches, %rdx = byte mask of zero lanes.  */
ENTRY (__wcschr)
movd %rsi, %xmm1
pxor %xmm2, %xmm2
mov %rdi, %rcx
/* Two self-interleaves replicate lane 0 of %xmm1 into all four lanes.  */
punpckldq %xmm1, %xmm1
punpckldq %xmm1, %xmm1
/* %rcx = offset of the start address inside its 64-byte block.  An
   offset above 48 means a 16-byte load from %rdi would run past the
   end of that block (presumably a cache line, per the label name), so
   take the aligned-load path instead.  */
and $63, %rcx
cmp $48, %rcx
ja L(cross_cache)
/* First 16 bytes, loaded unaligned straight from the start address.  */
movdqu (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
/* Nonzero if any lane matched or was zero.  %rax keeps the match mask;
   %rdx is overwritten and gets recomputed at L(matches).  */
or %rax, %rdx
jnz L(matches)
/* Nothing found, so %xmm2 is still all-zero.  Round the advanced
   pointer down to a 16-byte boundary; the next aligned block may
   overlap bytes that were already examined above.  */
and $-16, %rdi
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
jmp L(loop)
L(cross_cache):
/* %rcx = misalignment within 16 bytes; %rdi = enclosing aligned block.
   The aligned load also picks up %rcx bytes located before the start
   address, which must be ignored.  */
and $15, %rcx
and $-16, %rdi
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
/* Shift out the mask bits that belong to bytes before the start, so
   bit 0 of each mask now corresponds to the start address.  */
sar %cl, %rdx
sar %cl, %rax
test %rax, %rax
je L(unaligned_no_match)
/* %rax = byte offset of the first match relative to the start.  */
bsf %rax, %rax
test %rdx, %rdx
je L(unaligned_match)
/* Both a match and a zero lane are present: the match only counts if
   it is not located after the first zero lane (unsigned compare).  */
bsf %rdx, %rdx
cmp %rdx, %rax
ja L(return_null)
L(unaligned_match):
/* Result = aligned base + misalignment + offset of the match.  */
add %rdi, %rax
add %rcx, %rax
ret
.p2align 4
L(unaligned_no_match):
/* No match in this block: a zero lane here means the value is absent.  */
test %rdx, %rdx
jne L(return_null)
/* %xmm2 may still have lanes set for the ignored bytes before the
   start address, so clear it before entering the loop.  */
pxor %xmm2, %xmm2
add $16, %rdi
.p2align 4
/* Loop start on aligned string. */
/* Four identical 16-byte steps per iteration.  Each step advances %rdi
   before branching, so L(matches) sees %rdi 16 bytes past the block
   that produced the masks.  %xmm2 is all-zero at the start of every
   step because a step only falls through when its zero mask was 0.  */
L(loop):
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
movdqa (%rdi), %xmm0
pcmpeqd %xmm0, %xmm2
add $16, %rdi
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm2, %rdx
pmovmskb %xmm0, %rax
or %rax, %rdx
jnz L(matches)
jmp L(loop)
.p2align 4
/* Reached with %rax = match mask, %xmm2 = zero-lane mask and %rdi
   already advanced 16 bytes past the block being examined.  */
L(matches):
/* Recompute the zero-lane mask; %rdx was overwritten by the OR.  */
pmovmskb %xmm2, %rdx
/* No match bit set means only a zero lane brought us here.  */
test %rax, %rax
jz L(return_null)
bsf %rax, %rax
test %rdx, %rdx
je L(match)
/* A match located after the first zero lane does not count.  Equal
   offsets (the searched value is zero) still fall through to the
   match.  */
bsf %rdx, %rcx
cmp %rcx, %rax
ja L(return_null)
L(match):
/* Undo the advance, then add the byte offset of the match.  */
sub $16, %rdi
add %rdi, %rax
ret
.p2align 4
L(return_null):
/* Not found: return 0.  */
xor %rax, %rax
ret
END (__wcschr)
#define WCSCHR __wcschr
#include "multiarch/wcschr-sse2.S"
libc_hidden_def(__wcschr)
weak_alias (__wcschr, wcschr)
libc_hidden_weak (wcschr)