diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S
index 218ea609b9..c872926ba9 100644
--- a/sysdeps/x86_64/multiarch/wcschr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S
@@ -17,14 +17,141 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __wcschr __wcschr_sse2
-
-# undef weak_alias
-# define weak_alias(__wcschr, wcschr)
-# undef libc_hidden_def
-# define libc_hidden_def(__wcschr)
-# undef libc_hidden_weak
-# define libc_hidden_weak(wcschr)
+# ifndef WCSCHR
+#  define WCSCHR __wcschr_sse2
+# endif
 #endif
 
-#include "../wcschr.S"
+#include <sysdep.h>
+
+	.text
+ENTRY (WCSCHR)	/* %rdi = string, %rsi = wide char; result in %rax.  */
+
+	movd	%rsi, %xmm1	/* xmm1 = character to find.  */
+	pxor	%xmm2, %xmm2	/* xmm2 = 0, compared against to find NUL.  */
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	punpckldq %xmm1, %xmm1	/* Character now in all four dwords.  */
+
+	and	$63, %rcx	/* rcx = offset within a 64-byte block.  */
+	cmp	$48, %rcx
+	ja	L(cross_cache)	/* 16-byte load would cross that block.  */
+
+	movdqu	(%rdi), %xmm0	/* First 16 bytes, unaligned.  */
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 = lanes holding NUL.  */
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0	/* xmm0 = lanes holding the character.  */
+	pmovmskb %xmm2, %rdx	/* rdx = NUL byte mask.  */
+	pmovmskb %xmm0, %rax	/* rax = match byte mask.  */
+	or	%rax, %rdx
+	jnz	L(matches)	/* Found a match or the terminator.  */
+
+	and	$-16, %rdi	/* Round down to a 16-byte boundary.  */
+
+	movdqa	(%rdi), %xmm0	/* Aligned block; may re-check some bytes.  */
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	jmp	L(loop)
+
+L(cross_cache):
+	and	$15, %rcx	/* rcx = misalignment within 16 bytes.  */
+	and	$-16, %rdi	/* Load the aligned block holding the start.  */
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+
+	sar	%cl, %rdx	/* Drop mask bits for bytes before the start.  */
+	sar	%cl, %rax
+	test	%rax, %rax
+	je	L(unaligned_no_match)
+
+	bsf	%rax, %rax	/* rax = byte offset of first match.  */
+	test	%rdx, %rdx
+	je	L(unaligned_match)	/* No NUL in this block.  */
+	bsf	%rdx, %rdx	/* rdx = byte offset of first NUL.  */
+	cmp	%rdx, %rax
+	ja	L(return_null)	/* Match lies past the terminator.  */
+
+L(unaligned_match):
+	add	%rdi, %rax
+	add	%rcx, %rax	/* Re-add the offset shifted out above.  */
+	ret
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%rdx, %rdx
+	jne	L(return_null)	/* Terminator reached without a match.  */
+	pxor	%xmm2, %xmm2	/* Reset: may flag NULs before the start.  */
+
+	add	$16, %rdi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 is still 0 here: no NUL seen yet.  */
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0	/* Unrolled: same check on the next block.  */
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0	/* Unrolled: same check on the next block.  */
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0	/* Unrolled: same check on the next block.  */
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+	jmp	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %rdx	/* Reload NUL mask; rdx was OR-ed with rax.  */
+	test	%rax, %rax
+	jz	L(return_null)	/* Only the terminator was found.  */
+	bsf	%rax, %rax	/* rax = byte offset of first match.  */
+	test	%rdx, %rdx
+	je	L(match)	/* No NUL in this block.  */
+	bsf	%rdx, %rcx	/* rcx = byte offset of first NUL.  */
+	cmp	%rcx, %rax
+	ja	L(return_null)	/* Match lies past the terminator.  */
+L(match):
+	sub	$16, %rdi	/* Undo the post-load advance of rdi.  */
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (WCSCHR)
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
index 2131220382..80b12c4286 100644
--- a/sysdeps/x86_64/wcschr.S
+++ b/sysdeps/x86_64/wcschr.S
@@ -16,140 +16,9 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
-	.text
-ENTRY (__wcschr)
-
-	movd	%rsi, %xmm1
-	pxor	%xmm2, %xmm2
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	punpckldq %xmm1, %xmm1
-
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(cross_cache)
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	and	$-16, %rdi
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	jmp	L(loop)
-
-L(cross_cache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-
-	sar	%cl, %rdx
-	sar	%cl, %rax
-	test	%rax, %rax
-	je	L(unaligned_no_match)
-
-	bsf	%rax, %rax
-	test	%rdx, %rdx
-	je	L(unaligned_match)
-	bsf	%rdx, %rdx
-	cmp	%rdx, %rax
-	ja	L(return_null)
-
-L(unaligned_match):
-	add	%rdi, %rax
-	add	%rcx, %rax
-	ret
-
-	.p2align 4
-L(unaligned_no_match):
-	test	%rdx, %rdx
-	jne	L(return_null)
-	pxor	%xmm2, %xmm2
-
-	add	$16, %rdi
-
-	.p2align 4
-/* Loop start on aligned string.  */
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-	jmp	L(loop)
-
-	.p2align 4
-L(matches):
-	pmovmskb %xmm2, %rdx
-	test	%rax, %rax
-	jz	L(return_null)
-	bsf	%rax, %rax
-	test	%rdx, %rdx
-	je	L(match)
-	bsf	%rdx, %rcx
-	cmp	%rcx, %rax
-	ja	L(return_null)
-L(match):
-	sub	$16, %rdi
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (__wcschr)
 
+#define WCSCHR __wcschr
+#include "multiarch/wcschr-sse2.S"
 libc_hidden_def(__wcschr)
 weak_alias (__wcschr, wcschr)
 libc_hidden_weak (wcschr)