diff --git a/ChangeLog b/ChangeLog index 25691cd6ec..26429c65c9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2010-02-24 H.J. Lu + + * sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Remove redundant + punpcklbw. + Use unsigned conditional jumps. + (128bytesormore_nt): Renamed to ... + (128bytesormore_endof_L1): This. + Use add instead of lea if possible. + Correct unwind info. + * sysdeps/i386/i686/multiarch/memset-sse2.S: Remove redundant + punpcklbw. + Use unsigned conditional jumps. + Use add instead of lea if possible. + Correct unwind info. + 2010-02-24 Ulrich Drepper [BZ #11319] diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S index 84afffeb66..f9a0b13d0c 100644 --- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S +++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S @@ -243,7 +243,6 @@ L(32bytesormore): pxor %xmm0, %xmm0 #else movd %eax, %xmm0 - punpcklbw %xmm0, %xmm0 pshufd $0, %xmm0, %xmm0 #endif testl $0xf, %edx @@ -261,7 +260,7 @@ L(not_aligned_16): ALIGN (4) L(aligned_16): cmp $128, %ecx - jge L(128bytesormore) + jae L(128bytesormore) L(aligned_16_less128bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -293,7 +292,7 @@ L(128bytesormore): * fast string will prefetch and combine data efficiently. */ cmp %edi, %ecx - jae L(128bytesormore_nt) + jae L(128bytesormore_endof_L1) subl $128, %ecx L(128bytesormore_normal): sub $128, %ecx @@ -306,7 +305,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jl L(128bytesless_normal) + jb L(128bytesless_normal) sub $128, %ecx @@ -319,15 +318,16 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jge L(128bytesormore_normal) + jae L(128bytesormore_normal) L(128bytesless_normal): POP (%edi) - lea 128(%ecx), %ecx + add $128, %ecx BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + CFI_PUSH (%edi) ALIGN (4) -L(128bytesormore_nt): +L(128bytesormore_endof_L1): mov %edx, %edi mov %ecx, %edx shr $2, %ecx diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S index b2b979193e..92ad601bf2 100644 --- a/sysdeps/i386/i686/multiarch/memset-sse2.S +++ b/sysdeps/i386/i686/multiarch/memset-sse2.S @@ -243,7 +243,6 @@ L(32bytesormore): pxor %xmm0, %xmm0 #else movd %eax, %xmm0 - punpcklbw %xmm0, %xmm0 pshufd $0, %xmm0, %xmm0 #endif testl $0xf, %edx @@ -261,7 +260,7 @@ L(not_aligned_16): ALIGN (4) L(aligned_16): cmp $128, %ecx - jge L(128bytesormore) + jae L(128bytesormore) L(aligned_16_less128bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -287,14 +286,17 @@ L(128bytesormore): #ifdef DATA_CACHE_SIZE POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) cmp $DATA_CACHE_SIZE, %ecx #else # ifdef SHARED +# define RESTORE_EBX_STATE call __i686.get_pc_thunk.bx add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx # else POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) cmp __x86_data_cache_size, %ecx # endif #endif @@ -312,7 +314,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jl L(128bytesless_normal) + jb L(128bytesless_normal) sub $128, %ecx @@ -325,10 +327,10 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jge L(128bytesormore_normal) + jae L(128bytesormore_normal) L(128bytesless_normal): - lea 128(%ecx), %ecx + add $128, %ecx BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) ALIGN (4) @@ -346,11 +348,12 @@ L(128bytes_L2_normal): movaps %xmm0, 0x70(%edx) add $128, %edx cmp $128, %ecx - jge L(128bytes_L2_normal) + jae L(128bytes_L2_normal) L(128bytesless_L2_normal): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + RESTORE_EBX_STATE L(128bytesormore_nt_start): sub %ebx, %ecx ALIGN (4) @@ -368,7 +371,7 @@ L(128bytesormore_shared_cache_loop): movdqa %xmm0, 0x70(%edx) add $0x80, %edx cmp $0x80, %ebx - jge L(128bytesormore_shared_cache_loop) + jae L(128bytesormore_shared_cache_loop) cmp $0x80, %ecx jb L(shared_cache_loop_end) ALIGN (4) @@ -384,7 +387,7 @@ L(128bytesormore_nt): movntdq %xmm0, 0x70(%edx) add $0x80, %edx cmp $0x80, %ecx - jge L(128bytesormore_nt) + jae L(128bytesormore_nt) sfence L(shared_cache_loop_end): #if defined DATA_CACHE_SIZE || !defined SHARED