powerpc: Optimized stpcpy for POWER9

Add stpcpy support to the POWER9 strcpy. This is up to 40% faster on
small strings and up to 90% faster on long relatively unaligned strings,
compared to the POWER8 version. A few examples:

                                        __stpcpy_power9  __stpcpy_power8
Length   20, alignments in bytes  4/ 4:  2.58246          4.8788
Length 1024, alignments in bytes  1/ 6: 24.8186          47.8528
This commit is contained in:
Anton Blanchard via Libc-alpha 2020-05-14 09:08:35 +10:00 committed by Paul E. Murphy
parent 3903704850
commit aa70d05632
6 changed files with 123 additions and 21 deletions

View File

@ -0,0 +1,24 @@
/* Optimized stpcpy implementation for PowerPC64/POWER9.
Copyright (C) 2015-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define USE_AS_STPCPY
#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
weak_alias (__stpcpy, stpcpy)
libc_hidden_def (__stpcpy)
libc_hidden_builtin_def (stpcpy)

View File

@ -18,19 +18,35 @@
#include <sysdep.h>
#ifndef STRCPY
# define STRCPY strcpy
#endif
#ifdef USE_AS_STPCPY
# ifndef STPCPY
# define FUNC_NAME __stpcpy
# else
# define FUNC_NAME STPCPY
# endif
#else
# ifndef STRCPY
# define FUNC_NAME strcpy
# else
# define FUNC_NAME STRCPY
# endif
#endif /* !USE_AS_STPCPY */
/* Implements the function
char * [r3] strcpy (char *dest [r3], const char *src [r4])
or
char * [r3] stpcpy (char *dest [r3], const char *src [r4])
if USE_AS_STPCPY is defined.
The implementation can load bytes past a null terminator, but only
up to the next 16B boundary, so it never crosses a page. */
.machine power9
ENTRY_TOCLESS (STRCPY, 4)
ENTRY_TOCLESS (FUNC_NAME, 4)
CALL_MCOUNT 2
/* NULL string optimisation */
@ -53,8 +69,8 @@ ENTRY_TOCLESS (STRCPY, 4)
vperm v0,v18,v0,v1
vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
vctzlsbb r8,v6 /* Number of trailing zeroes */
addi r8,r8,1 /* Add null terminator */
vctzlsbb r7,v6 /* Number of trailing zeroes */
addi r8,r7,1 /* Add null terminator */
/* r8 = bytes including null
r9 = bytes to get source 16B aligned
@ -68,6 +84,11 @@ ENTRY_TOCLESS (STRCPY, 4)
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
#ifdef USE_AS_STPCPY
/* stpcpy returns the dest address plus the size not counting the
final '\0'. */
add r3,r11,r7
#endif
blr
L(no_null):
@ -106,28 +127,43 @@ L(loop):
L(tail1):
vctzlsbb r8,v6
addi r8,r8,1
sldi r9,r8,56 /* stxvl wants size in top 8 bits */
addi r9,r8,1
sldi r9,r9,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r9
#ifdef USE_AS_STPCPY
/* stpcpy returns the dest address plus the size not counting the
final '\0'. */
add r3,r11,r8
#endif
blr
L(tail2):
stxv 32+v0,0(r11)
vctzlsbb r8,v6 /* Number of trailing zeroes */
addi r8,r8,1 /* Add null terminator */
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16
stxvl 32+v1,r11,r10 /* Partial store */
#ifdef USE_AS_STPCPY
/* stpcpy returns the dest address plus the size not counting the
final '\0'. */
add r3,r11,r8
#endif
blr
L(tail3):
stxv 32+v0,0(r11)
stxv 32+v1,16(r11)
vctzlsbb r8,v6 /* Number of trailing zeroes */
addi r8,r8,1 /* Add null terminator */
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32
stxvl 32+v2,r11,r10 /* Partial store */
#ifdef USE_AS_STPCPY
/* stpcpy returns the dest address plus the size not counting the
final '\0'. */
add r3,r11,r8
#endif
blr
L(tail4):
@ -135,10 +171,17 @@ L(tail4):
stxv 32+v1,16(r11)
stxv 32+v2,32(r11)
vctzlsbb r8,v6 /* Number of trailing zeroes */
addi r8,r8,1 /* Add null terminator */
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48
stxvl 32+v3,r11,r10 /* Partial store */
#ifdef USE_AS_STPCPY
/* stpcpy returns the dest address plus the size not counting the
final '\0'. */
add r3,r11,r8
#endif
blr
END (STRCPY)
END (FUNC_NAME)
#ifndef USE_AS_STPCPY
libc_hidden_builtin_def (strcpy)
#endif

View File

@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops

View File

@ -98,6 +98,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00,
__stpcpy_power9)
#endif
IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__stpcpy_power8)
IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,

View File

@ -0,0 +1,24 @@
/* Optimized stpcpy implementation for POWER9/PPC64.
Copyright (C) 2015-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define STPCPY __stpcpy_power9
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
#include <sysdeps/powerpc/powerpc64/le/power9/stpcpy.S>

View File

@ -26,13 +26,20 @@
extern __typeof (__stpcpy) __stpcpy_ppc attribute_hidden;
extern __typeof (__stpcpy) __stpcpy_power7 attribute_hidden;
extern __typeof (__stpcpy) __stpcpy_power8 attribute_hidden;
# ifdef __LITTLE_ENDIAN__
extern __typeof (__stpcpy) __stpcpy_power9 attribute_hidden;
# endif
libc_ifunc_hidden (__stpcpy, __stpcpy,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpcpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)
? __stpcpy_power7
: __stpcpy_ppc);
# ifdef __LITTLE_ENDIAN__
(hwcap2 & PPC_FEATURE2_ARCH_3_00)
? __stpcpy_power9 :
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpcpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)
? __stpcpy_power7
: __stpcpy_ppc);
weak_alias (__stpcpy, stpcpy)
libc_hidden_def (__stpcpy)