mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-27 03:33:33 +08:00
tilegx: work around vector insn bug in gcc
Avoid an issue in gcc where some of the vector (aka SIMD) ops will sometimes end up getting wrongly optimized out. We use these instructions in many of the string implementations. If/when we have an upstreamed fix for this problem in gcc we can conditionalize the use of the extended assembly workaround in glibc.
This commit is contained in:
parent
446d22e91d
commit
f18b8dc7d7
14
ChangeLog
14
ChangeLog
@ -1,3 +1,17 @@
|
||||
2017-12-05 Chris Metcalf <cmetcalf@mellanox.com>
|
||||
|
||||
* sysdeps/tile/tilegx/string-endian.h (VECOP): Provide working
|
||||
replacements for __insn_xxx builtins for v1cmpeq, v1cmpltu,
|
||||
v1cmpne, v1add, v1shru, v1shl (register and immediate versions).
|
||||
* sysdeps/tile/tilegx/memchr.c (__memchr): Use VECOP function
|
||||
instead of __insn_xxx.
|
||||
* sysdeps/tile/tilegx/rawmemchr.c (__rawmemchr): Likewise.
|
||||
* sysdeps/tile/tilegx/strstr.c (strcasechr): Likewise.
|
||||
* sysdeps/tile/tilegx/strrchr.c (strrchr): Likewise.
|
||||
* sysdeps/tile/tilegx/strlen.c (strlen): Likewise.
|
||||
* sysdeps/tile/tilegx/strchrnul.c (__strchrnul): Likewise.
|
||||
* sysdeps/tile/tilegx/strchr.c (strchr): Likewise.
|
||||
|
||||
2017-12-05 Florian Weimer <fweimer@redhat.com>
|
||||
|
||||
Linux: Implement interfaces for memory protection keys
|
||||
|
@ -58,7 +58,7 @@ __memchr (const void *s, int c, size_t n)
|
||||
/* Compute the address of the word containing the last byte. */
|
||||
last_word_ptr = (const uint64_t *) ((uintptr_t) last_byte_ptr & -8);
|
||||
|
||||
while ((bits = __insn_v1cmpeq (v, goal)) == 0)
|
||||
while ((bits = v1cmpeq (v, goal)) == 0)
|
||||
{
|
||||
if (__builtin_expect (p == last_word_ptr, 0))
|
||||
{
|
||||
|
@ -36,7 +36,7 @@ __rawmemchr (const void *s, int c)
|
||||
uint64_t v = (*p | before_mask) ^ (goal & before_mask);
|
||||
|
||||
uint64_t bits;
|
||||
while ((bits = __insn_v1cmpeq (v, goal)) == 0)
|
||||
while ((bits = v1cmpeq (v, goal)) == 0)
|
||||
v = *++p;
|
||||
|
||||
return ((char *) p) + (CFZ (bits) >> 3);
|
||||
|
@ -38,16 +38,16 @@ strchr (const char *s, int c)
|
||||
match neither zero nor goal (we make sure the high bit of each byte
|
||||
is 1, and the low 7 bits are all the opposite of the goal byte). */
|
||||
const uint64_t before_mask = MASK (s_int);
|
||||
uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui (before_mask, 1));
|
||||
uint64_t v = (*p | before_mask) ^ (goal & v1shrui (before_mask, 1));
|
||||
|
||||
uint64_t zero_matches, goal_matches;
|
||||
while (1)
|
||||
{
|
||||
/* Look for a terminating '\0'. */
|
||||
zero_matches = __insn_v1cmpeqi (v, 0);
|
||||
zero_matches = v1cmpeqi (v, 0);
|
||||
|
||||
/* Look for the goal byte. */
|
||||
goal_matches = __insn_v1cmpeq (v, goal);
|
||||
goal_matches = v1cmpeq (v, goal);
|
||||
|
||||
if (__builtin_expect ((zero_matches | goal_matches) != 0, 0))
|
||||
break;
|
||||
|
@ -36,16 +36,16 @@ __strchrnul (const char *s, int c)
|
||||
match neither zero nor goal (we make sure the high bit of each byte
|
||||
is 1, and the low 7 bits are all the opposite of the goal byte). */
|
||||
const uint64_t before_mask = MASK (s_int);
|
||||
uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui (before_mask, 1));
|
||||
uint64_t v = (*p | before_mask) ^ (goal & v1shrui (before_mask, 1));
|
||||
|
||||
uint64_t zero_matches, goal_matches;
|
||||
while (1)
|
||||
{
|
||||
/* Look for a terminating '\0'. */
|
||||
zero_matches = __insn_v1cmpeqi (v, 0);
|
||||
zero_matches = v1cmpeqi (v, 0);
|
||||
|
||||
/* Look for the goal byte. */
|
||||
goal_matches = __insn_v1cmpeq (v, goal);
|
||||
goal_matches = v1cmpeq (v, goal);
|
||||
|
||||
if (__builtin_expect ((zero_matches | goal_matches) != 0, 0))
|
||||
break;
|
||||
|
@ -56,3 +56,28 @@ static inline uint64_t copy_byte(uint8_t byte)
|
||||
{
|
||||
return __insn_shufflebytes(byte, 0, 0);
|
||||
}
|
||||
|
||||
/* Implement the byte vector instructions using extended assembly.
|
||||
The __insn_OP() builtins are buggy in current compiler versions.
|
||||
|
||||
VECOP (OP) defines two static inline wrappers, each returning the
|
||||
64-bit result written by the instruction to operand %0:
|
||||
OP (a, b) emits instruction "OP" with a and b as register operands.
|
||||
OPi (a, b) emits instruction "OPi" with a as a register operand and
|
||||
b passed under the "I" constraint.
|
||||
NOTE(review): the "I" constraint presumably requires b to be a
|
||||
compile-time constant once the wrapper is inlined; confirm against
|
||||
the GCC machine-constraint documentation for this target.
|
||||
NOTE(review): the asm statements are volatile, presumably so that the
|
||||
compiler cannot discard them as it does the builtins; confirm. */
|
||||
|
||||
#define VECOP(OP) \
|
||||
static inline uint64_t OP (uint64_t a, uint64_t b) \
|
||||
{ \
|
||||
uint64_t result; \
|
||||
asm volatile (#OP " %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); \
|
||||
return result; \
|
||||
} \
|
||||
\
|
||||
static inline uint64_t OP ## i (uint64_t a, uint64_t b) \
|
||||
{ \
|
||||
uint64_t result; \
|
||||
asm volatile (#OP "i %0, %1, %2" : "=r"(result) : "r"(a), "I"(b)); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
VECOP(v1cmpeq)
|
||||
VECOP(v1cmpltu)
|
||||
VECOP(v1cmpne)
|
||||
VECOP(v1add)
|
||||
VECOP(v1shru)
|
||||
VECOP(v1shl)
|
||||
|
@ -31,7 +31,7 @@ strlen (const char *s)
|
||||
uint64_t v = *p | MASK (s_int);
|
||||
|
||||
uint64_t bits;
|
||||
while ((bits = __insn_v1cmpeqi (v, 0)) == 0)
|
||||
while ((bits = v1cmpeqi (v, 0)) == 0)
|
||||
v = *++p;
|
||||
|
||||
return ((const char *) p) + (CFZ (bits) >> 3) - s;
|
||||
|
@ -37,7 +37,7 @@ __strnlen (const char *s, size_t maxlen)
|
||||
uint64_t v = *p | MASK (s_int);
|
||||
|
||||
uint64_t bits;
|
||||
while ((bits = __insn_v1cmpeqi (v, 0)) == 0)
|
||||
while ((bits = v1cmpeqi (v, 0)) == 0)
|
||||
{
|
||||
if (bytes_read >= maxlen)
|
||||
{
|
||||
|
@ -34,16 +34,16 @@ strrchr (const char *s, int c)
|
||||
match neither zero nor goal (we make sure the high bit of each byte
|
||||
is 1, and the low 7 bits are all the opposite of the goal byte). */
|
||||
const uint64_t before_mask = MASK (s_int);
|
||||
uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui (before_mask, 1));
|
||||
uint64_t v = (*p | before_mask) ^ (goal & v1shrui (before_mask, 1));
|
||||
const char *found = NULL;
|
||||
uint64_t zero_matches, goal_matches;
|
||||
while (1)
|
||||
{
|
||||
/* Look for a terminating '\0'. */
|
||||
zero_matches = __insn_v1cmpeqi (v, 0);
|
||||
zero_matches = v1cmpeqi (v, 0);
|
||||
|
||||
/* Look for the goal byte. */
|
||||
goal_matches = __insn_v1cmpeq (v, goal);
|
||||
goal_matches = v1cmpeq (v, goal);
|
||||
|
||||
/* If we found the goal, record the last offset. */
|
||||
if (__builtin_expect (goal_matches != 0, 0))
|
||||
|
@ -57,10 +57,10 @@ static uint64_t
|
||||
vec_tolower (uint64_t cc)
|
||||
{
|
||||
/* For uppercase letters, add 32 to convert to lower case. */
|
||||
uint64_t less_than_eq_Z = __insn_v1cmpltui (cc, 'Z' + 1);
|
||||
uint64_t less_than_A = __insn_v1cmpltui (cc, 'A');
|
||||
uint64_t is_upper = __insn_v1cmpne (less_than_eq_Z, less_than_A);
|
||||
return __insn_v1add (cc,__insn_v1shli (is_upper, 5));
|
||||
uint64_t less_than_eq_Z = v1cmpltui (cc, 'Z' + 1);
|
||||
uint64_t less_than_A = v1cmpltui (cc, 'A');
|
||||
uint64_t is_upper = v1cmpne (less_than_eq_Z, less_than_A);
|
||||
return v1add (cc, v1shli (is_upper, 5));
|
||||
}
|
||||
|
||||
/* There is no strcasechr() defined, but needed for 1 byte case
|
||||
@ -85,16 +85,16 @@ strcasechr (const char *s, int c)
|
||||
is 1, and the low 7 bits are all the opposite of the goal byte). */
|
||||
const uint64_t before_mask = MASK (s_int);
|
||||
uint64_t v =
|
||||
(vec_tolower (*p) | before_mask) ^ (goal & __insn_v1shrui (before_mask, 1));
|
||||
(vec_tolower (*p) | before_mask) ^ (goal & v1shrui (before_mask, 1));
|
||||
|
||||
uint64_t zero_matches, goal_matches;
|
||||
while (1)
|
||||
{
|
||||
/* Look for a terminating '\0'. */
|
||||
zero_matches = __insn_v1cmpeqi (v, 0);
|
||||
zero_matches = v1cmpeqi (v, 0);
|
||||
|
||||
/* Look for the goal byte. */
|
||||
goal_matches = __insn_v1cmpeq (v, goal);
|
||||
goal_matches = v1cmpeq (v, goal);
|
||||
|
||||
if (__builtin_expect ((zero_matches | goal_matches) != 0, 0))
|
||||
break;
|
||||
@ -146,14 +146,14 @@ STRSTR2 (const char *haystack_start, const char *needle)
|
||||
is 1, and the low 7 bits are all the opposite of the goal byte). */
|
||||
const uint64_t before_mask = MASK (s_int);
|
||||
uint64_t v =
|
||||
(vec_load (p) | before_mask) ^ (byte1 & __insn_v1shrui (before_mask, 1));
|
||||
(vec_load (p) | before_mask) ^ (byte1 & v1shrui (before_mask, 1));
|
||||
|
||||
uint64_t zero_matches, goal_matches;
|
||||
while (1)
|
||||
{
|
||||
/* Look for a terminating '\0'. */
|
||||
zero_matches = __insn_v1cmpeqi (v, 0);
|
||||
uint64_t byte1_matches = __insn_v1cmpeq (v, byte1);
|
||||
zero_matches = v1cmpeqi (v, 0);
|
||||
uint64_t byte1_matches = v1cmpeq (v, byte1);
|
||||
if (__builtin_expect (zero_matches != 0, 0))
|
||||
{
|
||||
/* This is the last vector. Don't worry about matches
|
||||
@ -161,7 +161,7 @@ STRSTR2 (const char *haystack_start, const char *needle)
|
||||
back 1 byte to align it with the first byte, then and to
|
||||
check for both matching. Each vector has a 1 in the LSB
|
||||
of the byte if there was match. */
|
||||
uint64_t byte2_matches = __insn_v1cmpeq (v, byte2);
|
||||
uint64_t byte2_matches = v1cmpeq (v, byte2);
|
||||
goal_matches = byte1_matches & STRSHIFT (byte2_matches, 8);
|
||||
break;
|
||||
}
|
||||
@ -175,7 +175,7 @@ STRSTR2 (const char *haystack_start, const char *needle)
|
||||
{
|
||||
/* 8-bytes starting 1 byte into v. */
|
||||
v = __insn_dblalign (v, v2, (void*)1);
|
||||
uint64_t byte2_matches_shifted = __insn_v1cmpeq (v, byte2);
|
||||
uint64_t byte2_matches_shifted = v1cmpeq (v, byte2);
|
||||
goal_matches = byte1_matches & byte2_matches_shifted;
|
||||
if (__builtin_expect (goal_matches != 0, 0))
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user