mirror of
https://github.com/python/cpython.git
synced 2024-12-13 20:05:53 +08:00
dec0757549
* Remove m68k-specific hack from ascii_decode: On m68k, the alignment of primitive types is more relaxed, with 4-byte and 8-byte types only requiring 2-byte alignment, thus using sizeof(size_t) does not work. Instead, use the portable alternative. Note that this is a minimal fix that only relaxes the assertion; the condition for when to use the optimised version remains overly strict. Such issues will be fixed tree-wide in the next commit. NB: In C11 we could use _Alignof(size_t) instead, but for compatibility we use autoconf. * Optimise string routines for architectures with non-natural alignment: C only requires that sizeof(x) is a multiple of alignof(x), not that the two are equal. Thus anywhere we optimise based on alignment we should be using alignof(x), not sizeof(x). This is more annoying than it would be in C11, where we could just use _Alignof(x) (and alignof(x) in C++11), but since we still require only C99 we must plumb the information all the way from autoconf through the various typedefs and defines.
133 lines
3.6 KiB
C
133 lines
3.6 KiB
C
/* Find the narrowest character width able to hold every character of a
   buffer. */

#if !STRINGLIB_IS_UNICODE
#  error "find_max_char.h is specific to Unicode"
#endif

/* Word-sized mask with the top bit (0x80) of every byte set.  ANDing it
   with a 'size_t' loaded from a byte buffer gives a non-zero result
   exactly when at least one of those bytes is outside the ASCII range. */
#if SIZEOF_SIZE_T == 4
#  define UCS1_ASCII_CHAR_MASK 0x80808080U
#elif SIZEOF_SIZE_T == 8
#  define UCS1_ASCII_CHAR_MASK 0x8080808080808080ULL
#else
#  error C 'size_t' size should be either 4 or 8!
#endif
|
|
|
|
#if STRINGLIB_SIZEOF_CHAR == 1
|
|
|
|
Py_LOCAL_INLINE(Py_UCS4)
|
|
STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
|
|
{
|
|
const unsigned char *p = (const unsigned char *) begin;
|
|
|
|
while (p < end) {
|
|
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
|
|
/* Help register allocation */
|
|
const unsigned char *_p = p;
|
|
while (_p + SIZEOF_SIZE_T <= end) {
|
|
size_t value = *(const size_t *) _p;
|
|
if (value & UCS1_ASCII_CHAR_MASK)
|
|
return 255;
|
|
_p += SIZEOF_SIZE_T;
|
|
}
|
|
p = _p;
|
|
if (p == end)
|
|
break;
|
|
}
|
|
if (*p++ & 0x80)
|
|
return 255;
|
|
}
|
|
return 127;
|
|
}
|
|
|
|
/* NOTE(review): the mask macro defined earlier in this file is named
   UCS1_ASCII_CHAR_MASK, and nothing in the visible part of this file
   defines ASCII_CHAR_MASK, so this #undef appears to name the wrong
   macro.  Confirm against the files that include this header before
   changing it. */
#undef ASCII_CHAR_MASK
|
|
|
|
#else /* STRINGLIB_SIZEOF_CHAR == 1 */
|
|
|
|
/* Upper bound of each character range. */
#define MAX_CHAR_ASCII  0x7f
#define MAX_CHAR_UCS1   0xff
#define MAX_CHAR_UCS2   0xffff
#define MAX_CHAR_UCS4   0x10ffff

/* 32-bit complements of the ASCII, UCS1 and UCS2 bounds above: a non-zero
   (ch & MASK_x) means that ch is greater than MAX_CHAR_x. */
#define MASK_ASCII      0xffffff80
#define MASK_UCS1       0xffffff00
#define MASK_UCS2       0xffff0000
|
|
|
|
Py_LOCAL_INLINE(Py_UCS4)
|
|
STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
|
|
{
|
|
#if STRINGLIB_SIZEOF_CHAR == 2
|
|
const Py_UCS4 mask_limit = MASK_UCS1;
|
|
const Py_UCS4 max_char_limit = MAX_CHAR_UCS2;
|
|
#elif STRINGLIB_SIZEOF_CHAR == 4
|
|
const Py_UCS4 mask_limit = MASK_UCS2;
|
|
const Py_UCS4 max_char_limit = MAX_CHAR_UCS4;
|
|
#else
|
|
#error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4)
|
|
#endif
|
|
Py_UCS4 mask;
|
|
Py_ssize_t n = end - begin;
|
|
const STRINGLIB_CHAR *p = begin;
|
|
const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4);
|
|
Py_UCS4 max_char;
|
|
|
|
max_char = MAX_CHAR_ASCII;
|
|
mask = MASK_ASCII;
|
|
while (p < unrolled_end) {
|
|
STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3];
|
|
if (bits & mask) {
|
|
if (mask == mask_limit) {
|
|
/* Limit reached */
|
|
return max_char_limit;
|
|
}
|
|
if (mask == MASK_ASCII) {
|
|
max_char = MAX_CHAR_UCS1;
|
|
mask = MASK_UCS1;
|
|
}
|
|
else {
|
|
/* mask can't be MASK_UCS2 because of mask_limit above */
|
|
assert(mask == MASK_UCS1);
|
|
max_char = MAX_CHAR_UCS2;
|
|
mask = MASK_UCS2;
|
|
}
|
|
/* We check the new mask on the same chars in the next iteration */
|
|
continue;
|
|
}
|
|
p += 4;
|
|
}
|
|
while (p < end) {
|
|
if (p[0] & mask) {
|
|
if (mask == mask_limit) {
|
|
/* Limit reached */
|
|
return max_char_limit;
|
|
}
|
|
if (mask == MASK_ASCII) {
|
|
max_char = MAX_CHAR_UCS1;
|
|
mask = MASK_UCS1;
|
|
}
|
|
else {
|
|
/* mask can't be MASK_UCS2 because of mask_limit above */
|
|
assert(mask == MASK_UCS1);
|
|
max_char = MAX_CHAR_UCS2;
|
|
mask = MASK_UCS2;
|
|
}
|
|
/* We check the new mask on the same chars in the next iteration */
|
|
continue;
|
|
}
|
|
p++;
|
|
}
|
|
return max_char;
|
|
}
|
|
|
|
/* Undefine the range bounds and masks used by the 2- and 4-byte variant. */
#undef MAX_CHAR_ASCII
#undef MAX_CHAR_UCS1
#undef MAX_CHAR_UCS2
#undef MAX_CHAR_UCS4

#undef MASK_ASCII
#undef MASK_UCS1
#undef MASK_UCS2
|
|
|
|
#endif /* STRINGLIB_SIZEOF_CHAR == 1 */
|
|
|