From eb88f21301931449718cd4d105018d167a02788b Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 11 May 2022 23:28:39 +0200 Subject: [PATCH] gh-89653: PEP 670: Convert unicodeobject.h macros to functions (#92648) Convert the following Unicode macros to static inline functions. Surrogate functions: * Py_UNICODE_IS_SURROGATE() * Py_UNICODE_IS_HIGH_SURROGATE() * Py_UNICODE_IS_LOW_SURROGATE() * Py_UNICODE_HIGH_SURROGATE() * Py_UNICODE_LOW_SURROGATE() * Py_UNICODE_JOIN_SURROGATES() "Is" functions: * Py_UNICODE_ISALNUM() * Py_UNICODE_ISSPACE() In the implementation of these functions, the character type is now well defined to Py_UCS4. --- Include/cpython/unicodeobject.h | 114 ++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 48 deletions(-) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 8e182d0fbf7..4d7173f4213 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -15,53 +15,31 @@ # define USE_UNICODE_WCHAR_CACHE 1 #endif /* USE_UNICODE_WCHAR_CACHE */ -/* Since splitting on whitespace is an important use case, and - whitespace in most situations is solely ASCII whitespace, we - optimize for the common case by using a quick look-up table - _Py_ascii_whitespace (see below) with an inlined check. +// Static inline functions to work with surrogates +static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) { + return (0xD800 <= ch && ch <= 0xDFFF); +} +static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) { + return (0xD800 <= ch && ch <= 0xDBFF); +} +static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) { + return (0xDC00 <= ch && ch <= 0xDFFF); +} - */ -#define Py_UNICODE_ISSPACE(ch) \ - ((Py_UCS4)(ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) +// Join two surrogate characters and return a single Py_UCS4 value. +static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) { + return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF)); +} -#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) -#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) -#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) -#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) +// High surrogate = top 10 bits added to D800 +static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) { + return (0xD800 - (0x10000 >> 10) + (ch >> 10)); +} -#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) -#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) -#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) - -#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) -#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) -#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) -#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) - -#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) -#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) -#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) - -#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) - -#define Py_UNICODE_ISALNUM(ch) \ - (Py_UNICODE_ISALPHA(ch) || \ - Py_UNICODE_ISDECIMAL(ch) || \ - Py_UNICODE_ISDIGIT(ch) || \ - Py_UNICODE_ISNUMERIC(ch)) - -/* macros to work with surrogates */ -#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) -#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) -#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) -/* Join two surrogate characters and return a single Py_UCS4 value. */ -#define Py_UNICODE_JOIN_SURROGATES(high, low) \ - (((((Py_UCS4)(high) & 0x03FF) << 10) | \ - ((Py_UCS4)(low) & 0x03FF)) + 0x10000) -/* high surrogate = top 10 bits added to D800 */ -#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) -/* low surrogate = bottom 10 bits added to DC00 */ -#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) +// Low surrogate = bottom 10 bits added to DC00 +static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) { + return (0xDC00 + (ch & 0x3FF)); +} /* --- Unicode Type ------------------------------------------------------- */ @@ -1013,10 +991,6 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( /* === Characters Type APIs =============================================== */ -/* Helper array used by Py_UNICODE_ISSPACE(). */ - -PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; - /* These should not be used directly. Use the Py_UNICODE_IS* and Py_UNICODE_TO* macros instead. @@ -1124,6 +1098,50 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +// Helper array used by Py_UNICODE_ISSPACE(). +PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; + +// Since splitting on whitespace is an important use case, and +// whitespace in most situations is solely ASCII whitespace, we +// optimize for the common case by using a quick look-up table +// _Py_ascii_whitespace (see below) with an inlined check. +static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { + if (ch < 128) { + return _Py_ascii_whitespace[ch]; + } + return _PyUnicode_IsWhitespace(ch); +} + +#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) +#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) +#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) +#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) + +#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) +#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) +#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) + +#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) +#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) +#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) +#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) + +#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) +#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) +#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) + +#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) + +static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { + return (Py_UNICODE_ISALPHA(ch) + || Py_UNICODE_ISDECIMAL(ch) + || Py_UNICODE_ISDIGIT(ch) + || Py_UNICODE_ISNUMERIC(ch)); +} + + +/* === Misc functions ===================================================== */ + PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int); /* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/