Combine control into one character group

As with punctuation (P), we currently have no need to distinguish between
Cc (Other, Control) and Cf (Other, Format), so store only their union.
This commit is contained in:
Nikita Popov 2021-08-24 20:38:39 +02:00
parent d0897b3602
commit 425c2e3ba1
3 changed files with 56 additions and 56 deletions

View File

@ -40,39 +40,38 @@
#define UC_ZS 6 /* Separator, Space */
#define UC_ZL 7 /* Separator, Line */
#define UC_ZP 8 /* Separator, Paragraph */
#define UC_CC 9 /* Other, Control */
#define UC_CF 10 /* Other, Format */
#define UC_OS 11 /* Other, Surrogate */
#define UC_CO 12 /* Other, Private Use */
#define UC_CN 13 /* Other, Not Assigned */
#define UC_LU 14 /* Letter, Uppercase */
#define UC_LL 15 /* Letter, Lowercase */
#define UC_LT 16 /* Letter, Titlecase */
#define UC_LM 17 /* Letter, Modifier */
#define UC_LO 18 /* Letter, Other */
#define UC_SM 19 /* Symbol, Math */
#define UC_SC 20 /* Symbol, Currency */
#define UC_SK 21 /* Symbol, Modifier */
#define UC_SO 22 /* Symbol, Other */
#define UC_L 23 /* Left-To-Right */
#define UC_R 24 /* Right-To-Left */
#define UC_EN 25 /* European Number */
#define UC_ES 26 /* European Number Separator */
#define UC_ET 27 /* European Number Terminator */
#define UC_AN 28 /* Arabic Number */
#define UC_CS 29 /* Common Number Separator */
#define UC_B 30 /* Block Separator */
#define UC_S 31 /* Segment Separator */
#define UC_WS 32 /* Whitespace */
#define UC_ON 33 /* Other Neutrals */
#define UC_AL 34 /* Arabic Letter */
#define UC_OS 9 /* Other, Surrogate */
#define UC_CO 10 /* Other, Private Use */
#define UC_CN 11 /* Other, Not Assigned */
#define UC_LU 12 /* Letter, Uppercase */
#define UC_LL 13 /* Letter, Lowercase */
#define UC_LT 14 /* Letter, Titlecase */
#define UC_LM 15 /* Letter, Modifier */
#define UC_LO 16 /* Letter, Other */
#define UC_SM 17 /* Symbol, Math */
#define UC_SC 18 /* Symbol, Currency */
#define UC_SK 19 /* Symbol, Modifier */
#define UC_SO 20 /* Symbol, Other */
#define UC_L 21 /* Left-To-Right */
#define UC_R 22 /* Right-To-Left */
#define UC_EN 23 /* European Number */
#define UC_ES 24 /* European Number Separator */
#define UC_ET 25 /* European Number Terminator */
#define UC_AN 26 /* Arabic Number */
#define UC_CS 27 /* Common Number Separator */
#define UC_B 28 /* Block Separator */
#define UC_S 29 /* Segment Separator */
#define UC_WS 30 /* Whitespace */
#define UC_ON 31 /* Other Neutrals */
#define UC_AL 32 /* Arabic Letter */
/* Merged property categories */
#define UC_P 35
#define UC_C 33
#define UC_P 34
/* Derived properties from DerivedCoreProperties.txt */
#define UC_CASED 36
#define UC_CASE_IGNORABLE 37
#define UC_CASED 35
#define UC_CASE_IGNORABLE 36
MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...);
@ -113,7 +112,7 @@ static inline int php_unicode_is_upper(unsigned long code) {
#define php_unicode_is_alpha(cc) php_unicode_is_prop(cc, UC_LU, UC_LL, UC_LM, UC_LO, UC_LT, -1)
#define php_unicode_is_digit(cc) php_unicode_is_prop1(cc, UC_ND)
#define php_unicode_is_alnum(cc) php_unicode_is_prop(cc, UC_LU, UC_LL, UC_LM, UC_LO, UC_LT, UC_ND, -1)
#define php_unicode_is_cntrl(cc) php_unicode_is_prop(cc, UC_CC, UC_CF, -1)
#define php_unicode_is_cntrl(cc) php_unicode_is_prop1(cc, UC_C)
#define php_unicode_is_blank(cc) php_unicode_is_prop1(cc, UC_ZS)
#define php_unicode_is_punct(cc) php_unicode_is_prop1(cc, UC_P)
#define php_unicode_is_graph(cc) php_unicode_is_prop(cc, \
@ -126,9 +125,6 @@ static inline int php_unicode_is_upper(unsigned long code) {
UC_SM, UC_SM, UC_SC, UC_SK, UC_SO, UC_ZS, -1)
#define php_unicode_is_title(cc) php_unicode_is_prop1(cc, UC_LT)
#define php_unicode_is_isocntrl(cc) php_unicode_is_prop1(cc, UC_CC)
#define php_unicode_is_fmtcntrl(cc) php_unicode_is_prop1(cc, UC_CF)
#define php_unicode_is_symbol(cc) php_unicode_is_prop(cc, UC_SM, UC_SC, UC_SO, UC_SK, -1)
#define php_unicode_is_number(cc) php_unicode_is_prop(cc, UC_ND, UC_NO, UC_NL, -1)
#define php_unicode_is_nonspacing(cc) php_unicode_is_prop1(cc, UC_MN)

View File

@ -101,12 +101,12 @@ class UnicodeData {
*/
$this->propIndexes = array_flip([
"Mn", "Mc", "Me", "Nd", "Nl", "No",
"Zs", "Zl", "Zp", "Cc", "Cf", "Cs",
"Co", "Cn", "Lu", "Ll", "Lt", "Lm",
"Lo", "Sm", "Sc", "Sk", "So", "L",
"R", "EN", "ES", "ET", "AN", "CS",
"B", "S", "WS", "ON", "AL",
"P", "Cased", "Case_Ignorable"
"Zs", "Zl", "Zp", "Cs", "Co", "Cn",
"Lu", "Ll", "Lt", "Lm", "Lo", "Sm",
"Sc", "Sk", "So", "L", "R", "EN",
"ES", "ET", "AN", "CS", "B", "S",
"WS", "ON", "AL",
"C", "P", "Cased", "Case_Ignorable"
]);
$this->numProps = count($this->propIndexes);
@ -135,6 +135,10 @@ class UnicodeData {
if (in_array($prop, ["Pc", "Pd", "Ps", "Pe", "Po", "Pi", "Pf"])) {
$prop = "P";
}
/* Likewise, fold the control categories Cc and Cf into the single merged "C" property. */
if (in_array($prop, ["Cc", "Cf"])) {
$prop = "C";
}
if (!isset($this->propIndexes[$prop])) {
throw new Exception("Unknown property $prop");

View File

@ -10,14 +10,14 @@
* the project's page doesn't seem to be live anymore, so you can use
* OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
static const unsigned short _ucprop_size = 38;
static const unsigned short _ucprop_size = 37;
static const unsigned short _ucprop_offsets[] = {
0x0000, 0x028e, 0x03ec, 0x03f6, 0x0470, 0x0488, 0x0516, 0x0524,
0x0526, 0x0528, 0x052c, 0x0554, 0x0556, 0x055c, 0x055c, 0x0a58,
0x0f62, 0x0f76, 0x0ff0, 0x13c2, 0x1442, 0x146c, 0x14a8, 0x1614,
0x1b90, 0x1c1e, 0x1c38, 0x1c4a, 0x1c7a, 0x1c88, 0x1ca2, 0x1cac,
0x1cb2, 0x1cc0, 0x20ae, 0x212a, 0x229c, 0x23b6, 0x26ea, 0x0000
0x0526, 0x0528, 0x052a, 0x0530, 0x0530, 0x0a2c, 0x0f36, 0x0f4a,
0x0fc4, 0x1396, 0x1416, 0x1440, 0x147c, 0x15e8, 0x1b64, 0x1bf2,
0x1c0c, 0x1c1e, 0x1c4e, 0x1c5c, 0x1c76, 0x1c80, 0x1c86, 0x1c94,
0x2082, 0x20fe, 0x212a, 0x229c, 0x23b6, 0x26ea, 0x0000, 0x0000
};
static const unsigned int _ucprop_ranges[] = {
@ -351,17 +351,6 @@ static const unsigned int _ucprop_ranges[] = {
0x00002000, 0x0000200a, 0x0000202f, 0x0000202f,
0x0000205f, 0x0000205f, 0x00003000, 0x00003000,
0x00002028, 0x00002028, 0x00002029, 0x00002029,
0x00000000, 0x0000001f, 0x0000007f, 0x0000009f,
0x000000ad, 0x000000ad, 0x00000600, 0x00000605,
0x0000061c, 0x0000061c, 0x000006dd, 0x000006dd,
0x0000070f, 0x0000070f, 0x000008e2, 0x000008e2,
0x0000180e, 0x0000180e, 0x0000200b, 0x0000200f,
0x0000202a, 0x0000202e, 0x00002060, 0x00002064,
0x00002066, 0x0000206f, 0x0000feff, 0x0000feff,
0x0000fff9, 0x0000fffb, 0x000110bd, 0x000110bd,
0x000110cd, 0x000110cd, 0x00013430, 0x00013438,
0x0001bca0, 0x0001bca3, 0x0001d173, 0x0001d17a,
0x000e0001, 0x000e0001, 0x000e0020, 0x000e007f,
0x0000d800, 0x0000dfff, 0x0000e000, 0x0000f8ff,
0x000f0000, 0x000ffffd, 0x00100000, 0x0010fffd,
0x00000041, 0x0000005a, 0x000000c0, 0x000000d6,
@ -2143,7 +2132,18 @@ static const unsigned int _ucprop_ranges[] = {
0x0001ee79, 0x0001ee7c, 0x0001ee7e, 0x0001ee7e,
0x0001ee80, 0x0001ee89, 0x0001ee8b, 0x0001ee9b,
0x0001eea1, 0x0001eea3, 0x0001eea5, 0x0001eea9,
0x0001eeab, 0x0001eebb, 0x00000021, 0x00000023,
0x0001eeab, 0x0001eebb, 0x00000000, 0x0000001f,
0x0000007f, 0x0000009f, 0x000000ad, 0x000000ad,
0x00000600, 0x00000605, 0x0000061c, 0x0000061c,
0x000006dd, 0x000006dd, 0x0000070f, 0x0000070f,
0x000008e2, 0x000008e2, 0x0000180e, 0x0000180e,
0x0000200b, 0x0000200f, 0x0000202a, 0x0000202e,
0x00002060, 0x00002064, 0x00002066, 0x0000206f,
0x0000feff, 0x0000feff, 0x0000fff9, 0x0000fffb,
0x000110bd, 0x000110bd, 0x000110cd, 0x000110cd,
0x00013430, 0x00013438, 0x0001bca0, 0x0001bca3,
0x0001d173, 0x0001d17a, 0x000e0001, 0x000e0001,
0x000e0020, 0x000e007f, 0x00000021, 0x00000023,
0x00000025, 0x0000002a, 0x0000002c, 0x0000002f,
0x0000003a, 0x0000003b, 0x0000003f, 0x00000040,
0x0000005b, 0x0000005d, 0x0000005f, 0x0000005f,