Add fast mb_strcut implementation for UTF-8

The old implementation runs through the entire string to pick out the
part which should be returned by mb_strcut. This creates significant
performance overhead. The new specialized implementation of mb_strcut
for UTF-8 usually only examines a few bytes around the starting and
ending cut points, meaning it generally runs in constant time.

For UTF-8 strings just a few bytes long, the new implementation is
around 10% faster (according to microbenchmarks which I ran locally).
For strings around 10,000 bytes in length, it is 50-300x faster.
(Yes, that is 300x and not 300%.)

The new implementation behaves identically to the old one on VALID
UTF-8 strings; a fuzzer was used to help ensure this is the case.
On invalid UTF-8 strings, there is a difference: in some cases, the
old implementation will pass invalid byte sequences through unchanged,
while in others it will remove them. The new implementation has
behavior which is perhaps slightly more predictable: it simply backs
up the starting and ending cut points to the preceding "starter
byte" (one which is not a UTF-8 continuation byte).
This commit is contained in:
Alex Dowad 2023-09-30 17:54:56 +02:00
parent 3fa836f711
commit 1f0cf133db
20 changed files with 151 additions and 70 deletions

View File

@ -65,7 +65,8 @@ const mbfl_encoding mbfl_encoding_7bit = {
&vtbl_wchar_7bit,
mb_7bit_to_wchar,
mb_wchar_to_7bit,
NULL
NULL,
NULL,
};
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)

View File

@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_base64 = {
NULL,
mb_base64_to_wchar,
mb_wchar_to_base64,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_8bit_b64 = {

View File

@ -4392,7 +4392,8 @@ const mbfl_encoding mbfl_encoding_jis = {
&vtbl_wchar_jis,
mb_iso2022jp_to_wchar,
mb_wchar_to_jis,
mb_check_jis
mb_check_jis,
NULL,
};
static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
@ -4426,7 +4427,8 @@ const mbfl_encoding mbfl_encoding_2022jp = {
&vtbl_wchar_2022jp,
mb_iso2022jp_to_wchar,
mb_wchar_to_iso2022jp,
mb_check_iso2022jp
mb_check_iso2022jp,
NULL,
};
static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};
@ -4462,7 +4464,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
&vtbl_wchar_2022jp_kddi,
mb_iso2022jp_kddi_to_wchar,
mb_wchar_to_iso2022jp_kddi,
NULL
NULL,
NULL,
};
static const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
@ -4496,7 +4499,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
&vtbl_wchar_2022jp_2004,
mb_iso2022jp2004_to_wchar,
mb_wchar_to_iso2022jp2004,
NULL
NULL,
NULL,
};
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
@ -4581,7 +4585,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
&vtbl_wchar_cp50220,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50220,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_cp50221 = {
@ -4595,7 +4600,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
&vtbl_wchar_cp50221,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50221,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_cp50222 = {
@ -4609,7 +4615,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
&vtbl_wchar_cp50222,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50222,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};
@ -4645,7 +4652,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
&vtbl_wchar_2022jpms,
mb_iso2022jpms_to_wchar,
mb_wchar_to_iso2022jpms,
NULL
NULL,
NULL,
};
/* ISO-2022-KR is defined in RFC 1557
@ -4687,7 +4695,8 @@ const mbfl_encoding mbfl_encoding_2022kr = {
&vtbl_wchar_2022kr,
mb_iso2022kr_to_wchar,
mb_wchar_to_iso2022kr,
NULL
NULL,
NULL,
};
/*
@ -7832,7 +7841,8 @@ const mbfl_encoding mbfl_encoding_sjis = {
&vtbl_wchar_sjis,
mb_sjis_to_wchar,
mb_wchar_to_sjis,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};
@ -7868,7 +7878,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
&vtbl_wchar_sjis_mac,
mb_sjismac_to_wchar,
mb_wchar_to_sjismac,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_jis-imode", "x-sjis-emoji-docomo", NULL};
@ -7906,7 +7917,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = {
&vtbl_wchar_sjis_docomo,
mb_sjis_docomo_to_wchar,
mb_wchar_to_sjis_docomo,
NULL
NULL,
NULL,
};
static const struct mbfl_convert_vtbl vtbl_sjis_kddi_wchar = {
@ -7940,7 +7952,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = {
&vtbl_wchar_sjis_kddi,
mb_sjis_kddi_to_wchar,
mb_wchar_to_sjis_kddi,
NULL
NULL,
NULL,
};
static const struct mbfl_convert_vtbl vtbl_sjis_sb_wchar = {
@ -7974,7 +7987,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = {
&vtbl_wchar_sjis_sb,
mb_sjis_sb_to_wchar,
mb_wchar_to_sjis_sb,
NULL
NULL,
NULL,
};
/* Although the specification for Shift-JIS-2004 indicates that 0x5C and
@ -8017,7 +8031,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
&vtbl_wchar_sjis2004,
mb_sjis2004_to_wchar,
mb_wchar_to_sjis2004,
NULL
NULL,
NULL,
};
/* CP932 is Microsoft's version of Shift-JIS.
@ -8103,7 +8118,8 @@ const mbfl_encoding mbfl_encoding_cp932 = {
&vtbl_wchar_cp932,
mb_cp932_to_wchar,
mb_wchar_to_cp932,
NULL
NULL,
NULL,
};
static const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
@ -8137,7 +8153,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = {
&vtbl_wchar_sjiswin,
mb_cp932_to_wchar,
mb_wchar_to_sjiswin,
NULL
NULL,
NULL,
};
/*
@ -10346,7 +10363,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
&vtbl_wchar_eucjp,
mb_eucjp_to_wchar,
mb_wchar_to_eucjp,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL};
@ -10382,7 +10400,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
&vtbl_wchar_eucjp2004,
mb_eucjp2004_to_wchar,
mb_wchar_to_eucjp2004,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open", "eucJP-ms", NULL};
@ -10418,7 +10437,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
&vtbl_wchar_eucjpwin,
mb_eucjpwin_to_wchar,
mb_wchar_to_eucjpwin,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL};
@ -10454,7 +10474,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
&vtbl_wchar_cp51932,
mb_cp51932_to_wchar,
mb_wchar_to_cp51932,
NULL
NULL,
NULL,
};
static const unsigned char mblen_table_euccn[] = { /* 0xA1-0xFE */
@ -10509,7 +10530,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
&vtbl_wchar_euccn,
mb_euccn_to_wchar,
mb_wchar_to_euccn,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
@ -10545,7 +10567,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
&vtbl_wchar_euctw,
mb_euctw_to_wchar,
mb_wchar_to_euctw,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
@ -10581,7 +10604,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
&vtbl_wchar_euckr,
mb_euckr_to_wchar,
mb_wchar_to_euckr,
NULL
NULL,
NULL,
};
/* UHC was introduced by MicroSoft in Windows 95, and is also known as CP949.
@ -10640,7 +10664,8 @@ const mbfl_encoding mbfl_encoding_uhc = {
&vtbl_wchar_uhc,
mb_uhc_to_wchar,
mb_wchar_to_uhc,
NULL
NULL,
NULL,
};
/*
@ -11555,7 +11580,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
&vtbl_wchar_gb18030,
mb_gb18030_to_wchar,
mb_wchar_to_gb18030,
NULL
NULL,
NULL,
};
static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};
@ -11591,7 +11617,8 @@ const mbfl_encoding mbfl_encoding_cp936 = {
&vtbl_wchar_cp936,
mb_cp936_to_wchar,
mb_wchar_to_cp936,
NULL
NULL,
NULL,
};
/*
@ -12160,7 +12187,8 @@ const mbfl_encoding mbfl_encoding_big5 = {
&vtbl_wchar_big5,
mb_big5_to_wchar,
mb_wchar_to_big5,
NULL
NULL,
NULL,
};
static const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
@ -12194,7 +12222,8 @@ const mbfl_encoding mbfl_encoding_cp950 = {
&vtbl_wchar_cp950,
mb_cp950_to_wchar,
mb_wchar_to_cp950,
NULL
NULL,
NULL,
};
/*
@ -12567,5 +12596,6 @@ const mbfl_encoding mbfl_encoding_hz = {
&vtbl_wchar_hz,
mb_hz_to_wchar,
mb_wchar_to_hz,
NULL
NULL,
NULL,
};

View File

@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_html_ent = {
&vtbl_wchar_html,
mb_htmlent_to_wchar,
mb_wchar_to_htmlent,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_wchar_html = {

View File

@ -46,7 +46,8 @@ const mbfl_encoding mbfl_encoding_qprint = {
NULL,
mb_qprint_to_wchar,
mb_wchar_to_qprint,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_8bit_qprint = {

View File

@ -87,6 +87,7 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
&vtbl_wchar_##id, \
mb_##id##_to_wchar, \
mb_wchar_to_##id, \
NULL, \
NULL \
}

View File

@ -57,7 +57,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
&vtbl_wchar_ucs2,
mb_ucs2_to_wchar,
mb_wchar_to_ucs2be,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_ucs2be = {
@ -71,7 +72,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
&vtbl_wchar_ucs2be,
mb_ucs2be_to_wchar,
mb_wchar_to_ucs2be,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_ucs2le = {
@ -85,7 +87,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
&vtbl_wchar_ucs2le,
mb_ucs2le_to_wchar,
mb_wchar_to_ucs2le,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_ucs2_wchar = {

View File

@ -57,7 +57,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = {
&vtbl_wchar_ucs4,
mb_ucs4_to_wchar,
mb_wchar_to_ucs4be,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_ucs4be = {
@ -71,7 +72,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = {
&vtbl_wchar_ucs4be,
mb_ucs4be_to_wchar,
mb_wchar_to_ucs4be,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_ucs4le = {
@ -85,7 +87,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = {
&vtbl_wchar_ucs4le,
mb_ucs4le_to_wchar,
mb_wchar_to_ucs4le,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_ucs4_wchar = {

View File

@ -189,7 +189,8 @@ const mbfl_encoding mbfl_encoding_utf16 = {
&vtbl_wchar_utf16,
mb_utf16_to_wchar,
mb_wchar_to_utf16be,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_utf16be = {
@ -203,7 +204,8 @@ const mbfl_encoding mbfl_encoding_utf16be = {
&vtbl_wchar_utf16be,
mb_utf16be_to_wchar,
mb_wchar_to_utf16be,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_utf16le = {
@ -217,7 +219,8 @@ const mbfl_encoding mbfl_encoding_utf16le = {
&vtbl_wchar_utf16le,
mb_utf16le_to_wchar,
mb_wchar_to_utf16le,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {

View File

@ -50,7 +50,8 @@ const mbfl_encoding mbfl_encoding_utf32 = {
&vtbl_wchar_utf32,
mb_utf32_to_wchar,
mb_wchar_to_utf32be,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_utf32be = {
@ -64,7 +65,8 @@ const mbfl_encoding mbfl_encoding_utf32be = {
&vtbl_wchar_utf32be,
mb_utf32be_to_wchar,
mb_wchar_to_utf32be,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_utf32le = {
@ -78,7 +80,8 @@ const mbfl_encoding mbfl_encoding_utf32le = {
&vtbl_wchar_utf32le,
mb_utf32le_to_wchar,
mb_wchar_to_utf32le,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {

View File

@ -62,7 +62,8 @@ const mbfl_encoding mbfl_encoding_utf7 = {
&vtbl_wchar_utf7,
mb_utf7_to_wchar,
mb_wchar_to_utf7,
mb_check_utf7
mb_check_utf7,
NULL,
};
const struct mbfl_convert_vtbl vtbl_utf7_wchar = {

View File

@ -98,7 +98,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = {
&vtbl_wchar_utf7imap,
mb_utf7imap_to_wchar,
mb_wchar_to_utf7imap,
mb_check_utf7imap
mb_check_utf7imap,
NULL,
};
const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = {

View File

@ -51,6 +51,7 @@ const unsigned char mblen_table_utf8[] = {
static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, unsigned char *end);
static const char *mbfl_encoding_utf8_aliases[] = {"utf8", NULL};
@ -65,7 +66,8 @@ const mbfl_encoding mbfl_encoding_utf8 = {
&vtbl_wchar_utf8,
mb_utf8_to_wchar,
mb_wchar_to_utf8,
NULL
NULL,
mb_cut_utf8
};
const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
@ -335,3 +337,21 @@ static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool
MB_CONVERT_BUF_STORE(buf, out, limit);
}
/* Fast, byte-oriented implementation of mb_strcut for UTF-8.
 *
 * str  - first byte of the input string
 * from - byte offset of the requested starting cut point
 * len  - requested maximum length of the result, in bytes
 * end  - pointer just past the last byte of the input string
 *
 * Both cut points are moved backwards to the nearest byte which is not a
 * UTF-8 continuation byte, so a multi-byte character is never split.
 * The selected byte range is handed to zend_string_init_fast and its result
 * is returned.
 *
 * NOTE(review): assumes `str + from` does not lie beyond `end`; the caller is
 * expected to have validated `from` against the string length. */
static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
	unsigned char *start = str + from;
	/* Byte values less than -64 are UTF-8 continuation bytes, that is,
	 * the 2nd, 3rd, or 4th byte of a multi-byte character */
	while (start > str && ((signed char)*start) < -64) {
		start--;
	}

	/* Compare lengths instead of forming `start + len` up front: `len` may be
	 * far larger than what is left of the input, and pointer arithmetic past
	 * the end of the buffer is undefined behavior (on 32-bit platforms it can
	 * wrap around and defeat the bounds check entirely) */
	size_t remaining = (size_t)(end - start);
	if (len >= remaining) {
		return zend_string_init_fast((char*)start, remaining);
	}

	/* Here `start + len` is known to lie strictly inside the input */
	unsigned char *stop = start + len;
	while (stop > start && ((signed char)*stop) < -64) {
		stop--;
	}
	return zend_string_init_fast((char*)start, (size_t)(stop - start));
}

View File

@ -124,7 +124,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = {
&vtbl_wchar_utf8_docomo,
mb_utf8_docomo_to_wchar,
mb_wchar_to_utf8_docomo,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
@ -138,7 +139,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
&vtbl_wchar_utf8_kddi_a,
mb_utf8_kddi_a_to_wchar,
mb_wchar_to_utf8_kddi_a,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
@ -152,7 +154,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
&vtbl_wchar_utf8_kddi_b,
mb_utf8_kddi_b_to_wchar,
mb_wchar_to_utf8_kddi_b,
NULL
NULL,
NULL,
};
const mbfl_encoding mbfl_encoding_utf8_sb = {
@ -166,7 +169,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = {
&vtbl_wchar_utf8_sb,
mb_utf8_sb_to_wchar,
mb_wchar_to_utf8_sb,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {

View File

@ -44,7 +44,8 @@ const mbfl_encoding mbfl_encoding_uuencode = {
NULL,
mb_uuencode_to_wchar,
mb_wchar_to_uuencode,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_uuencode_8bit = {

View File

@ -52,7 +52,8 @@ const mbfl_encoding mbfl_encoding_8bit = {
&vtbl_wchar_8bit,
mb_8bit_to_wchar,
mb_wchar_to_8bit,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_8bit_wchar = {

View File

@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_pass = {
NULL,
NULL,
NULL,
NULL
NULL,
NULL,
};
const struct mbfl_convert_vtbl vtbl_pass = {

View File

@ -43,5 +43,6 @@ const mbfl_encoding mbfl_encoding_wchar = {
NULL,
NULL,
NULL,
NULL
NULL,
NULL,
};

View File

@ -145,6 +145,7 @@ typedef struct {
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);
typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len);
/* Optional encoding-specific fast path used by mb_strcut.
 * It is called with the start of the string (`str`), the starting offset and
 * length of the requested cut (`from`, `len`), and a pointer just past the
 * last byte of the string (`end`), and returns the resulting string.
 * When an encoding's `cut` member is NULL, mb_strcut falls back to mbfl_strcut. */
typedef zend_string* (*mb_cut_fn)(unsigned char *str, size_t from, size_t len, unsigned char *end);
/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
* the buffer must be at least this size (to work with all supported text encodings) */
@ -251,6 +252,7 @@ typedef struct {
mb_to_wchar_fn to_wchar;
mb_from_wchar_fn from_wchar;
mb_check_fn check;
mb_cut_fn cut;
} mbfl_encoding;
extern const mbfl_encoding mbfl_encoding_utf8;

View File

@ -2403,19 +2403,20 @@ PHP_FUNCTION(mb_strcut)
Z_PARAM_STR_OR_NULL(encoding)
ZEND_PARSE_PARAMETERS_END();
string.val = (unsigned char*)string_val;
string.encoding = php_mb_get_encoding(encoding, 4);
if (!string.encoding) {
const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
if (!enc) {
RETURN_THROWS();
}
string.val = (unsigned char*)string_val;
string.encoding = enc;
if (len_is_null) {
len = string.len;
}
/* if "from" position is negative, count start position from the end
* of the string
*/
* of the string */
if (from < 0) {
from = string.len + from;
if (from < 0) {
@ -2424,8 +2425,7 @@ PHP_FUNCTION(mb_strcut)
}
/* if "length" position is negative, set it to the length
* needed to stop that many chars from the end of the string
*/
* needed to stop that many chars from the end of the string */
if (len < 0) {
len = (string.len - from) + len;
if (len < 0) {
@ -2437,12 +2437,14 @@ PHP_FUNCTION(mb_strcut)
RETURN_EMPTY_STRING();
}
ret = mbfl_strcut(&string, &result, from, len);
ZEND_ASSERT(ret != NULL);
// TODO: avoid reallocation ???
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
efree(ret->val);
if (enc->cut) {
RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
} else {
ret = mbfl_strcut(&string, &result, from, len);
ZEND_ASSERT(ret != NULL);
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
efree(ret->val);
}
}
/* }}} */