mirror of
https://github.com/php/php-src.git
synced 2024-11-23 18:04:36 +08:00
Add fast mb_strcut implementation for UTF-8
The old implementation runs through the entire string to pick out the part which should be returned by mb_strcut. This creates significant performance overhead. The new specialized implementation of mb_strcut for UTF-8 usually only examines a few bytes around the starting and ending cut points, meaning it generally runs in constant time. For UTF-8 strings just a few bytes long, the new implementation is around 10% faster (according to microbenchmarks which I ran locally). For strings around 10,000 bytes in length, it is 50-300x faster. (Yes, that is 300x and not 300%.) The new implementation behaves identically to the old one on VALID UTF-8 strings; a fuzzer was used to help ensure this is the case. On invalid UTF-8 strings, there is a difference: in some cases, the old implementation will pass invalid byte sequences through unchanged, while in others it will remove them. The new implementation has behavior which is perhaps slightly more predictable: it simply backs up the starting and ending cut points to the preceding "starter byte" (one which is not a UTF-8 continuation byte).
This commit is contained in:
parent
3fa836f711
commit
1f0cf133db
@ -65,7 +65,8 @@ const mbfl_encoding mbfl_encoding_7bit = {
|
||||
&vtbl_wchar_7bit,
|
||||
mb_7bit_to_wchar,
|
||||
mb_wchar_to_7bit,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
|
||||
|
@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_base64 = {
|
||||
NULL,
|
||||
mb_base64_to_wchar,
|
||||
mb_wchar_to_base64,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_8bit_b64 = {
|
||||
|
@ -4392,7 +4392,8 @@ const mbfl_encoding mbfl_encoding_jis = {
|
||||
&vtbl_wchar_jis,
|
||||
mb_iso2022jp_to_wchar,
|
||||
mb_wchar_to_jis,
|
||||
mb_check_jis
|
||||
mb_check_jis,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
|
||||
@ -4426,7 +4427,8 @@ const mbfl_encoding mbfl_encoding_2022jp = {
|
||||
&vtbl_wchar_2022jp,
|
||||
mb_iso2022jp_to_wchar,
|
||||
mb_wchar_to_iso2022jp,
|
||||
mb_check_iso2022jp
|
||||
mb_check_iso2022jp,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};
|
||||
@ -4462,7 +4464,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
|
||||
&vtbl_wchar_2022jp_kddi,
|
||||
mb_iso2022jp_kddi_to_wchar,
|
||||
mb_wchar_to_iso2022jp_kddi,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
|
||||
@ -4496,7 +4499,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
|
||||
&vtbl_wchar_2022jp_2004,
|
||||
mb_iso2022jp2004_to_wchar,
|
||||
mb_wchar_to_iso2022jp2004,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
|
||||
@ -4581,7 +4585,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
|
||||
&vtbl_wchar_cp50220,
|
||||
mb_cp5022x_to_wchar,
|
||||
mb_wchar_to_cp50220,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_cp50221 = {
|
||||
@ -4595,7 +4600,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
|
||||
&vtbl_wchar_cp50221,
|
||||
mb_cp5022x_to_wchar,
|
||||
mb_wchar_to_cp50221,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_cp50222 = {
|
||||
@ -4609,7 +4615,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
|
||||
&vtbl_wchar_cp50222,
|
||||
mb_cp5022x_to_wchar,
|
||||
mb_wchar_to_cp50222,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};
|
||||
@ -4645,7 +4652,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
|
||||
&vtbl_wchar_2022jpms,
|
||||
mb_iso2022jpms_to_wchar,
|
||||
mb_wchar_to_iso2022jpms,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* ISO-2022-KR is defined in RFC 1557
|
||||
@ -4687,7 +4695,8 @@ const mbfl_encoding mbfl_encoding_2022kr = {
|
||||
&vtbl_wchar_2022kr,
|
||||
mb_iso2022kr_to_wchar,
|
||||
mb_wchar_to_iso2022kr,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -7832,7 +7841,8 @@ const mbfl_encoding mbfl_encoding_sjis = {
|
||||
&vtbl_wchar_sjis,
|
||||
mb_sjis_to_wchar,
|
||||
mb_wchar_to_sjis,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL};
|
||||
@ -7868,7 +7878,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
|
||||
&vtbl_wchar_sjis_mac,
|
||||
mb_sjismac_to_wchar,
|
||||
mb_wchar_to_sjismac,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_jis-imode", "x-sjis-emoji-docomo", NULL};
|
||||
@ -7906,7 +7917,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = {
|
||||
&vtbl_wchar_sjis_docomo,
|
||||
mb_sjis_docomo_to_wchar,
|
||||
mb_wchar_to_sjis_docomo,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct mbfl_convert_vtbl vtbl_sjis_kddi_wchar = {
|
||||
@ -7940,7 +7952,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = {
|
||||
&vtbl_wchar_sjis_kddi,
|
||||
mb_sjis_kddi_to_wchar,
|
||||
mb_wchar_to_sjis_kddi,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct mbfl_convert_vtbl vtbl_sjis_sb_wchar = {
|
||||
@ -7974,7 +7987,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = {
|
||||
&vtbl_wchar_sjis_sb,
|
||||
mb_sjis_sb_to_wchar,
|
||||
mb_wchar_to_sjis_sb,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* Although the specification for Shift-JIS-2004 indicates that 0x5C and
|
||||
@ -8017,7 +8031,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
|
||||
&vtbl_wchar_sjis2004,
|
||||
mb_sjis2004_to_wchar,
|
||||
mb_wchar_to_sjis2004,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* CP932 is Microsoft's version of Shift-JIS.
|
||||
@ -8103,7 +8118,8 @@ const mbfl_encoding mbfl_encoding_cp932 = {
|
||||
&vtbl_wchar_cp932,
|
||||
mb_cp932_to_wchar,
|
||||
mb_wchar_to_cp932,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
|
||||
@ -8137,7 +8153,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = {
|
||||
&vtbl_wchar_sjiswin,
|
||||
mb_cp932_to_wchar,
|
||||
mb_wchar_to_sjiswin,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -10346,7 +10363,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
|
||||
&vtbl_wchar_eucjp,
|
||||
mb_eucjp_to_wchar,
|
||||
mb_wchar_to_eucjp,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL};
|
||||
@ -10382,7 +10400,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
|
||||
&vtbl_wchar_eucjp2004,
|
||||
mb_eucjp2004_to_wchar,
|
||||
mb_wchar_to_eucjp2004,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open", "eucJP-ms", NULL};
|
||||
@ -10418,7 +10437,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
|
||||
&vtbl_wchar_eucjpwin,
|
||||
mb_eucjpwin_to_wchar,
|
||||
mb_wchar_to_eucjpwin,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL};
|
||||
@ -10454,7 +10474,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
|
||||
&vtbl_wchar_cp51932,
|
||||
mb_cp51932_to_wchar,
|
||||
mb_wchar_to_cp51932,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const unsigned char mblen_table_euccn[] = { /* 0xA1-0xFE */
|
||||
@ -10509,7 +10530,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
|
||||
&vtbl_wchar_euccn,
|
||||
mb_euccn_to_wchar,
|
||||
mb_wchar_to_euccn,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
|
||||
@ -10545,7 +10567,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
|
||||
&vtbl_wchar_euctw,
|
||||
mb_euctw_to_wchar,
|
||||
mb_wchar_to_euctw,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
|
||||
@ -10581,7 +10604,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
|
||||
&vtbl_wchar_euckr,
|
||||
mb_euckr_to_wchar,
|
||||
mb_wchar_to_euckr,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/* UHC was introduced by MicroSoft in Windows 95, and is also known as CP949.
|
||||
@ -10640,7 +10664,8 @@ const mbfl_encoding mbfl_encoding_uhc = {
|
||||
&vtbl_wchar_uhc,
|
||||
mb_uhc_to_wchar,
|
||||
mb_wchar_to_uhc,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -11555,7 +11580,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
|
||||
&vtbl_wchar_gb18030,
|
||||
mb_gb18030_to_wchar,
|
||||
mb_wchar_to_gb18030,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};
|
||||
@ -11591,7 +11617,8 @@ const mbfl_encoding mbfl_encoding_cp936 = {
|
||||
&vtbl_wchar_cp936,
|
||||
mb_cp936_to_wchar,
|
||||
mb_wchar_to_cp936,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -12160,7 +12187,8 @@ const mbfl_encoding mbfl_encoding_big5 = {
|
||||
&vtbl_wchar_big5,
|
||||
mb_big5_to_wchar,
|
||||
mb_wchar_to_big5,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct mbfl_convert_vtbl vtbl_cp950_wchar = {
|
||||
@ -12194,7 +12222,8 @@ const mbfl_encoding mbfl_encoding_cp950 = {
|
||||
&vtbl_wchar_cp950,
|
||||
mb_cp950_to_wchar,
|
||||
mb_wchar_to_cp950,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -12567,5 +12596,6 @@ const mbfl_encoding mbfl_encoding_hz = {
|
||||
&vtbl_wchar_hz,
|
||||
mb_hz_to_wchar,
|
||||
mb_wchar_to_hz,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_html_ent = {
|
||||
&vtbl_wchar_html,
|
||||
mb_htmlent_to_wchar,
|
||||
mb_wchar_to_htmlent,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_wchar_html = {
|
||||
|
@ -46,7 +46,8 @@ const mbfl_encoding mbfl_encoding_qprint = {
|
||||
NULL,
|
||||
mb_qprint_to_wchar,
|
||||
mb_wchar_to_qprint,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_8bit_qprint = {
|
||||
|
@ -87,6 +87,7 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
|
||||
&vtbl_wchar_##id, \
|
||||
mb_##id##_to_wchar, \
|
||||
mb_wchar_to_##id, \
|
||||
NULL, \
|
||||
NULL \
|
||||
}
|
||||
|
||||
|
@ -57,7 +57,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
|
||||
&vtbl_wchar_ucs2,
|
||||
mb_ucs2_to_wchar,
|
||||
mb_wchar_to_ucs2be,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs2be = {
|
||||
@ -71,7 +72,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
|
||||
&vtbl_wchar_ucs2be,
|
||||
mb_ucs2be_to_wchar,
|
||||
mb_wchar_to_ucs2be,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs2le = {
|
||||
@ -85,7 +87,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
|
||||
&vtbl_wchar_ucs2le,
|
||||
mb_ucs2le_to_wchar,
|
||||
mb_wchar_to_ucs2le,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_ucs2_wchar = {
|
||||
|
@ -57,7 +57,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = {
|
||||
&vtbl_wchar_ucs4,
|
||||
mb_ucs4_to_wchar,
|
||||
mb_wchar_to_ucs4be,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs4be = {
|
||||
@ -71,7 +72,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = {
|
||||
&vtbl_wchar_ucs4be,
|
||||
mb_ucs4be_to_wchar,
|
||||
mb_wchar_to_ucs4be,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs4le = {
|
||||
@ -85,7 +87,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = {
|
||||
&vtbl_wchar_ucs4le,
|
||||
mb_ucs4le_to_wchar,
|
||||
mb_wchar_to_ucs4le,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_ucs4_wchar = {
|
||||
|
@ -189,7 +189,8 @@ const mbfl_encoding mbfl_encoding_utf16 = {
|
||||
&vtbl_wchar_utf16,
|
||||
mb_utf16_to_wchar,
|
||||
mb_wchar_to_utf16be,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf16be = {
|
||||
@ -203,7 +204,8 @@ const mbfl_encoding mbfl_encoding_utf16be = {
|
||||
&vtbl_wchar_utf16be,
|
||||
mb_utf16be_to_wchar,
|
||||
mb_wchar_to_utf16be,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf16le = {
|
||||
@ -217,7 +219,8 @@ const mbfl_encoding mbfl_encoding_utf16le = {
|
||||
&vtbl_wchar_utf16le,
|
||||
mb_utf16le_to_wchar,
|
||||
mb_wchar_to_utf16le,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
|
||||
|
@ -50,7 +50,8 @@ const mbfl_encoding mbfl_encoding_utf32 = {
|
||||
&vtbl_wchar_utf32,
|
||||
mb_utf32_to_wchar,
|
||||
mb_wchar_to_utf32be,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf32be = {
|
||||
@ -64,7 +65,8 @@ const mbfl_encoding mbfl_encoding_utf32be = {
|
||||
&vtbl_wchar_utf32be,
|
||||
mb_utf32be_to_wchar,
|
||||
mb_wchar_to_utf32be,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf32le = {
|
||||
@ -78,7 +80,8 @@ const mbfl_encoding mbfl_encoding_utf32le = {
|
||||
&vtbl_wchar_utf32le,
|
||||
mb_utf32le_to_wchar,
|
||||
mb_wchar_to_utf32le,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
|
||||
|
@ -62,7 +62,8 @@ const mbfl_encoding mbfl_encoding_utf7 = {
|
||||
&vtbl_wchar_utf7,
|
||||
mb_utf7_to_wchar,
|
||||
mb_wchar_to_utf7,
|
||||
mb_check_utf7
|
||||
mb_check_utf7,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf7_wchar = {
|
||||
|
@ -98,7 +98,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = {
|
||||
&vtbl_wchar_utf7imap,
|
||||
mb_utf7imap_to_wchar,
|
||||
mb_wchar_to_utf7imap,
|
||||
mb_check_utf7imap
|
||||
mb_check_utf7imap,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = {
|
||||
|
@ -51,6 +51,7 @@ const unsigned char mblen_table_utf8[] = {
|
||||
|
||||
static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, unsigned char *end);
|
||||
|
||||
static const char *mbfl_encoding_utf8_aliases[] = {"utf8", NULL};
|
||||
|
||||
@ -65,7 +66,8 @@ const mbfl_encoding mbfl_encoding_utf8 = {
|
||||
&vtbl_wchar_utf8,
|
||||
mb_utf8_to_wchar,
|
||||
mb_wchar_to_utf8,
|
||||
NULL
|
||||
NULL,
|
||||
mb_cut_utf8
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
|
||||
@ -335,3 +337,21 @@ static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool
|
||||
|
||||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
/* Fast mb_strcut implementation for UTF-8.
 *
 * Returns a newly created string holding a byte range of the buffer [str, end),
 * beginning near byte offset `from` and spanning at most `len` bytes. Both cut
 * points are moved backwards until they no longer rest on a UTF-8 continuation
 * byte, so a multi-byte character is never split in the middle. Only the bytes
 * around the two cut points are inspected.
 *
 * str  - first byte of the input
 * from - byte offset of the requested starting cut point
 *        (assumes from <= end - str; presumably guaranteed by the caller - verify)
 * len  - maximum number of bytes to return, counted from the adjusted start
 * end  - one past the last byte of the input
 *
 * NOTE(review): when a cut point equals `end`, the byte at `end` is read; this
 * assumes the input buffer has a readable terminator byte there - confirm. */
static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
	unsigned char *start = str + from;

	/* Bytes of the form 10xxxxxx (0x80-0xBF) are UTF-8 continuation bytes, that is,
	 * the 2nd, 3rd, or 4th byte of a multi-byte character. Testing the bit pattern
	 * directly avoids relying on implementation-defined conversion to signed char */
	while (start > str && (*start & 0xC0) == 0x80) {
		start--;
	}

	/* Compare byte counts instead of forming `start + len` first: `len` may be far
	 * larger than the remaining input, and a pointer computed beyond the end of the
	 * buffer is undefined behavior (and could wrap around, defeating the check) */
	if (len >= (size_t)(end - start)) {
		return zend_string_init_fast((char*)start, end - start);
	}

	/* `start + len` is now known to lie strictly inside the buffer */
	unsigned char *stop = start + len;
	while (stop > start && (*stop & 0xC0) == 0x80) {
		stop--;
	}

	return zend_string_init_fast((char*)start, stop - start);
}
|
||||
|
@ -124,7 +124,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = {
|
||||
&vtbl_wchar_utf8_docomo,
|
||||
mb_utf8_docomo_to_wchar,
|
||||
mb_wchar_to_utf8_docomo,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
|
||||
@ -138,7 +139,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
|
||||
&vtbl_wchar_utf8_kddi_a,
|
||||
mb_utf8_kddi_a_to_wchar,
|
||||
mb_wchar_to_utf8_kddi_a,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
|
||||
@ -152,7 +154,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
|
||||
&vtbl_wchar_utf8_kddi_b,
|
||||
mb_utf8_kddi_b_to_wchar,
|
||||
mb_wchar_to_utf8_kddi_b,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_sb = {
|
||||
@ -166,7 +169,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = {
|
||||
&vtbl_wchar_utf8_sb,
|
||||
mb_utf8_sb_to_wchar,
|
||||
mb_wchar_to_utf8_sb,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {
|
||||
|
@ -44,7 +44,8 @@ const mbfl_encoding mbfl_encoding_uuencode = {
|
||||
NULL,
|
||||
mb_uuencode_to_wchar,
|
||||
mb_wchar_to_uuencode,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_uuencode_8bit = {
|
||||
|
@ -52,7 +52,8 @@ const mbfl_encoding mbfl_encoding_8bit = {
|
||||
&vtbl_wchar_8bit,
|
||||
mb_8bit_to_wchar,
|
||||
mb_wchar_to_8bit,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_8bit_wchar = {
|
||||
|
@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_pass = {
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_pass = {
|
||||
|
@ -43,5 +43,6 @@ const mbfl_encoding mbfl_encoding_wchar = {
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
@ -145,6 +145,7 @@ typedef struct {
|
||||
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
|
||||
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);
|
||||
typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len);
|
||||
/* Optional encoding-specific implementation of mb_strcut. mb_strcut invokes it as
 * cut(first byte of string, from, len, first byte of string + string length) and
 * returns the resulting zend_string; encodings without one leave this NULL */
typedef zend_string* (*mb_cut_fn)(unsigned char *str, size_t from, size_t len, unsigned char *end);
|
||||
|
||||
/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
|
||||
* the buffer must be at least this size (to work with all supported text encodings) */
|
||||
@ -251,6 +252,7 @@ typedef struct {
|
||||
mb_to_wchar_fn to_wchar;
|
||||
mb_from_wchar_fn from_wchar;
|
||||
mb_check_fn check;
|
||||
mb_cut_fn cut;
|
||||
} mbfl_encoding;
|
||||
|
||||
extern const mbfl_encoding mbfl_encoding_utf8;
|
||||
|
@ -2403,19 +2403,20 @@ PHP_FUNCTION(mb_strcut)
|
||||
Z_PARAM_STR_OR_NULL(encoding)
|
||||
ZEND_PARSE_PARAMETERS_END();
|
||||
|
||||
string.val = (unsigned char*)string_val;
|
||||
string.encoding = php_mb_get_encoding(encoding, 4);
|
||||
if (!string.encoding) {
|
||||
const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
|
||||
if (!enc) {
|
||||
RETURN_THROWS();
|
||||
}
|
||||
|
||||
string.val = (unsigned char*)string_val;
|
||||
string.encoding = enc;
|
||||
|
||||
if (len_is_null) {
|
||||
len = string.len;
|
||||
}
|
||||
|
||||
/* if "from" position is negative, count start position from the end
|
||||
* of the string
|
||||
*/
|
||||
* of the string */
|
||||
if (from < 0) {
|
||||
from = string.len + from;
|
||||
if (from < 0) {
|
||||
@ -2424,8 +2425,7 @@ PHP_FUNCTION(mb_strcut)
|
||||
}
|
||||
|
||||
/* if "length" position is negative, set it to the length
|
||||
* needed to stop that many chars from the end of the string
|
||||
*/
|
||||
* needed to stop that many chars from the end of the string */
|
||||
if (len < 0) {
|
||||
len = (string.len - from) + len;
|
||||
if (len < 0) {
|
||||
@ -2437,12 +2437,14 @@ PHP_FUNCTION(mb_strcut)
|
||||
RETURN_EMPTY_STRING();
|
||||
}
|
||||
|
||||
ret = mbfl_strcut(&string, &result, from, len);
|
||||
ZEND_ASSERT(ret != NULL);
|
||||
|
||||
// TODO: avoid reallocation ???
|
||||
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
|
||||
efree(ret->val);
|
||||
if (enc->cut) {
|
||||
RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
|
||||
} else {
|
||||
ret = mbfl_strcut(&string, &result, from, len);
|
||||
ZEND_ASSERT(ret != NULL);
|
||||
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
|
||||
efree(ret->val);
|
||||
}
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user