From 9ac49c0dd3c79decd80421eabef12c9e6f992aaf Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Tue, 12 Jul 2022 21:00:35 +0200 Subject: [PATCH] New implementation of mb_convert_kana mb_convert_kana now uses the new text encoding conversion filters. Microbenchmarking shows speed gains of 50%-150% across various text encodings and input string lengths. The behavior is the same as the old mb_convert_kana except for one fix: if the 'zero codepoint' U+0000 appeared in the input, the old implementation would sometimes drop it, not passing it through to the output. This is now fixed. --- ext/mbstring/config.m4 | 1 - ext/mbstring/config.w32 | 3 +- .../libmbfl/filters/mbfilter_cp5022x.c | 11 +- .../filters/mbfilter_tl_jisx0201_jisx0208.c | 252 --------------- .../filters/mbfilter_tl_jisx0201_jisx0208.h | 56 ---- .../filters/translit_kana_jisx0201_jisx0208.h | 22 ++ ext/mbstring/libmbfl/mbfl/mbfilter.c | 81 ----- ext/mbstring/libmbfl/mbfl/mbfilter.h | 6 - ext/mbstring/mbstring.c | 286 ++++++++++++++++-- ext/mbstring/tests/mb_convert_kana.phpt | 11 + 10 files changed, 308 insertions(+), 421 deletions(-) delete mode 100644 ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c delete mode 100644 ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h diff --git a/ext/mbstring/config.m4 b/ext/mbstring/config.m4 index 65b8f2c4a60..7b16ff75acc 100644 --- a/ext/mbstring/config.m4 +++ b/ext/mbstring/config.m4 @@ -118,7 +118,6 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [ libmbfl/filters/mbfilter_sjis_mobile.c libmbfl/filters/mbfilter_sjis_mac.c libmbfl/filters/mbfilter_sjis_2004.c - libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c libmbfl/filters/mbfilter_ucs2.c libmbfl/filters/mbfilter_ucs4.c libmbfl/filters/mbfilter_uhc.c diff --git a/ext/mbstring/config.w32 b/ext/mbstring/config.w32 index d415be6cf74..78350bb1e13 100644 --- a/ext/mbstring/config.w32 +++ b/ext/mbstring/config.w32 @@ -28,8 +28,7 @@ if (PHP_MBSTRING != "no") { mbfilter_utf8_mobile.c mbfilter_uuencode.c \ 
mbfilter_cp5022x.c mbfilter_sjis_mobile.c \ mbfilter_sjis_mac.c \ - mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c \ - mbfilter_tl_jisx0201_jisx0208.c", "mbstring"); + mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c", "mbstring"); ADD_SOURCES("ext/mbstring/libmbfl/mbfl", "mbfilter.c mbfilter_8bit.c \ mbfilter_pass.c mbfilter_wchar.c mbfl_convert.c mbfl_encoding.c \ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c index f9e64c32589..a28895a9d02 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c @@ -25,11 +25,11 @@ #include "mbfilter.h" #include "mbfilter_cp5022x.h" #include "mbfilter_jis.h" -#include "mbfilter_tl_jisx0201_jisx0208.h" #include "unicode_table_cp932_ext.h" #include "unicode_table_jis.h" #include "cp932_table.h" +#include "translit_kana_jisx0201_jisx0208.h" static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter); @@ -40,6 +40,9 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +/* Defined in mbstring.c; this prototype must stay identical to that definition (note: `mode` is unsigned) */ +uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode); + /* Previously, a dubious 'encoding' called 'cp50220raw' was supported * This was just CP50220, but the implementation was less strict regarding * invalid characters; it would silently pass some through @@ -336,7 +339,7 @@ static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter) bool consumed = false; if (filter->cache) { - int s = mbfl_convert_kana(filter->cache, c, &consumed, NULL, mode); + int s = mb_convert_kana_codepoint(filter->cache, c, &consumed, NULL, mode); filter->cache = consumed ? 
0 : c; /* Terrible hack to get CP50220 to emit error markers in the proper * position, not reordering them with subsequent characters */ @@ -359,7 +362,7 @@ static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter) int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE; if (filter->cache) { - int s = mbfl_convert_kana(filter->cache, 0, NULL, NULL, mode); + int s = mb_convert_kana_codepoint(filter->cache, 0, NULL, NULL, mode); mbfl_filt_conv_wchar_cp50221(s, filter); filter->cache = 0; } @@ -866,7 +869,7 @@ reprocess_codepoint: buf->state |= w << 8; break; } else { - w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE); + w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE); } if (consumed) { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c b/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c deleted file mode 100644 index 58feeedb8a9..00000000000 --- a/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c +++ /dev/null @@ -1,252 +0,0 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The author of this file: Moriyoshi Koizumi - * - */ - -#include -#include -#include "mbfilter_tl_jisx0201_jisx0208.h" -#include "translit_kana_jisx0201_jisx0208.h" - -/* Apply various transforms to input codepoint, such as converting halfwidth katakana - * to fullwidth katakana. `mode` is a bitfield which controls which transforms are - * actually performed. The bit values are defined in mbfilter_tl_jisx0201_jix0208.h. - * `mode` must not call for transforms which are inverses (i.e. which would cancel - * each other out). - * - * In some cases, successive input codepoints may be merged into one output codepoint. - * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed - * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed` - * will not be modified. If there is no following codepoint, `next` should be zero. - * - * Again, in some cases, one input codepoint may convert to two output codepoints. - * If so, the second output codepoint will be stored in `*second`. - * - * Return the resulting codepoint. If none of the requested transforms apply, return - * the input codepoint unchanged. 
- */ -int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode) -{ - if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7d && c != '"' && c != '\'' && c != '\\') { - return c + 0xfee0; - } else if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) { - return c + 0xfee0; - } else if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') { - return c + 0xfee0; - } else if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') { - return 0x3000; - } - - if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) { - /* Convert Hankaku kana to Zenkaku kana - * Either all Hankaku kana (including katakana and hiragana) will be converted - * to Zenkaku katakana, or to Zenkaku hiragana */ - if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) { - if (c >= 0xff61 && c <= 0xff9f) { - int n = c - 0xff60; - if (next >= 0xff61 && next <= 0xff9f) { - if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) { - *consumed = true; - return 0x3001 + hankana2zenkana_table[n]; - } else if (next == 0xff9e && n == 19) { - *consumed = true; - return 0x30f4; - } else if (next == 0xff9f && n >= 42 && n <= 46) { - *consumed = true; - return 0x3002 + hankana2zenkana_table[n]; - } - } - - return 0x3000 + hankana2zenkana_table[n]; - } - } else if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) { - if (c >= 0xff61 && c <= 0xff9f) { - int n = c - 0xff60; - if (next >= 0xff61 && next <= 0xff9f) { - if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) { - *consumed = true; - return 0x3001 + hankana2zenhira_table[n]; - } else if (next == 0xff9f && n >= 42 && n <= 46) { - *consumed = true; - return 0x3002 + hankana2zenhira_table[n]; - } - } - - return 0x3000 + hankana2zenhira_table[n]; - } - } else if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xff61 && c <= 0xff9f) { - return 0x3000 + hankana2zenkana_table[c - 0xff60]; - } else if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xff61 && c <= 0xff9f) { - 
return 0x3000 + hankana2zenhira_table[c - 0xff60]; - } - } - - if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */ - if (c == 0x5c) { - return 0xffe5; /* FULLWIDTH YEN SIGN */ - } else if (c == 0xa5) { /* YEN SIGN */ - return 0xffe5; /* FULLWIDTH YEN SIGN */ - } else if (c == 0x7e) { - return 0xffe3; /* FULLWIDTH MACRON */ - } else if (c == 0x203e) { /* OVERLINE */ - return 0xffe3; /* FULLWIDTH MACRON */ - } else if (c == 0x27) { - return 0x2019; /* RIGHT SINGLE QUOTATION MARK */ - } else if (c == 0x22) { - return 0x201d; /* RIGHT DOUBLE QUOTATION MARK */ - } - } - - if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) { - /* Zenkaku to Hankaku */ - if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xff01 && c <= 0xff5d && c != 0xff02 && c != 0xff07 && c != 0xff3c) { - /* all except " ' \ ~ */ - return c - 0xfee0; - } else if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xff21 && c <= 0xff3a) || (c >= 0xff41 && c <= 0xff5a))) { - return c - 0xfee0; - } else if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xff10 && c <= 0xff19)) { - return c - 0xfee0; - } else if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) { - return 0x20; - } else if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */ - return 0x2d; - } - } - - if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) { - /* Zenkaku kana to hankaku kana */ - if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30a1 && c <= 0x30f4) { - /* Zenkaku katakana to hankaku kana */ - int n = c - 0x30a1; - if (zenkana2hankana_table[n][1]) { - *second = 0xff00 + zenkana2hankana_table[n][1]; - } - return 0xff00 + zenkana2hankana_table[n][0]; - } else if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) { - /* Zenkaku hiragana to hankaku kana */ - int n = c - 0x3041; - if (zenkana2hankana_table[n][1]) { - *second = 0xff00 + zenkana2hankana_table[n][1]; - } - return 0xff00 + zenkana2hankana_table[n][0]; - } else if (c == 0x3001) { - return 0xff64; /* HALFWIDTH IDEOGRAPHIC 
COMMA */ - } else if (c == 0x3002) { - return 0xff61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */ - } else if (c == 0x300c) { - return 0xff62; /* HALFWIDTH LEFT CORNER BRACKET */ - } else if (c == 0x300d) { - return 0xff63; /* HALFWIDTH RIGHT CORNER BRACKET */ - } else if (c == 0x309b) { - return 0xff9e; /* HALFWIDTH KATAKANA VOICED SOUND MARK */ - } else if (c == 0x309c) { - return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ - } else if (c == 0x30fc) { - return 0xff70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ - } else if (c == 0x30fb) { - return 0xff65; /* HALFWIDTH KATAKANA MIDDLE DOT */ - } - } else if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) { - if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309d || c == 0x309e)) { - /* Zenkaku hiragana to Zenkaku katakana */ - return c + 0x60; - } else if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30a1 && c <= 0x30f3) || c == 0x30fd || c == 0x30fe)) { - /* Zenkaku katakana to Zenkaku hiragana */ - return c - 0x60; - } - } - - if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */ - if (c == 0xffe5) { /* FULLWIDTH YEN SIGN */ - return 0x5c; - } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ - return 0x5c; - } else if (c == 0xffe3) { /* FULLWIDTH MACRON */ - return 0x7e; - } else if (c == 0x203e) { /* OVERLINE */ - return 0x7e; - } else if (c == 0x2018) { /* LEFT SINGLE QUOTATION MARK*/ - return 0x27; - } else if (c == 0x2019) { /* RIGHT SINGLE QUOTATION MARK */ - return 0x27; - } else if (c == 0x201c) { /* LEFT DOUBLE QUOTATION MARK */ - return 0x22; - } else if (c == 0x201d) { /* RIGHT DOUBLE QUOTATION MARK */ - return 0x22; - } - } - - return c; -} - -int mbfl_filt_tl_jisx0201_jisx0208(int c, mbfl_convert_filter *filt) -{ - int mode = (intptr_t)filt->opaque, second = 0; - bool consumed = false; - - if (filt->cache) { - int s = mbfl_convert_kana(filt->cache, c, &consumed, &second, mode); - filt->cache = consumed ? 
0 : c; - (*filt->output_function)(s, filt->data); - if (second) { - (*filt->output_function)(second, filt->data); - } - } else if (c == 0) { - /* This case has to be handled separately, since `filt->cache == 0` means no - * codepoint is cached */ - (*filt->output_function)(0, filt->data); - } else { - filt->cache = c; - } - - return 0; -} - -int mbfl_filt_tl_jisx0201_jisx0208_flush(mbfl_convert_filter *filt) -{ - int mode = (intptr_t)filt->opaque, second = 0; - - if (filt->cache) { - int s = mbfl_convert_kana(filt->cache, 0, NULL, &second, mode); - (*filt->output_function)(s, filt->data); - if (second) { - (*filt->output_function)(second, filt->data); - } - filt->cache = 0; - } - - if (filt->flush_function) { - return (*filt->flush_function)(filt->data); - } - - return 0; -} - -const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_tl_jisx0201_jisx0208, - mbfl_filt_tl_jisx0201_jisx0208_flush, - NULL, -}; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h b/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h deleted file mode 100644 index 844a858e71b..00000000000 --- a/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * "streamable kanji code filter and converter" - * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved. - * - * LICENSE NOTICES - * - * This file is part of "streamable kanji code filter and converter", - * which is distributed under the terms of GNU Lesser General Public - * License (version 2) as published by the Free Software Foundation. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with "streamable kanji code filter and converter"; - * if not, write to the Free Software Foundation, Inc., 59 Temple Place, - * Suite 330, Boston, MA 02111-1307 USA - * - * The author of this file: Moriyoshi Koizumi - * - */ - -#ifndef MBFILTER_TL_KANA_JISX0201_JISX0208_H -#define MBFILTER_TL_KANA_JISX0201_JISX0208_H - -#include "mbfl_convert.h" - -/* "Zen" is 全, or "full"; "Han" is 半, or "half" - * This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */ -#define MBFL_HAN2ZEN_ALL 0x00001 -#define MBFL_HAN2ZEN_ALPHA 0x00002 -#define MBFL_HAN2ZEN_NUMERIC 0x00004 -#define MBFL_HAN2ZEN_SPACE 0x00008 -#define MBFL_HAN2ZEN_KATAKANA 0x00010 -#define MBFL_HAN2ZEN_HIRAGANA 0x00020 -#define MBFL_HAN2ZEN_SPECIAL 0x00040 -#define MBFL_ZENKAKU_HIRA2KATA 0x00080 - -#define MBFL_ZEN2HAN_ALL 0x00100 -#define MBFL_ZEN2HAN_ALPHA 0x00200 -#define MBFL_ZEN2HAN_NUMERIC 0x00400 -#define MBFL_ZEN2HAN_SPACE 0x00800 -#define MBFL_ZEN2HAN_KATAKANA 0x01000 -#define MBFL_ZEN2HAN_HIRAGANA 0x02000 -#define MBFL_ZEN2HAN_SPECIAL 0x04000 -#define MBFL_ZENKAKU_KATA2HIRA 0x08000 - -#define MBFL_HAN2ZEN_GLUE 0x10000 - -extern const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208; - -int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode); - -#endif /* MBFILTER_TL_KANA_JISX0201_JISX0208_H */ diff --git a/ext/mbstring/libmbfl/filters/translit_kana_jisx0201_jisx0208.h b/ext/mbstring/libmbfl/filters/translit_kana_jisx0201_jisx0208.h index b30efce5d7e..545333928b7 100644 --- a/ext/mbstring/libmbfl/filters/translit_kana_jisx0201_jisx0208.h +++ b/ext/mbstring/libmbfl/filters/translit_kana_jisx0201_jisx0208.h @@ -25,6 +25,28 @@ #ifndef TRANSLIT_KANA_JISX0201_JISX0208_H #define TRANSLIT_KANA_JISX0201_JISX0208_H +/* "Zen" is 全, or "full"; "Han" is 半, or "half" + * This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */ 
+#define MBFL_HAN2ZEN_ALL 0x00001 +#define MBFL_HAN2ZEN_ALPHA 0x00002 +#define MBFL_HAN2ZEN_NUMERIC 0x00004 +#define MBFL_HAN2ZEN_SPACE 0x00008 +#define MBFL_HAN2ZEN_KATAKANA 0x00010 +#define MBFL_HAN2ZEN_HIRAGANA 0x00020 +#define MBFL_HAN2ZEN_SPECIAL 0x00040 +#define MBFL_ZENKAKU_HIRA2KATA 0x00080 + +#define MBFL_ZEN2HAN_ALL 0x00100 +#define MBFL_ZEN2HAN_ALPHA 0x00200 +#define MBFL_ZEN2HAN_NUMERIC 0x00400 +#define MBFL_ZEN2HAN_SPACE 0x00800 +#define MBFL_ZEN2HAN_KATAKANA 0x01000 +#define MBFL_ZEN2HAN_HIRAGANA 0x02000 +#define MBFL_ZEN2HAN_SPECIAL 0x04000 +#define MBFL_ZENKAKU_KATA2HIRA 0x08000 + +#define MBFL_HAN2ZEN_GLUE 0x10000 + static const unsigned char hankana2zenkana_table[64] = { 0x00,0x02,0x0C,0x0D,0x01,0xFB,0xF2,0xA1,0xA3,0xA5, 0xA7,0xA9,0xE3,0xE5,0xE7,0xC3,0xFC,0xA2,0xA4,0xA6, diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 8d94c52f5d1..c2d7f3a4227 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -91,7 +91,6 @@ #include "filters/mbfilter_base64.h" #include "filters/mbfilter_qprint.h" #include "filters/mbfilter_singlebyte.h" -#include "filters/mbfilter_tl_jisx0201_jisx0208.h" #include "filters/mbfilter_utf8.h" #include "eaw_table.h" @@ -1391,86 +1390,6 @@ mbfl_strimwidth( return result; } -mbfl_string * -mbfl_ja_jp_hantozen( - mbfl_string *string, - mbfl_string *result, - int mode) -{ - size_t n; - unsigned char *p; - mbfl_memory_device device; - mbfl_convert_filter *decoder = NULL; - mbfl_convert_filter *encoder = NULL; - mbfl_convert_filter *tl_filter = NULL; - mbfl_convert_filter *next_filter = NULL; - - mbfl_memory_device_init(&device, string->len, 0); - mbfl_string_init(result); - - result->encoding = string->encoding; - - decoder = mbfl_convert_filter_new( - &mbfl_encoding_wchar, - string->encoding, - mbfl_memory_device_output, 0, &device); - if (decoder == NULL) { - goto out; - } - next_filter = decoder; - - tl_filter = mbfl_convert_filter_new2( - 
&vtbl_tl_jisx0201_jisx0208, - (int(*)(int, void*))next_filter->filter_function, - (flush_function_t)next_filter->filter_flush, - next_filter); - if (tl_filter == NULL) { - goto out; - } - - tl_filter->opaque = (void*)((intptr_t)mode); - next_filter = tl_filter; - - encoder = mbfl_convert_filter_new( - string->encoding, - &mbfl_encoding_wchar, - (int(*)(int, void*))next_filter->filter_function, - (flush_function_t)next_filter->filter_flush, - next_filter); - if (encoder == NULL) { - goto out; - } - - /* feed data */ - p = string->val; - n = string->len; - if (p != NULL) { - while (n > 0) { - if ((*encoder->filter_function)(*p++, encoder) < 0) { - break; - } - n--; - } - } - - mbfl_convert_filter_flush(encoder); - result = mbfl_memory_device_result(&device, result); -out: - if (tl_filter != NULL) { - mbfl_convert_filter_delete(tl_filter); - } - - if (decoder != NULL) { - mbfl_convert_filter_delete(decoder); - } - - if (encoder != NULL) { - mbfl_convert_filter_delete(encoder); - } - - return result; -} - /* * MIME header encode diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.h b/ext/mbstring/libmbfl/mbfl/mbfilter.h index cc1a573efaa..e0511ba8722 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.h +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.h @@ -288,10 +288,4 @@ mbfl_mime_header_decode( mbfl_string *result, const mbfl_encoding *outcode); -/* - * convert of halfwidth and fullwidth for japanese - */ -MBFLAPI extern mbfl_string * -mbfl_ja_jp_hantozen(mbfl_string *string, mbfl_string *result, int mode); - #endif /* MBFL_MBFILTER_H */ diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 022f6958099..53525b7fb2d 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -40,8 +40,8 @@ #include "libmbfl/filters/mbfilter_uuencode.h" #include "libmbfl/filters/mbfilter_ucs4.h" #include "libmbfl/filters/mbfilter_utf8.h" -#include "libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h" #include "libmbfl/filters/mbfilter_singlebyte.h" +#include 
"libmbfl/filters/translit_kana_jisx0201_jisx0208.h" #include "php_variables.h" #include "php_globals.h" @@ -2838,37 +2838,291 @@ PHP_FUNCTION(mb_decode_mimeheader) } /* }}} */ +/* Apply various transforms to input codepoint, such as converting halfwidth katakana + * to fullwidth katakana. `mode` is a bitfield which controls which transforms are + * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h. + * `mode` must not call for transforms which are inverses (i.e. which would cancel + * each other out). + * + * In some cases, successive input codepoints may be merged into one output codepoint. + * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed + * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed` + * will not be modified. If there is no following codepoint, `next` should be zero. + * + * Again, in some cases, one input codepoint may convert to two output codepoints. + * If so, the second output codepoint will be stored in `*second`. + * + * Return the resulting codepoint. If none of the requested transforms apply, return + * the input codepoint unchanged. 
+ */ +uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode) +{ + if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') { + return c + 0xFEE0; + } + if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) { + return c + 0xFEE0; + } + if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') { + return c + 0xFEE0; + } + if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') { + return 0x3000; + } + + if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) { + /* Convert Hankaku kana to Zenkaku kana + * Either all Hankaku kana (including katakana and hiragana) will be converted + * to Zenkaku katakana, or to Zenkaku hiragana */ + if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) { + if (c >= 0xFF61 && c <= 0xFF9F) { + int n = c - 0xFF60; + + if (next >= 0xFF61 && next <= 0xFF9F) { + if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) { + *consumed = true; + return 0x3001 + hankana2zenkana_table[n]; + } + if (next == 0xFF9E && n == 19) { + *consumed = true; + return 0x30F4; + } + if (next == 0xFF9F && n >= 42 && n <= 46) { + *consumed = true; + return 0x3002 + hankana2zenkana_table[n]; + } + } + + return 0x3000 + hankana2zenkana_table[n]; + } + } + if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) { + if (c >= 0xFF61 && c <= 0xFF9F) { + int n = c - 0xFF60; + + if (next >= 0xFF61 && next <= 0xFF9F) { + if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) { + *consumed = true; + return 0x3001 + hankana2zenhira_table[n]; + } + if (next == 0xFF9F && n >= 42 && n <= 46) { + *consumed = true; + return 0x3002 + hankana2zenhira_table[n]; + } + } + + return 0x3000 + hankana2zenhira_table[n]; + } + } + if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) { + return 0x3000 + hankana2zenkana_table[c - 0xFF60]; + } + if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 
0xFF9F) { + return 0x3000 + hankana2zenhira_table[c - 0xFF60]; + } + } + + if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */ + if (c == '\\' || c == 0xA5) { /* YEN SIGN */ + return 0xFFE5; /* FULLWIDTH YEN SIGN */ + } + if (c == 0x7E || c == 0x203E) { + return 0xFFE3; /* FULLWIDTH MACRON */ + } + if (c == '\'') { + return 0x2019; /* RIGHT SINGLE QUOTATION MARK */ + } + if (c == '"') { + return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */ + } + } + + if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) { + /* Zenkaku to Hankaku */ + if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) { + /* all except " ' \ ~ */ + return c - 0xFEE0; + } + if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) { + return c - 0xFEE0; + } + if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) { + return c - 0xFEE0; + } + if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) { + return ' '; + } + if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */ + return '-'; + } + } + + if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) { + /* Zenkaku kana to hankaku kana */ + if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) { + /* Zenkaku katakana to hankaku kana */ + int n = c - 0x30A1; + if (zenkana2hankana_table[n][1]) { + *second = 0xFF00 + zenkana2hankana_table[n][1]; + } + return 0xFF00 + zenkana2hankana_table[n][0]; + } + if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) { + /* Zenkaku hiragana to hankaku kana */ + int n = c - 0x3041; + if (zenkana2hankana_table[n][1]) { + *second = 0xFF00 + zenkana2hankana_table[n][1]; + } + return 0xFF00 + zenkana2hankana_table[n][0]; + } + if (c == 0x3001) { + return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */ + } + if (c == 0x3002) { + return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */ + } + if (c == 0x300C) { + return 0xFF62; /* HALFWIDTH 
LEFT CORNER BRACKET */ + } + if (c == 0x300D) { + return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */ + } + if (c == 0x309B) { + return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */ + } + if (c == 0x309C) { + return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ + } + if (c == 0x30FC) { + return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + } + if (c == 0x30FB) { + return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */ + } + } + + if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) { + if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) { + /* Zenkaku hiragana to Zenkaku katakana */ + return c + 0x60; + } + if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) { + /* Zenkaku katakana to Zenkaku hiragana */ + return c - 0x60; + } + } + + if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */ + if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */ + return '\\'; + } + if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */ + return '~'; + } + if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/ + return '\''; + } + if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */ + return '"'; + } + } + + return c; +} + +static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode) +{ + /* Each wchar may potentially expand to 2 when we perform kana conversion... 
+ * if we are converting zenkaku kana to hankaku kana + * Make the buffer for converted kana big enough that we never need to + * perform bounds checks */ + uint32_t wchar_buf[64], converted_buf[64 * 2]; + unsigned int buf_offset = 0; + unsigned int state = 0; + unsigned char *in = (unsigned char*)ZSTR_VAL(input); + size_t in_len = ZSTR_LEN(input); + + mb_convert_buf buf; + mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode)); + + while (in_len) { + uint32_t *converted = converted_buf; + /* If one codepoint has been left in wchar_buf[0] to be reprocessed from the + * previous iteration, don't overwrite it */ + size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state); + out_len += buf_offset; + ZEND_ASSERT(out_len <= 64); + + if (!out_len) { + goto emit_converted_kana; /* nothing decoded, but from_wchar must still be called so it sees the `!in_len` end flag */ + } + + for (size_t i = 0; i < out_len-1; i++) { + uint32_t second = 0; + bool consumed = false; + *converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode); + if (second) { + *converted++ = second; + } + if (consumed) { + i++; + if (i == out_len-1) { + /* We consumed two codepoints at the very end of the wchar buffer + * So there is nothing remaining to reprocess on the next iteration */ + buf_offset = 0; + goto emit_converted_kana; + } + } + } + + if (!in_len) { + /* This is the last iteration, so we need to process the final codepoint now */ + uint32_t second = 0; + *converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode); + if (second) { + *converted++ = second; + } + } else { + /* Reprocess the last codepoint on the next iteration */ + wchar_buf[0] = wchar_buf[out_len-1]; + buf_offset = 1; + } + +emit_converted_kana: + encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len); + } + + return mb_convert_buf_result(&buf); +} + char mb_convert_kana_flags[17] = { 'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C', 'a', 'r', 'n', 's', 'k', 'h', 
'm', 'c', 'V' }; -/* {{{ Conversion between full-width character and half-width character (Japanese) */ +/* Conversion between full-width characters and half-width characters (Japanese) */ PHP_FUNCTION(mb_convert_kana) { - int opt; - mbfl_string string, result, *ret; - char *optstr = NULL, *string_val; + unsigned int opt; + char *optstr = NULL; size_t optstr_len; - zend_string *encname = NULL; + zend_string *encname = NULL, *str; ZEND_PARSE_PARAMETERS_START(1, 3) - Z_PARAM_STRING(string_val, string.len) + Z_PARAM_STR(str) Z_PARAM_OPTIONAL Z_PARAM_STRING(optstr, optstr_len) Z_PARAM_STR_OR_NULL(encname) ZEND_PARSE_PARAMETERS_END(); - string.val = (unsigned char*)string_val; - if (optstr != NULL) { char *p = optstr, *e = p + optstr_len; opt = 0; next_option: while (p < e) { /* Walk through option string and convert to bit vector - * See mbfilter_tl_jisx0201_jisx0208.h for the values used */ + * See translit_kana_jisx0201_jisx0208.h for the values used */ char c = *p++; if (c == 'A') { opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC; @@ -2936,19 +3190,13 @@ next_option: opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE; } - /* encoding */ - string.encoding = php_mb_get_encoding(encname, 3); - if (!string.encoding) { + const mbfl_encoding *enc = php_mb_get_encoding(encname, 3); + if (!enc) { RETURN_THROWS(); } - ret = mbfl_ja_jp_hantozen(&string, &result, opt); - ZEND_ASSERT(ret != NULL); - // TODO: avoid reallocation ??? 
- RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */ - efree(ret->val); + RETVAL_STR(jp_kana_convert(str, enc, opt)); } -/* }}} */ static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */ { diff --git a/ext/mbstring/tests/mb_convert_kana.phpt b/ext/mbstring/tests/mb_convert_kana.phpt index 99c17d68d5a..69ea5560d48 100644 --- a/ext/mbstring/tests/mb_convert_kana.phpt +++ b/ext/mbstring/tests/mb_convert_kana.phpt @@ -132,6 +132,17 @@ try { echo $e->getMessage() . "\n"; } +// Regression test: Two codepoints collapsed into one, just one position +// before the end of the string +$converted = mb_convert_kana("\xb9\xde\xde", 'HV', 'JIS'); +if ($converted !== "\x1b\$B\$2!+\x1b(B") + echo "Failed! Expected " . bin2hex("\x1b\$B\$2!+\x1b(B") . ", got: " . bin2hex($converted) . "\n"; + +// Regression test: the old implementation of mb_convert_kana would swallow +// zero bytes in some cases +if (mb_convert_kana("abc\x00abc", 'c', 'ASCII') !== "abc\x00abc") + echo "mb_convert_kana is swallowing zero bytes!\n"; + ?> --EXPECT-- 'A': ァアィイゥウェエォオカガキギク => ァアィイゥウェエォオカガキギク