New implementation of mb_convert_kana

mb_convert_kana now uses the new text encoding conversion
filters. Microbenchmarking shows speed gains of 50%-150%
across various text encodings and input string lengths.

The behavior is the same as the old mb_convert_kana
except for one fix: if the 'zero codepoint' U+0000 appeared
in the input, the old implementation would sometimes drop
it, not passing it through to the output. This is now
fixed.
This commit is contained in:
Alex Dowad 2022-07-12 21:00:35 +02:00
parent 840423dffa
commit 9ac49c0dd3
10 changed files with 308 additions and 421 deletions

View File

@ -118,7 +118,6 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [
libmbfl/filters/mbfilter_sjis_mobile.c
libmbfl/filters/mbfilter_sjis_mac.c
libmbfl/filters/mbfilter_sjis_2004.c
libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c
libmbfl/filters/mbfilter_ucs2.c
libmbfl/filters/mbfilter_ucs4.c
libmbfl/filters/mbfilter_uhc.c

View File

@ -28,8 +28,7 @@ if (PHP_MBSTRING != "no") {
mbfilter_utf8_mobile.c mbfilter_uuencode.c \
mbfilter_cp5022x.c mbfilter_sjis_mobile.c \
mbfilter_sjis_mac.c \
mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c \
mbfilter_tl_jisx0201_jisx0208.c", "mbstring");
mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c", "mbstring");
ADD_SOURCES("ext/mbstring/libmbfl/mbfl", "mbfilter.c mbfilter_8bit.c \
mbfilter_pass.c mbfilter_wchar.c mbfl_convert.c mbfl_encoding.c \

View File

@ -25,11 +25,11 @@
#include "mbfilter.h"
#include "mbfilter_cp5022x.h"
#include "mbfilter_jis.h"
#include "mbfilter_tl_jisx0201_jisx0208.h"
#include "unicode_table_cp932_ext.h"
#include "unicode_table_jis.h"
#include "cp932_table.h"
#include "translit_kana_jisx0201_jisx0208.h"
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter);
@ -40,6 +40,9 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
/* Defined in mbstring.c. This declaration must match that definition exactly:
 * `mode` is an `unsigned int` bitfield of MBFL_* flags there, and declaring it
 * as plain `int` here would make the two declarations incompatible. */
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode);
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
* This was just CP50220, but the implementation was less strict regarding
* invalid characters; it would silently pass some through
@ -336,7 +339,7 @@ static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
bool consumed = false;
if (filter->cache) {
int s = mbfl_convert_kana(filter->cache, c, &consumed, NULL, mode);
int s = mb_convert_kana_codepoint(filter->cache, c, &consumed, NULL, mode);
filter->cache = consumed ? 0 : c;
/* Terrible hack to get CP50220 to emit error markers in the proper
* position, not reordering them with subsequent characters */
@ -359,7 +362,7 @@ static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
if (filter->cache) {
int s = mbfl_convert_kana(filter->cache, 0, NULL, NULL, mode);
int s = mb_convert_kana_codepoint(filter->cache, 0, NULL, NULL, mode);
mbfl_filt_conv_wchar_cp50221(s, filter);
filter->cache = 0;
}
@ -866,7 +869,7 @@ reprocess_codepoint:
buf->state |= w << 8;
break;
} else {
w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
}
if (consumed) {

View File

@ -1,252 +0,0 @@
/*
* "streamable kanji code filter and converter"
* Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
*
* LICENSE NOTICES
*
* This file is part of "streamable kanji code filter and converter",
* which is distributed under the terms of GNU Lesser General Public
* License (version 2) as published by the Free Software Foundation.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with "streamable kanji code filter and converter";
* if not, write to the Free Software Foundation, Inc., 59 Temple Place,
* Suite 330, Boston, MA 02111-1307 USA
*
* The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
*
*/
#include <stdint.h>
#include <stdbool.h>
#include "mbfilter_tl_jisx0201_jisx0208.h"
#include "translit_kana_jisx0201_jisx0208.h"
/* Transliterate a single codepoint between halfwidth ("hankaku") and fullwidth
 * ("zenkaku") forms, or between hiragana and katakana.
 *
 *   c        - codepoint to convert
 *   next     - codepoint following `c` in the input, or zero if there is none
 *   consumed - set to true when `next` was merged into the returned codepoint
 *              (only happens with MBFL_HAN2ZEN_GLUE); otherwise left untouched
 *   second   - receives a second output codepoint when one fullwidth kana
 *              expands to two halfwidth codepoints; otherwise left untouched
 *   mode     - bitfield of MBFL_* flags from mbfilter_tl_jisx0201_jisx0208.h;
 *              it must not request transforms which are inverses of each other
 *
 * The transforms are tried in a fixed order and the first match wins. If no
 * requested transform applies, `c` is returned unchanged.
 */
int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
{
	/* Halfwidth ASCII -> fullwidth forms, which sit 0xFEE0 above ASCII */
	if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7d && c != '"' && c != '\'' && c != '\\') {
		return c + 0xfee0;
	}
	if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
		return c + 0xfee0;
	}
	if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
		return c + 0xfee0;
	}
	if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
		return 0x3000;
	}

	/* Halfwidth kana -> fullwidth katakana or hiragana.
	 * When both target flags are set, katakana takes precedence. */
	if ((mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) && c >= 0xff61 && c <= 0xff9f) {
		int idx = c - 0xff60;
		bool want_katakana = (mode & MBFL_HAN2ZEN_KATAKANA) != 0;
		int offset = want_katakana ? hankana2zenkana_table[idx] : hankana2zenhira_table[idx];

		if (mode & MBFL_HAN2ZEN_GLUE) {
			/* Try to merge a following halfwidth (semi-)voiced sound mark
			 * (U+FF9E / U+FF9F) into the kana */
			bool takes_both_marks = idx >= 42 && idx <= 46;

			if (next == 0xff9e && ((idx >= 22 && idx <= 36) || takes_both_marks)) {
				*consumed = true;
				return 0x3001 + offset;
			}
			if (next == 0xff9e && want_katakana && idx == 19) {
				/* Only the katakana target has a merged form for this one */
				*consumed = true;
				return 0x30f4;
			}
			if (next == 0xff9f && takes_both_marks) {
				*consumed = true;
				return 0x3002 + offset;
			}
		}
		return 0x3000 + offset;
	}

	/* Special ASCII -> symbol */
	if (mode & MBFL_HAN2ZEN_SPECIAL) {
		switch (c) {
		case 0x5c:
		case 0xa5:   /* YEN SIGN */
			return 0xffe5; /* FULLWIDTH YEN SIGN */
		case 0x7e:
		case 0x203e: /* OVERLINE */
			return 0xffe3; /* FULLWIDTH MACRON */
		case 0x27:
			return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
		case 0x22:
			return 0x201d; /* RIGHT DOUBLE QUOTATION MARK */
		default:
			break;
		}
	}

	/* Fullwidth forms -> ASCII; FULLWIDTH " ' \ are deliberately left alone */
	if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xff01 && c <= 0xff5d && c != 0xff02 && c != 0xff07 && c != 0xff3c) {
		return c - 0xfee0;
	}
	if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xff21 && c <= 0xff3a) || (c >= 0xff41 && c <= 0xff5a))) {
		return c - 0xfee0;
	}
	if ((mode & MBFL_ZEN2HAN_NUMERIC) && c >= 0xff10 && c <= 0xff19) {
		return c - 0xfee0;
	}
	if ((mode & MBFL_ZEN2HAN_SPACE) && c == 0x3000) {
		return 0x20;
	}
	if ((mode & MBFL_ZEN2HAN_ALL) && c == 0x2212) { /* MINUS SIGN */
		return 0x2d;
	}

	if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
		/* Fullwidth kana -> halfwidth kana; both scripts share one table,
		 * indexed from the start of their respective Unicode ranges */
		int idx = -1;
		if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30a1 && c <= 0x30f4) {
			idx = c - 0x30a1;
		} else if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
			idx = c - 0x3041;
		}
		if (idx >= 0) {
			if (zenkana2hankana_table[idx][1]) {
				*second = 0xff00 + zenkana2hankana_table[idx][1];
			}
			return 0xff00 + zenkana2hankana_table[idx][0];
		}

		/* Punctuation and sound marks are converted under either flag */
		switch (c) {
		case 0x3001: return 0xff64; /* HALFWIDTH IDEOGRAPHIC COMMA */
		case 0x3002: return 0xff61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
		case 0x300c: return 0xff62; /* HALFWIDTH LEFT CORNER BRACKET */
		case 0x300d: return 0xff63; /* HALFWIDTH RIGHT CORNER BRACKET */
		case 0x309b: return 0xff9e; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
		case 0x309c: return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
		case 0x30fc: return 0xff70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
		case 0x30fb: return 0xff65; /* HALFWIDTH KATAKANA MIDDLE DOT */
		default:     break;
		}
	} else if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
		/* Only considered when no fullwidth->halfwidth kana flag is set.
		 * The hiragana and katakana blocks are 0x60 apart. */
		if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309d || c == 0x309e)) {
			return c + 0x60;
		}
		if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30a1 && c <= 0x30f3) || c == 0x30fd || c == 0x30fe)) {
			return c - 0x60;
		}
	}

	/* Special symbol -> ASCII */
	if (mode & MBFL_ZEN2HAN_SPECIAL) {
		switch (c) {
		case 0xffe5: /* FULLWIDTH YEN SIGN */
		case 0xff3c: /* FULLWIDTH REVERSE SOLIDUS */
			return 0x5c;
		case 0xffe3: /* FULLWIDTH MACRON */
		case 0x203e: /* OVERLINE */
			return 0x7e;
		case 0x2018: /* LEFT SINGLE QUOTATION MARK */
		case 0x2019: /* RIGHT SINGLE QUOTATION MARK */
			return 0x27;
		case 0x201c: /* LEFT DOUBLE QUOTATION MARK */
		case 0x201d: /* RIGHT DOUBLE QUOTATION MARK */
			return 0x22;
		default:
			break;
		}
	}

	return c;
}
/* Per-codepoint callback of the kana transliteration filter.
 *
 * One codepoint is held back in `filt->cache` so that mbfl_convert_kana can see
 * the codepoint which follows it (needed for merging sound marks). When a new
 * codepoint `c` arrives, the cached one is converted and passed to the output
 * function, and `c` takes its place in the cache unless it was merged.
 * The conversion mode is read from `filt->opaque`. Always returns 0.
 */
int mbfl_filt_tl_jisx0201_jisx0208(int c, mbfl_convert_filter *filt)
{
	int mode = (intptr_t)filt->opaque, second = 0;
	bool consumed = false;

	if (filt->cache) {
		int s = mbfl_convert_kana(filt->cache, c, &consumed, &second, mode);
		filt->cache = consumed ? 0 : c;
		(*filt->output_function)(s, filt->data);
		if (second) {
			(*filt->output_function)(second, filt->data);
		}
		if (c == 0 && !consumed) {
			/* `filt->cache == 0` means "nothing cached", so a zero codepoint
			 * cannot wait in the cache for the next call; without emitting it
			 * here it would silently vanish from the output */
			(*filt->output_function)(0, filt->data);
		}
	} else if (c == 0) {
		/* This case has to be handled separately, since `filt->cache == 0` means no
		 * codepoint is cached */
		(*filt->output_function)(0, filt->data);
	} else {
		filt->cache = c;
	}

	return 0;
}
/* Flush callback of the kana transliteration filter.
 *
 * Converts and emits the codepoint still held in `filt->cache` (if any), with
 * "no following codepoint" passed to mbfl_convert_kana, then chains to the
 * downstream flush function when one is set. Returns that function's result,
 * or 0 when there is none.
 */
int mbfl_filt_tl_jisx0201_jisx0208_flush(mbfl_convert_filter *filt)
{
	int mode = (intptr_t)filt->opaque;

	if (filt->cache != 0) {
		int trailing = 0;
		int converted = mbfl_convert_kana(filt->cache, 0, NULL, &trailing, mode);

		(*filt->output_function)(converted, filt->data);
		if (trailing != 0) {
			(*filt->output_function)(trailing, filt->data);
		}
		filt->cache = 0;
	}

	if (!filt->flush_function) {
		return 0;
	}
	return (*filt->flush_function)(filt->data);
}
/* Conversion-filter vtable for the kana transliteration filter.
 * The initializer is positional; the slot order comes from
 * `struct mbfl_convert_vtbl`, which is declared elsewhere.
 * NOTE(review): the slot roles noted below are inferred from the initializer
 * names only - confirm against the struct declaration. */
const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208 = {
mbfl_no_encoding_wchar, /* presumably the source encoding */
mbfl_no_encoding_wchar, /* presumably the destination encoding */
mbfl_filt_conv_common_ctor, /* presumably the constructor */
NULL, /* presumably the destructor (none) */
mbfl_filt_tl_jisx0201_jisx0208, /* per-codepoint filter function defined in this file */
mbfl_filt_tl_jisx0201_jisx0208_flush, /* flush function defined in this file */
NULL, /* last slot unused; its role is not visible here */
};

View File

@ -1,56 +0,0 @@
/*
* "streamable kanji code filter and converter"
* Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
*
* LICENSE NOTICES
*
* This file is part of "streamable kanji code filter and converter",
* which is distributed under the terms of GNU Lesser General Public
* License (version 2) as published by the Free Software Foundation.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with "streamable kanji code filter and converter";
* if not, write to the Free Software Foundation, Inc., 59 Temple Place,
* Suite 330, Boston, MA 02111-1307 USA
*
* The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
*
*/
#ifndef MBFILTER_TL_KANA_JISX0201_JISX0208_H
#define MBFILTER_TL_KANA_JISX0201_JISX0208_H
#include "mbfl_convert.h"
/* "Zen" is 全, or "full"; "Han" is 半, or "half"
 * This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese
 *
 * The values below are bit flags which are OR'ed together to form the `mode`
 * argument of mbfl_convert_kana. The per-flag notes describe what that
 * function does when the flag is set. */
#define MBFL_HAN2ZEN_ALL 0x00001 /* ASCII 0x21-0x7D, except double quote, apostrophe and backslash, to fullwidth */
#define MBFL_HAN2ZEN_ALPHA 0x00002 /* ASCII letters A-Z and a-z to fullwidth */
#define MBFL_HAN2ZEN_NUMERIC 0x00004 /* ASCII digits 0-9 to fullwidth */
#define MBFL_HAN2ZEN_SPACE 0x00008 /* ASCII space to U+3000 */
#define MBFL_HAN2ZEN_KATAKANA 0x00010 /* halfwidth kana U+FF61-U+FF9F to fullwidth katakana */
#define MBFL_HAN2ZEN_HIRAGANA 0x00020 /* halfwidth kana U+FF61-U+FF9F to fullwidth hiragana */
#define MBFL_HAN2ZEN_SPECIAL 0x00040 /* backslash, yen sign, tilde, overline and ASCII quotes to symbol codepoints */
#define MBFL_ZENKAKU_HIRA2KATA 0x00080 /* fullwidth hiragana to fullwidth katakana */
#define MBFL_ZEN2HAN_ALL 0x00100 /* fullwidth U+FF01-U+FF5D (except U+FF02, U+FF07, U+FF3C) to ASCII; U+2212 to hyphen-minus */
#define MBFL_ZEN2HAN_ALPHA 0x00200 /* fullwidth letters to ASCII */
#define MBFL_ZEN2HAN_NUMERIC 0x00400 /* fullwidth digits to ASCII */
#define MBFL_ZEN2HAN_SPACE 0x00800 /* U+3000 to ASCII space */
#define MBFL_ZEN2HAN_KATAKANA 0x01000 /* fullwidth katakana U+30A1-U+30F4 to halfwidth kana (may yield two codepoints) */
#define MBFL_ZEN2HAN_HIRAGANA 0x02000 /* fullwidth hiragana U+3041-U+3093 to halfwidth kana (may yield two codepoints) */
#define MBFL_ZEN2HAN_SPECIAL 0x04000 /* fullwidth yen sign/backslash, macron/overline and curly quotes to ASCII */
#define MBFL_ZENKAKU_KATA2HIRA 0x08000 /* fullwidth katakana to fullwidth hiragana */
#define MBFL_HAN2ZEN_GLUE 0x10000 /* with HAN2ZEN_KATAKANA/HIRAGANA: merge a kana and a following U+FF9E/U+FF9F sound mark into one codepoint */
/* Conversion filter vtable built around mbfl_convert_kana */
extern const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208;
/* Converts codepoint `c` according to `mode`. `next` is the following codepoint
 * (zero if none), `*consumed` is set to true when `next` was merged into the
 * result, and `*second` receives an extra output codepoint when there is one. */
int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode);
#endif /* MBFILTER_TL_KANA_JISX0201_JISX0208_H */

View File

@ -25,6 +25,28 @@
#ifndef TRANSLIT_KANA_JISX0201_JISX0208_H
#define TRANSLIT_KANA_JISX0201_JISX0208_H
/* "Zen" is 全, or "full"; "Han" is 半, or "half"
* This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */
#define MBFL_HAN2ZEN_ALL 0x00001
#define MBFL_HAN2ZEN_ALPHA 0x00002
#define MBFL_HAN2ZEN_NUMERIC 0x00004
#define MBFL_HAN2ZEN_SPACE 0x00008
#define MBFL_HAN2ZEN_KATAKANA 0x00010
#define MBFL_HAN2ZEN_HIRAGANA 0x00020
#define MBFL_HAN2ZEN_SPECIAL 0x00040
#define MBFL_ZENKAKU_HIRA2KATA 0x00080
#define MBFL_ZEN2HAN_ALL 0x00100
#define MBFL_ZEN2HAN_ALPHA 0x00200
#define MBFL_ZEN2HAN_NUMERIC 0x00400
#define MBFL_ZEN2HAN_SPACE 0x00800
#define MBFL_ZEN2HAN_KATAKANA 0x01000
#define MBFL_ZEN2HAN_HIRAGANA 0x02000
#define MBFL_ZEN2HAN_SPECIAL 0x04000
#define MBFL_ZENKAKU_KATA2HIRA 0x08000
#define MBFL_HAN2ZEN_GLUE 0x10000
static const unsigned char hankana2zenkana_table[64] = {
0x00,0x02,0x0C,0x0D,0x01,0xFB,0xF2,0xA1,0xA3,0xA5,
0xA7,0xA9,0xE3,0xE5,0xE7,0xC3,0xFC,0xA2,0xA4,0xA6,

View File

@ -91,7 +91,6 @@
#include "filters/mbfilter_base64.h"
#include "filters/mbfilter_qprint.h"
#include "filters/mbfilter_singlebyte.h"
#include "filters/mbfilter_tl_jisx0201_jisx0208.h"
#include "filters/mbfilter_utf8.h"
#include "eaw_table.h"
@ -1391,86 +1390,6 @@ mbfl_strimwidth(
return result;
}
/* Apply halfwidth/fullwidth kana conversion to `string`.
 *
 * The input bytes are pushed through a chain of three filters:
 *   string->encoding -> wchar -> (kana transliteration) -> string->encoding
 * and the output is collected in a memory device. `mode` is the MBFL_* flag
 * bitfield handed to the transliteration filter via its `opaque` pointer.
 *
 * `result` is initialized, tagged with the input's encoding and, when all
 * filters could be created, replaced by the return value of
 * mbfl_memory_device_result. That value (or the initialized `result` on
 * failure) is returned.
 */
mbfl_string *
mbfl_ja_jp_hantozen(
	mbfl_string *string,
	mbfl_string *result,
	int mode)
{
	mbfl_memory_device device;
	mbfl_convert_filter *to_encoding = NULL; /* wchar -> string->encoding, writes to `device` */
	mbfl_convert_filter *kana_filter = NULL; /* wchar -> wchar transliteration */
	mbfl_convert_filter *to_wchar = NULL;    /* string->encoding -> wchar, fed with input bytes */

	mbfl_memory_device_init(&device, string->len, 0);
	mbfl_string_init(result);
	result->encoding = string->encoding;

	/* The chain is built back to front: every new filter outputs into the
	 * filter created just before it */
	to_encoding = mbfl_convert_filter_new(
		&mbfl_encoding_wchar,
		string->encoding,
		mbfl_memory_device_output, 0, &device);

	if (to_encoding != NULL) {
		kana_filter = mbfl_convert_filter_new2(
			&vtbl_tl_jisx0201_jisx0208,
			(int(*)(int, void*))to_encoding->filter_function,
			(flush_function_t)to_encoding->filter_flush,
			to_encoding);
	}

	if (kana_filter != NULL) {
		kana_filter->opaque = (void*)((intptr_t)mode);
		to_wchar = mbfl_convert_filter_new(
			string->encoding,
			&mbfl_encoding_wchar,
			(int(*)(int, void*))kana_filter->filter_function,
			(flush_function_t)kana_filter->filter_flush,
			kana_filter);
	}

	if (to_wchar != NULL) {
		unsigned char *cursor = string->val;

		/* Feed the input one byte at a time; stop early if a filter reports an error */
		if (cursor != NULL) {
			for (size_t remaining = string->len; remaining > 0; remaining--) {
				if ((*to_wchar->filter_function)(*cursor++, to_wchar) < 0) {
					break;
				}
			}
		}

		mbfl_convert_filter_flush(to_wchar);
		result = mbfl_memory_device_result(&device, result);
	}

	/* Release whichever filters were created */
	if (kana_filter != NULL) {
		mbfl_convert_filter_delete(kana_filter);
	}
	if (to_encoding != NULL) {
		mbfl_convert_filter_delete(to_encoding);
	}
	if (to_wchar != NULL) {
		mbfl_convert_filter_delete(to_wchar);
	}

	return result;
}
/*
* MIME header encode

View File

@ -288,10 +288,4 @@ mbfl_mime_header_decode(
mbfl_string *result,
const mbfl_encoding *outcode);
/*
* convert of halfwidth and fullwidth for japanese
*/
MBFLAPI extern mbfl_string *
mbfl_ja_jp_hantozen(mbfl_string *string, mbfl_string *result, int mode);
#endif /* MBFL_MBFILTER_H */

View File

@ -40,8 +40,8 @@
#include "libmbfl/filters/mbfilter_uuencode.h"
#include "libmbfl/filters/mbfilter_ucs4.h"
#include "libmbfl/filters/mbfilter_utf8.h"
#include "libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h"
#include "libmbfl/filters/mbfilter_singlebyte.h"
#include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
#include "php_variables.h"
#include "php_globals.h"
@ -2838,37 +2838,291 @@ PHP_FUNCTION(mb_decode_mimeheader)
}
/* }}} */
/* Transliterate a single codepoint between halfwidth ("hankaku") and fullwidth
 * ("zenkaku") forms, or between hiragana and katakana.
 *
 *   c        - codepoint to convert
 *   next     - codepoint following `c` in the input, or zero if there is none
 *   consumed - set to true when `next` was merged into the returned codepoint
 *              (only happens with MBFL_HAN2ZEN_GLUE); otherwise left untouched
 *   second   - receives a second output codepoint when one fullwidth kana
 *              expands to two halfwidth codepoints; otherwise left untouched
 *   mode     - bitfield of MBFL_* flags from translit_kana_jisx0201_jisx0208.h;
 *              it must not request transforms which are inverses of each other
 *
 * The transforms are tried in a fixed order and the first match wins. If no
 * requested transform applies, `c` is returned unchanged.
 */
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
{
	/* Halfwidth ASCII -> fullwidth forms, which sit 0xFEE0 above ASCII */
	if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
		return c + 0xFEE0;
	}
	if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
		return c + 0xFEE0;
	}
	if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
		return c + 0xFEE0;
	}
	if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
		return 0x3000;
	}

	/* Halfwidth kana -> fullwidth katakana or hiragana.
	 * When both target flags are set, katakana takes precedence. */
	if ((mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) && c >= 0xFF61 && c <= 0xFF9F) {
		int idx = c - 0xFF60;
		bool want_katakana = (mode & MBFL_HAN2ZEN_KATAKANA) != 0;
		int offset = want_katakana ? hankana2zenkana_table[idx] : hankana2zenhira_table[idx];

		if (mode & MBFL_HAN2ZEN_GLUE) {
			/* Try to merge a following halfwidth (semi-)voiced sound mark
			 * (U+FF9E / U+FF9F) into the kana */
			bool takes_both_marks = idx >= 42 && idx <= 46;

			if (next == 0xFF9E && ((idx >= 22 && idx <= 36) || takes_both_marks)) {
				*consumed = true;
				return 0x3001 + offset;
			}
			if (next == 0xFF9E && want_katakana && idx == 19) {
				/* Only the katakana target has a merged form for this one */
				*consumed = true;
				return 0x30F4;
			}
			if (next == 0xFF9F && takes_both_marks) {
				*consumed = true;
				return 0x3002 + offset;
			}
		}
		return 0x3000 + offset;
	}

	/* Special ASCII -> symbol */
	if (mode & MBFL_HAN2ZEN_SPECIAL) {
		switch (c) {
		case '\\':
		case 0xA5:   /* YEN SIGN */
			return 0xFFE5; /* FULLWIDTH YEN SIGN */
		case 0x7E:
		case 0x203E:
			return 0xFFE3; /* FULLWIDTH MACRON */
		case '\'':
			return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
		case '"':
			return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
		default:
			break;
		}
	}

	/* Fullwidth forms -> ASCII; FULLWIDTH " ' \ are deliberately left alone */
	if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
		return c - 0xFEE0;
	}
	if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
		return c - 0xFEE0;
	}
	if ((mode & MBFL_ZEN2HAN_NUMERIC) && c >= 0xFF10 && c <= 0xFF19) {
		return c - 0xFEE0;
	}
	if ((mode & MBFL_ZEN2HAN_SPACE) && c == 0x3000) {
		return ' ';
	}
	if ((mode & MBFL_ZEN2HAN_ALL) && c == 0x2212) { /* MINUS SIGN */
		return '-';
	}

	if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
		/* Fullwidth kana -> halfwidth kana; both scripts share one table,
		 * indexed from the start of their respective Unicode ranges */
		int idx = -1;
		if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
			idx = c - 0x30A1;
		} else if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
			idx = c - 0x3041;
		}
		if (idx >= 0) {
			if (zenkana2hankana_table[idx][1]) {
				*second = 0xFF00 + zenkana2hankana_table[idx][1];
			}
			return 0xFF00 + zenkana2hankana_table[idx][0];
		}

		/* Punctuation and sound marks are converted under either flag */
		switch (c) {
		case 0x3001: return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
		case 0x3002: return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
		case 0x300C: return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
		case 0x300D: return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
		case 0x309B: return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
		case 0x309C: return 0xFF9F; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
		case 0x30FC: return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
		case 0x30FB: return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
		default:     break;
		}
	}

	/* Hiragana <-> katakana; the two blocks are 0x60 apart.
	 * This is still tried when a fullwidth->halfwidth kana flag is set but did
	 * not match above. */
	if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
		return c + 0x60;
	}
	if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
		return c - 0x60;
	}

	/* Special symbol -> ASCII */
	if (mode & MBFL_ZEN2HAN_SPECIAL) {
		switch (c) {
		case 0xFFE5: /* FULLWIDTH YEN SIGN */
		case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
			return '\\';
		case 0xFFE3: /* FULLWIDTH MACRON */
		case 0x203E: /* OVERLINE */
			return '~';
		case 0x2018: /* LEFT SINGLE QUOTATION MARK */
		case 0x2019: /* RIGHT SINGLE QUOTATION MARK */
			return '\'';
		case 0x201C: /* LEFT DOUBLE QUOTATION MARK */
		case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */
			return '"';
		default:
			break;
		}
	}

	return c;
}
/* Convert `input` (text in `encoding`) with the kana transforms selected by
 * `mode`, returning a new string in the same encoding.
 *
 * The input is decoded in chunks of up to 64 codepoints. Each codepoint is
 * passed to mb_convert_kana_codepoint together with the one which follows it,
 * so the last codepoint of a chunk cannot be converted until the next chunk
 * has been decoded; it is carried over in wchar_buf[0] instead.
 */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int buf_offset = 0;
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			if (in_len) {
				continue;
			}
			/* The rest of the input decoded to nothing (e.g. a trailing escape
			 * sequence). This is the final iteration, so the encoder must still
			 * be told that the input has ended; otherwise a stateful output
			 * encoding would never emit its closing sequence. */
			goto emit_converted_kana;
		}

		for (size_t i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*converted++ = second;
			}
			if (consumed) {
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		/* The last argument tells the encoder whether this is the end of the text */
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
/* The 17 single-character option letters: eight uppercase letters, their
 * lowercase counterparts, and 'V'.
 * NOTE(review): presumably entry N names the option for mode bit N (1 << N) of
 * the MBFL_* flags - confirm against the option parser in mb_convert_kana. */
char mb_convert_kana_flags[17] = {
	[0]  = 'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
	[8]  = 'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
	[16] = 'V'
};
/* {{{ Conversion between full-width character and half-width character (Japanese) */
/* Conversion between full-width characters and half-width characters (Japanese) */
PHP_FUNCTION(mb_convert_kana)
{
int opt;
mbfl_string string, result, *ret;
char *optstr = NULL, *string_val;
unsigned int opt;
char *optstr = NULL;
size_t optstr_len;
zend_string *encname = NULL;
zend_string *encname = NULL, *str;
ZEND_PARSE_PARAMETERS_START(1, 3)
Z_PARAM_STRING(string_val, string.len)
Z_PARAM_STR(str)
Z_PARAM_OPTIONAL
Z_PARAM_STRING(optstr, optstr_len)
Z_PARAM_STR_OR_NULL(encname)
ZEND_PARSE_PARAMETERS_END();
string.val = (unsigned char*)string_val;
if (optstr != NULL) {
char *p = optstr, *e = p + optstr_len;
opt = 0;
next_option:
while (p < e) {
/* Walk through option string and convert to bit vector
* See mbfilter_tl_jisx0201_jisx0208.h for the values used */
* See translit_kana_jisx0201_jisx0208.h for the values used */
char c = *p++;
if (c == 'A') {
opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
@ -2936,19 +3190,13 @@ next_option:
opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
}
/* encoding */
string.encoding = php_mb_get_encoding(encname, 3);
if (!string.encoding) {
const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
if (!enc) {
RETURN_THROWS();
}
ret = mbfl_ja_jp_hantozen(&string, &result, opt);
ZEND_ASSERT(ret != NULL);
// TODO: avoid reallocation ???
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
efree(ret->val);
RETVAL_STR(jp_kana_convert(str, enc, opt));
}
/* }}} */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
{

View File

@ -132,6 +132,17 @@ try {
echo $e->getMessage() . "\n";
}
// Regression test: Two codepoints collapsed into one, just one position
// before the end of the string
$converted = mb_convert_kana("\xb9\xde\xde", 'HV', 'JIS');
if ($converted !== "\x1b\$B\$2!+\x1b(B")
echo "Failed! Expected " . bin2hex("\x1b\$B\$2!+\x1b(B") . ", got: " . bin2hex($converted) . "\n";
// Regression test: the old implementation of mb_convert_kana would swallow
// zero bytes in some cases
if (mb_convert_kana("abc\x00abc", 'c', 'ASCII') !== "abc\x00abc")
echo "mb_convert_kana is swallowing zero bytes!\n";
?>
--EXPECT--
'A': ァアィイゥウェエォオカガキギク => ァアィイゥウェエォオカガキギク