mirror of
https://github.com/php/php-src.git
synced 2024-11-27 03:44:07 +08:00
New implementation of mb_convert_kana
mb_convert_kana now uses the new text encoding conversion filters. Microbenchmarking shows speed gains of 50%-150% across various text encodings and input string lengths. The behavior is the same as the old mb_convert_kana except for one fix: if the 'zero codepoint' U+0000 appeared in the input, the old implementation would sometimes drop it, not passing it through to the output. This is now fixed.
This commit is contained in:
parent
840423dffa
commit
9ac49c0dd3
@ -118,7 +118,6 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [
|
||||
libmbfl/filters/mbfilter_sjis_mobile.c
|
||||
libmbfl/filters/mbfilter_sjis_mac.c
|
||||
libmbfl/filters/mbfilter_sjis_2004.c
|
||||
libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c
|
||||
libmbfl/filters/mbfilter_ucs2.c
|
||||
libmbfl/filters/mbfilter_ucs4.c
|
||||
libmbfl/filters/mbfilter_uhc.c
|
||||
|
@ -28,8 +28,7 @@ if (PHP_MBSTRING != "no") {
|
||||
mbfilter_utf8_mobile.c mbfilter_uuencode.c \
|
||||
mbfilter_cp5022x.c mbfilter_sjis_mobile.c \
|
||||
mbfilter_sjis_mac.c \
|
||||
mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c \
|
||||
mbfilter_tl_jisx0201_jisx0208.c", "mbstring");
|
||||
mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c", "mbstring");
|
||||
|
||||
ADD_SOURCES("ext/mbstring/libmbfl/mbfl", "mbfilter.c mbfilter_8bit.c \
|
||||
mbfilter_pass.c mbfilter_wchar.c mbfl_convert.c mbfl_encoding.c \
|
||||
|
@ -25,11 +25,11 @@
|
||||
#include "mbfilter.h"
|
||||
#include "mbfilter_cp5022x.h"
|
||||
#include "mbfilter_jis.h"
|
||||
#include "mbfilter_tl_jisx0201_jisx0208.h"
|
||||
|
||||
#include "unicode_table_cp932_ext.h"
|
||||
#include "unicode_table_jis.h"
|
||||
#include "cp932_table.h"
|
||||
#include "translit_kana_jisx0201_jisx0208.h"
|
||||
|
||||
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
|
||||
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter);
|
||||
@ -40,6 +40,9 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
|
||||
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
|
||||
/* See mbstring.c */
|
||||
/* `mode` is a bitfield of MBFL_HAN2ZEN_* / MBFL_ZEN2HAN_* flags. It is declared `unsigned int`
 * so that this prototype is type-compatible with the definition in mbstring.c */
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode);
|
||||
|
||||
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
|
||||
* This was just CP50220, but the implementation was less strict regarding
|
||||
* invalid characters; it would silently pass some through
|
||||
@ -336,7 +339,7 @@ static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
|
||||
bool consumed = false;
|
||||
|
||||
if (filter->cache) {
|
||||
int s = mbfl_convert_kana(filter->cache, c, &consumed, NULL, mode);
|
||||
int s = mb_convert_kana_codepoint(filter->cache, c, &consumed, NULL, mode);
|
||||
filter->cache = consumed ? 0 : c;
|
||||
/* Terrible hack to get CP50220 to emit error markers in the proper
|
||||
* position, not reordering them with subsequent characters */
|
||||
@ -359,7 +362,7 @@ static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
|
||||
int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
|
||||
|
||||
if (filter->cache) {
|
||||
int s = mbfl_convert_kana(filter->cache, 0, NULL, NULL, mode);
|
||||
int s = mb_convert_kana_codepoint(filter->cache, 0, NULL, NULL, mode);
|
||||
mbfl_filt_conv_wchar_cp50221(s, filter);
|
||||
filter->cache = 0;
|
||||
}
|
||||
@ -866,7 +869,7 @@ reprocess_codepoint:
|
||||
buf->state |= w << 8;
|
||||
break;
|
||||
} else {
|
||||
w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
|
||||
w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
|
||||
}
|
||||
|
||||
if (consumed) {
|
||||
|
@ -1,252 +0,0 @@
|
||||
/*
|
||||
* "streamable kanji code filter and converter"
|
||||
* Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
|
||||
*
|
||||
* LICENSE NOTICES
|
||||
*
|
||||
* This file is part of "streamable kanji code filter and converter",
|
||||
* which is distributed under the terms of GNU Lesser General Public
|
||||
* License (version 2) as published by the Free Software Foundation.
|
||||
*
|
||||
* This software is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with "streamable kanji code filter and converter";
|
||||
* if not, write to the Free Software Foundation, Inc., 59 Temple Place,
|
||||
* Suite 330, Boston, MA 02111-1307 USA
|
||||
*
|
||||
* The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include "mbfilter_tl_jisx0201_jisx0208.h"
|
||||
#include "translit_kana_jisx0201_jisx0208.h"
|
||||
|
||||
/* Apply various transforms to input codepoint, such as converting halfwidth katakana
|
||||
* to fullwidth katakana. `mode` is a bitfield which controls which transforms are
|
||||
* actually performed. The bit values are defined in mbfilter_tl_jisx0201_jisx0208.h.
|
||||
* `mode` must not call for transforms which are inverses (i.e. which would cancel
|
||||
* each other out).
|
||||
*
|
||||
* In some cases, successive input codepoints may be merged into one output codepoint.
|
||||
* (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
|
||||
* and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
|
||||
* will not be modified. If there is no following codepoint, `next` should be zero.
|
||||
*
|
||||
* Again, in some cases, one input codepoint may convert to two output codepoints.
|
||||
* If so, the second output codepoint will be stored in `*second`.
|
||||
*
|
||||
* Return the resulting codepoint. If none of the requested transforms apply, return
|
||||
* the input codepoint unchanged.
|
||||
*/
|
||||
int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
|
||||
{
|
||||
if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7d && c != '"' && c != '\'' && c != '\\') {
|
||||
return c + 0xfee0;
|
||||
} else if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
|
||||
return c + 0xfee0;
|
||||
} else if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
|
||||
return c + 0xfee0;
|
||||
} else if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
|
||||
return 0x3000;
|
||||
}
|
||||
|
||||
if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
|
||||
/* Convert Hankaku kana to Zenkaku kana
|
||||
* Either all Hankaku kana (including katakana and hiragana) will be converted
|
||||
* to Zenkaku katakana, or to Zenkaku hiragana */
|
||||
if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
|
||||
if (c >= 0xff61 && c <= 0xff9f) {
|
||||
int n = c - 0xff60;
|
||||
if (next >= 0xff61 && next <= 0xff9f) {
|
||||
if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
|
||||
*consumed = true;
|
||||
return 0x3001 + hankana2zenkana_table[n];
|
||||
} else if (next == 0xff9e && n == 19) {
|
||||
*consumed = true;
|
||||
return 0x30f4;
|
||||
} else if (next == 0xff9f && n >= 42 && n <= 46) {
|
||||
*consumed = true;
|
||||
return 0x3002 + hankana2zenkana_table[n];
|
||||
}
|
||||
}
|
||||
|
||||
return 0x3000 + hankana2zenkana_table[n];
|
||||
}
|
||||
} else if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
|
||||
if (c >= 0xff61 && c <= 0xff9f) {
|
||||
int n = c - 0xff60;
|
||||
if (next >= 0xff61 && next <= 0xff9f) {
|
||||
if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
|
||||
*consumed = true;
|
||||
return 0x3001 + hankana2zenhira_table[n];
|
||||
} else if (next == 0xff9f && n >= 42 && n <= 46) {
|
||||
*consumed = true;
|
||||
return 0x3002 + hankana2zenhira_table[n];
|
||||
}
|
||||
}
|
||||
|
||||
return 0x3000 + hankana2zenhira_table[n];
|
||||
}
|
||||
} else if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xff61 && c <= 0xff9f) {
|
||||
return 0x3000 + hankana2zenkana_table[c - 0xff60];
|
||||
} else if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xff61 && c <= 0xff9f) {
|
||||
return 0x3000 + hankana2zenhira_table[c - 0xff60];
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
|
||||
if (c == 0x5c) {
|
||||
return 0xffe5; /* FULLWIDTH YEN SIGN */
|
||||
} else if (c == 0xa5) { /* YEN SIGN */
|
||||
return 0xffe5; /* FULLWIDTH YEN SIGN */
|
||||
} else if (c == 0x7e) {
|
||||
return 0xffe3; /* FULLWIDTH MACRON */
|
||||
} else if (c == 0x203e) { /* OVERLINE */
|
||||
return 0xffe3; /* FULLWIDTH MACRON */
|
||||
} else if (c == 0x27) {
|
||||
return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
|
||||
} else if (c == 0x22) {
|
||||
return 0x201d; /* RIGHT DOUBLE QUOTATION MARK */
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
|
||||
/* Zenkaku to Hankaku */
|
||||
if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xff01 && c <= 0xff5d && c != 0xff02 && c != 0xff07 && c != 0xff3c) {
|
||||
/* all except " ' \ ~ */
|
||||
return c - 0xfee0;
|
||||
} else if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xff21 && c <= 0xff3a) || (c >= 0xff41 && c <= 0xff5a))) {
|
||||
return c - 0xfee0;
|
||||
} else if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xff10 && c <= 0xff19)) {
|
||||
return c - 0xfee0;
|
||||
} else if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
|
||||
return 0x20;
|
||||
} else if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
|
||||
return 0x2d;
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
|
||||
/* Zenkaku kana to hankaku kana */
|
||||
if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30a1 && c <= 0x30f4) {
|
||||
/* Zenkaku katakana to hankaku kana */
|
||||
int n = c - 0x30a1;
|
||||
if (zenkana2hankana_table[n][1]) {
|
||||
*second = 0xff00 + zenkana2hankana_table[n][1];
|
||||
}
|
||||
return 0xff00 + zenkana2hankana_table[n][0];
|
||||
} else if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
|
||||
/* Zenkaku hiragana to hankaku kana */
|
||||
int n = c - 0x3041;
|
||||
if (zenkana2hankana_table[n][1]) {
|
||||
*second = 0xff00 + zenkana2hankana_table[n][1];
|
||||
}
|
||||
return 0xff00 + zenkana2hankana_table[n][0];
|
||||
} else if (c == 0x3001) {
|
||||
return 0xff64; /* HALFWIDTH IDEOGRAPHIC COMMA */
|
||||
} else if (c == 0x3002) {
|
||||
return 0xff61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
|
||||
} else if (c == 0x300c) {
|
||||
return 0xff62; /* HALFWIDTH LEFT CORNER BRACKET */
|
||||
} else if (c == 0x300d) {
|
||||
return 0xff63; /* HALFWIDTH RIGHT CORNER BRACKET */
|
||||
} else if (c == 0x309b) {
|
||||
return 0xff9e; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
|
||||
} else if (c == 0x309c) {
|
||||
return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
|
||||
} else if (c == 0x30fc) {
|
||||
return 0xff70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
|
||||
} else if (c == 0x30fb) {
|
||||
return 0xff65; /* HALFWIDTH KATAKANA MIDDLE DOT */
|
||||
}
|
||||
} else if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
|
||||
if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309d || c == 0x309e)) {
|
||||
/* Zenkaku hiragana to Zenkaku katakana */
|
||||
return c + 0x60;
|
||||
} else if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30a1 && c <= 0x30f3) || c == 0x30fd || c == 0x30fe)) {
|
||||
/* Zenkaku katakana to Zenkaku hiragana */
|
||||
return c - 0x60;
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
|
||||
if (c == 0xffe5) { /* FULLWIDTH YEN SIGN */
|
||||
return 0x5c;
|
||||
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
|
||||
return 0x5c;
|
||||
} else if (c == 0xffe3) { /* FULLWIDTH MACRON */
|
||||
return 0x7e;
|
||||
} else if (c == 0x203e) { /* OVERLINE */
|
||||
return 0x7e;
|
||||
} else if (c == 0x2018) { /* LEFT SINGLE QUOTATION MARK*/
|
||||
return 0x27;
|
||||
} else if (c == 0x2019) { /* RIGHT SINGLE QUOTATION MARK */
|
||||
return 0x27;
|
||||
} else if (c == 0x201c) { /* LEFT DOUBLE QUOTATION MARK */
|
||||
return 0x22;
|
||||
} else if (c == 0x201d) { /* RIGHT DOUBLE QUOTATION MARK */
|
||||
return 0x22;
|
||||
}
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/* Per-codepoint callback of the kana transliteration filter.
 *
 * One codepoint is always held back in `filt->cache`, so that it can be converted together
 * with the codepoint which follows it (mbfl_convert_kana may merge the two into one).
 * A cache value of zero means "nothing held back"; for that reason U+0000 itself can never
 * be cached and must be written to the output directly.
 *
 * `c` is the incoming codepoint; the conversion mode bitfield is carried in `filt->opaque`.
 * Always returns 0.
 */
int mbfl_filt_tl_jisx0201_jisx0208(int c, mbfl_convert_filter *filt)
{
	int mode = (intptr_t)filt->opaque, second = 0;
	bool consumed = false;

	if (filt->cache) {
		int s = mbfl_convert_kana(filt->cache, c, &consumed, &second, mode);
		/* If the new codepoint is U+0000 and was not merged into the held one, storing
		 * it in the cache would make it indistinguishable from an empty cache and it
		 * would silently disappear. Remember to emit it right after the held codepoint */
		bool pending_nul = !consumed && c == 0;

		filt->cache = consumed ? 0 : c;
		(*filt->output_function)(s, filt->data);
		if (second) {
			(*filt->output_function)(second, filt->data);
		}
		if (pending_nul) {
			(*filt->output_function)(0, filt->data);
		}
	} else if (c == 0) {
		/* This case has to be handled separately, since `filt->cache == 0` means no
		 * codepoint is cached */
		(*filt->output_function)(0, filt->data);
	} else {
		filt->cache = c;
	}

	return 0;
}
|
||||
|
||||
/* Flush callback of the kana transliteration filter: convert and emit the codepoint which is
 * still held back in `filt->cache` (if any), then pass the flush on to the next stage.
 * Returns whatever the downstream flush function returns, or 0 if there is none. */
int mbfl_filt_tl_jisx0201_jisx0208_flush(mbfl_convert_filter *filt)
{
	if (filt->cache) {
		int extra = 0;
		/* No following codepoint exists, hence `next` is 0 and `consumed` is not needed */
		int converted = mbfl_convert_kana(filt->cache, 0, NULL, &extra, (int)(intptr_t)filt->opaque);

		(*filt->output_function)(converted, filt->data);
		if (extra) {
			(*filt->output_function)(extra, filt->data);
		}
		filt->cache = 0;
	}

	return filt->flush_function ? (*filt->flush_function)(filt->data) : 0;
}
|
||||
|
||||
/* Conversion-filter vtable for the kana transliteration filter. Both encoding slots name
 * wchar, and the two callbacks are the per-codepoint function and the flush function
 * defined above in this file.
 * NOTE(review): the initializer is positional; the meaning of each slot (including the two
 * NULL entries) is fixed by struct mbfl_convert_vtbl, which is declared elsewhere - confirm
 * against that declaration before reordering or adding entries. */
const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208 = {
|
||||
mbfl_no_encoding_wchar,
|
||||
mbfl_no_encoding_wchar,
|
||||
mbfl_filt_conv_common_ctor,
|
||||
NULL,
|
||||
mbfl_filt_tl_jisx0201_jisx0208,
|
||||
mbfl_filt_tl_jisx0201_jisx0208_flush,
|
||||
NULL,
|
||||
};
|
@ -1,56 +0,0 @@
|
||||
/*
|
||||
* "streamable kanji code filter and converter"
|
||||
* Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
|
||||
*
|
||||
* LICENSE NOTICES
|
||||
*
|
||||
* This file is part of "streamable kanji code filter and converter",
|
||||
* which is distributed under the terms of GNU Lesser General Public
|
||||
* License (version 2) as published by the Free Software Foundation.
|
||||
*
|
||||
* This software is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with "streamable kanji code filter and converter";
|
||||
* if not, write to the Free Software Foundation, Inc., 59 Temple Place,
|
||||
* Suite 330, Boston, MA 02111-1307 USA
|
||||
*
|
||||
* The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MBFILTER_TL_KANA_JISX0201_JISX0208_H
|
||||
#define MBFILTER_TL_KANA_JISX0201_JISX0208_H
|
||||
|
||||
#include "mbfl_convert.h"
|
||||
|
||||
/* "Zen" is 全, or "full"; "Han" is 半, or "half"
|
||||
* This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */
|
||||
#define MBFL_HAN2ZEN_ALL 0x00001
|
||||
#define MBFL_HAN2ZEN_ALPHA 0x00002
|
||||
#define MBFL_HAN2ZEN_NUMERIC 0x00004
|
||||
#define MBFL_HAN2ZEN_SPACE 0x00008
|
||||
#define MBFL_HAN2ZEN_KATAKANA 0x00010
|
||||
#define MBFL_HAN2ZEN_HIRAGANA 0x00020
|
||||
#define MBFL_HAN2ZEN_SPECIAL 0x00040
|
||||
#define MBFL_ZENKAKU_HIRA2KATA 0x00080
|
||||
|
||||
#define MBFL_ZEN2HAN_ALL 0x00100
|
||||
#define MBFL_ZEN2HAN_ALPHA 0x00200
|
||||
#define MBFL_ZEN2HAN_NUMERIC 0x00400
|
||||
#define MBFL_ZEN2HAN_SPACE 0x00800
|
||||
#define MBFL_ZEN2HAN_KATAKANA 0x01000
|
||||
#define MBFL_ZEN2HAN_HIRAGANA 0x02000
|
||||
#define MBFL_ZEN2HAN_SPECIAL 0x04000
|
||||
#define MBFL_ZENKAKU_KATA2HIRA 0x08000
|
||||
|
||||
#define MBFL_HAN2ZEN_GLUE 0x10000
|
||||
|
||||
extern const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208;
|
||||
|
||||
int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode);
|
||||
|
||||
#endif /* MBFILTER_TL_KANA_JISX0201_JISX0208_H */
|
@ -25,6 +25,28 @@
|
||||
#ifndef TRANSLIT_KANA_JISX0201_JISX0208_H
|
||||
#define TRANSLIT_KANA_JISX0201_JISX0208_H
|
||||
|
||||
/* "Zen" is 全, or "full"; "Han" is 半, or "half"
|
||||
* This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */
|
||||
#define MBFL_HAN2ZEN_ALL 0x00001
|
||||
#define MBFL_HAN2ZEN_ALPHA 0x00002
|
||||
#define MBFL_HAN2ZEN_NUMERIC 0x00004
|
||||
#define MBFL_HAN2ZEN_SPACE 0x00008
|
||||
#define MBFL_HAN2ZEN_KATAKANA 0x00010
|
||||
#define MBFL_HAN2ZEN_HIRAGANA 0x00020
|
||||
#define MBFL_HAN2ZEN_SPECIAL 0x00040
|
||||
#define MBFL_ZENKAKU_HIRA2KATA 0x00080
|
||||
|
||||
#define MBFL_ZEN2HAN_ALL 0x00100
|
||||
#define MBFL_ZEN2HAN_ALPHA 0x00200
|
||||
#define MBFL_ZEN2HAN_NUMERIC 0x00400
|
||||
#define MBFL_ZEN2HAN_SPACE 0x00800
|
||||
#define MBFL_ZEN2HAN_KATAKANA 0x01000
|
||||
#define MBFL_ZEN2HAN_HIRAGANA 0x02000
|
||||
#define MBFL_ZEN2HAN_SPECIAL 0x04000
|
||||
#define MBFL_ZENKAKU_KATA2HIRA 0x08000
|
||||
|
||||
#define MBFL_HAN2ZEN_GLUE 0x10000
|
||||
|
||||
static const unsigned char hankana2zenkana_table[64] = {
|
||||
0x00,0x02,0x0C,0x0D,0x01,0xFB,0xF2,0xA1,0xA3,0xA5,
|
||||
0xA7,0xA9,0xE3,0xE5,0xE7,0xC3,0xFC,0xA2,0xA4,0xA6,
|
||||
|
@ -91,7 +91,6 @@
|
||||
#include "filters/mbfilter_base64.h"
|
||||
#include "filters/mbfilter_qprint.h"
|
||||
#include "filters/mbfilter_singlebyte.h"
|
||||
#include "filters/mbfilter_tl_jisx0201_jisx0208.h"
|
||||
#include "filters/mbfilter_utf8.h"
|
||||
|
||||
#include "eaw_table.h"
|
||||
@ -1391,86 +1390,6 @@ mbfl_strimwidth(
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Run `string` through a three-stage filter chain and collect the output in `result`:
 *
 *   bytes -> [to_wchar] -> [kana transliteration, controlled by `mode`] -> [from_wchar] -> memory device
 *
 * The chain is built back to front, since each stage needs its successor's callbacks.
 * `result` is initialized and given the same encoding as `string`. If any filter cannot be
 * created, `result` is returned as initialized, without converted data.
 */
mbfl_string *
mbfl_ja_jp_hantozen(
	mbfl_string *string,
	mbfl_string *result,
	int mode)
{
	mbfl_memory_device sink;
	mbfl_convert_filter *from_wchar = NULL; /* last stage: writes into `sink` */
	mbfl_convert_filter *kana = NULL;       /* middle stage: transliteration */
	mbfl_convert_filter *to_wchar = NULL;   /* first stage: receives the input bytes */

	mbfl_memory_device_init(&sink, string->len, 0);
	mbfl_string_init(result);
	result->encoding = string->encoding;

	from_wchar = mbfl_convert_filter_new(
		&mbfl_encoding_wchar,
		string->encoding,
		mbfl_memory_device_output, 0, &sink);

	if (from_wchar != NULL) {
		kana = mbfl_convert_filter_new2(
			&vtbl_tl_jisx0201_jisx0208,
			(int(*)(int, void*))from_wchar->filter_function,
			(flush_function_t)from_wchar->filter_flush,
			from_wchar);
	}

	if (kana != NULL) {
		/* The transliteration filter reads its mode bitfield back out of `opaque` */
		kana->opaque = (void*)((intptr_t)mode);

		to_wchar = mbfl_convert_filter_new(
			string->encoding,
			&mbfl_encoding_wchar,
			(int(*)(int, void*))kana->filter_function,
			(flush_function_t)kana->filter_flush,
			kana);
	}

	if (to_wchar != NULL) {
		/* Feed the input one byte at a time; stop early if the filter reports an error */
		if (string->val != NULL) {
			unsigned char *cur = string->val;
			size_t remaining;

			for (remaining = string->len; remaining > 0; remaining--) {
				if ((*to_wchar->filter_function)(*cur++, to_wchar) < 0) {
					break;
				}
			}
		}

		mbfl_convert_filter_flush(to_wchar);
		result = mbfl_memory_device_result(&sink, result);
	}

	/* Release whichever stages were created */
	if (kana != NULL) {
		mbfl_convert_filter_delete(kana);
	}
	if (from_wchar != NULL) {
		mbfl_convert_filter_delete(from_wchar);
	}
	if (to_wchar != NULL) {
		mbfl_convert_filter_delete(to_wchar);
	}

	return result;
}
|
||||
|
||||
|
||||
/*
|
||||
* MIME header encode
|
||||
|
@ -288,10 +288,4 @@ mbfl_mime_header_decode(
|
||||
mbfl_string *result,
|
||||
const mbfl_encoding *outcode);
|
||||
|
||||
/*
|
||||
* convert of halfwidth and fullwidth for japanese
|
||||
*/
|
||||
MBFLAPI extern mbfl_string *
|
||||
mbfl_ja_jp_hantozen(mbfl_string *string, mbfl_string *result, int mode);
|
||||
|
||||
#endif /* MBFL_MBFILTER_H */
|
||||
|
@ -40,8 +40,8 @@
|
||||
#include "libmbfl/filters/mbfilter_uuencode.h"
|
||||
#include "libmbfl/filters/mbfilter_ucs4.h"
|
||||
#include "libmbfl/filters/mbfilter_utf8.h"
|
||||
#include "libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h"
|
||||
#include "libmbfl/filters/mbfilter_singlebyte.h"
|
||||
#include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
|
||||
|
||||
#include "php_variables.h"
|
||||
#include "php_globals.h"
|
||||
@ -2838,37 +2838,291 @@ PHP_FUNCTION(mb_decode_mimeheader)
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
/* Apply various transforms to input codepoint, such as converting halfwidth katakana
|
||||
* to fullwidth katakana. `mode` is a bitfield which controls which transforms are
|
||||
* actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
|
||||
* `mode` must not call for transforms which are inverses (i.e. which would cancel
|
||||
* each other out).
|
||||
*
|
||||
* In some cases, successive input codepoints may be merged into one output codepoint.
|
||||
* (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
|
||||
* and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
|
||||
* will not be modified. If there is no following codepoint, `next` should be zero.
|
||||
*
|
||||
* Again, in some cases, one input codepoint may convert to two output codepoints.
|
||||
* If so, the second output codepoint will be stored in `*second`.
|
||||
*
|
||||
* Return the resulting codepoint. If none of the requested transforms apply, return
|
||||
* the input codepoint unchanged.
|
||||
*/
|
||||
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
|
||||
{
|
||||
if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
|
||||
return c + 0xFEE0;
|
||||
}
|
||||
if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
|
||||
return c + 0xFEE0;
|
||||
}
|
||||
if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
|
||||
return c + 0xFEE0;
|
||||
}
|
||||
if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
|
||||
return 0x3000;
|
||||
}
|
||||
|
||||
if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
|
||||
/* Convert Hankaku kana to Zenkaku kana
|
||||
* Either all Hankaku kana (including katakana and hiragana) will be converted
|
||||
* to Zenkaku katakana, or to Zenkaku hiragana */
|
||||
if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
|
||||
if (c >= 0xFF61 && c <= 0xFF9F) {
|
||||
int n = c - 0xFF60;
|
||||
|
||||
if (next >= 0xFF61 && next <= 0xFF9F) {
|
||||
if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
|
||||
*consumed = true;
|
||||
return 0x3001 + hankana2zenkana_table[n];
|
||||
}
|
||||
if (next == 0xFF9E && n == 19) {
|
||||
*consumed = true;
|
||||
return 0x30F4;
|
||||
}
|
||||
if (next == 0xFF9F && n >= 42 && n <= 46) {
|
||||
*consumed = true;
|
||||
return 0x3002 + hankana2zenkana_table[n];
|
||||
}
|
||||
}
|
||||
|
||||
return 0x3000 + hankana2zenkana_table[n];
|
||||
}
|
||||
}
|
||||
if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
|
||||
if (c >= 0xFF61 && c <= 0xFF9F) {
|
||||
int n = c - 0xFF60;
|
||||
|
||||
if (next >= 0xFF61 && next <= 0xFF9F) {
|
||||
if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
|
||||
*consumed = true;
|
||||
return 0x3001 + hankana2zenhira_table[n];
|
||||
}
|
||||
if (next == 0xFF9F && n >= 42 && n <= 46) {
|
||||
*consumed = true;
|
||||
return 0x3002 + hankana2zenhira_table[n];
|
||||
}
|
||||
}
|
||||
|
||||
return 0x3000 + hankana2zenhira_table[n];
|
||||
}
|
||||
}
|
||||
if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
|
||||
return 0x3000 + hankana2zenkana_table[c - 0xFF60];
|
||||
}
|
||||
if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
|
||||
return 0x3000 + hankana2zenhira_table[c - 0xFF60];
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
|
||||
if (c == '\\' || c == 0xA5) { /* YEN SIGN */
|
||||
return 0xFFE5; /* FULLWIDTH YEN SIGN */
|
||||
}
|
||||
if (c == 0x7E || c == 0x203E) {
|
||||
return 0xFFE3; /* FULLWIDTH MACRON */
|
||||
}
|
||||
if (c == '\'') {
|
||||
return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
|
||||
}
|
||||
if (c == '"') {
|
||||
return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
|
||||
/* Zenkaku to Hankaku */
|
||||
if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
|
||||
/* all except " ' \ ~ */
|
||||
return c - 0xFEE0;
|
||||
}
|
||||
if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
|
||||
return c - 0xFEE0;
|
||||
}
|
||||
if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
|
||||
return c - 0xFEE0;
|
||||
}
|
||||
if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
|
||||
return ' ';
|
||||
}
|
||||
if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
|
||||
return '-';
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
|
||||
/* Zenkaku kana to hankaku kana */
|
||||
if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
|
||||
/* Zenkaku katakana to hankaku kana */
|
||||
int n = c - 0x30A1;
|
||||
if (zenkana2hankana_table[n][1]) {
|
||||
*second = 0xFF00 + zenkana2hankana_table[n][1];
|
||||
}
|
||||
return 0xFF00 + zenkana2hankana_table[n][0];
|
||||
}
|
||||
if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
|
||||
/* Zenkaku hiragana to hankaku kana */
|
||||
int n = c - 0x3041;
|
||||
if (zenkana2hankana_table[n][1]) {
|
||||
*second = 0xFF00 + zenkana2hankana_table[n][1];
|
||||
}
|
||||
return 0xFF00 + zenkana2hankana_table[n][0];
|
||||
}
|
||||
if (c == 0x3001) {
|
||||
return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
|
||||
}
|
||||
if (c == 0x3002) {
|
||||
return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
|
||||
}
|
||||
if (c == 0x300C) {
|
||||
return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
|
||||
}
|
||||
if (c == 0x300D) {
|
||||
return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
|
||||
}
|
||||
if (c == 0x309B) {
|
||||
return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
|
||||
}
|
||||
if (c == 0x309C) {
|
||||
return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
|
||||
}
|
||||
if (c == 0x30FC) {
|
||||
return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
|
||||
}
|
||||
if (c == 0x30FB) {
|
||||
return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
|
||||
if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
|
||||
/* Zenkaku hiragana to Zenkaku katakana */
|
||||
return c + 0x60;
|
||||
}
|
||||
if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
|
||||
/* Zenkaku katakana to Zenkaku hiragana */
|
||||
return c - 0x60;
|
||||
}
|
||||
}
|
||||
|
||||
if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
|
||||
if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
|
||||
return '\\';
|
||||
}
|
||||
if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
|
||||
return '~';
|
||||
}
|
||||
if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
|
||||
return '\'';
|
||||
}
|
||||
if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
|
||||
return '"';
|
||||
}
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/* Apply the kana/width conversions selected by `mode` (a bitfield of MBFL_HAN2ZEN_* and
 * MBFL_ZEN2HAN_* flags) to `input`, which is encoded as `encoding`, and return the result
 * built by re-encoding with the same `encoding`.
 *
 * The input is decoded in batches of up to 64 codepoints. Because a codepoint may be merged
 * with the one following it, the last codepoint of each batch is not converted immediately;
 * it is carried over to the front of the next batch, unless the input is exhausted or it was
 * itself consumed by a merge.
 */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
	/* Each wchar may potentially expand to 2 when we perform kana conversion...
	 * if we are converting zenkaku kana to hankaku kana
	 * Make the buffer for converted kana big enough that we never need to
	 * perform bounds checks */
	uint32_t wchar_buf[64], converted_buf[64 * 2];
	unsigned int buf_offset = 0; /* 1 if wchar_buf[0] holds a carried-over codepoint, else 0 */
	unsigned int state = 0;
	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
	size_t in_len = ZSTR_LEN(input);

	mb_convert_buf buf;
	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));

	while (in_len) {
		uint32_t *converted = converted_buf;
		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
		 * previous iteration, don't overwrite it */
		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
		out_len += buf_offset;
		ZEND_ASSERT(out_len <= 64);

		if (!out_len) {
			if (in_len) {
				continue;
			}
			/* The rest of the input decoded to no codepoints at all. Leaving the loop
			 * here would mean the encoder is never told that the input has ended, so
			 * still make the final `from_wchar` call (with zero codepoints).
			 * NOTE(review): assumes `from_wchar` accepts an empty batch - confirm */
			goto emit_converted_kana;
		}

		/* Convert everything but the last codepoint, each with its successor as lookahead.
		 * `i` is a size_t so that it is compared against `out_len` without a sign mismatch */
		for (size_t i = 0; i < out_len-1; i++) {
			uint32_t second = 0;
			bool consumed = false;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
			if (second) {
				*converted++ = second;
			}
			if (consumed) {
				i++;
				if (i == out_len-1) {
					/* We consumed two codepoints at the very end of the wchar buffer
					 * So there is nothing remaining to reprocess on the next iteration */
					buf_offset = 0;
					goto emit_converted_kana;
				}
			}
		}

		if (!in_len) {
			/* This is the last iteration, so we need to process the final codepoint now */
			uint32_t second = 0;
			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
			if (second) {
				*converted++ = second;
			}
		} else {
			/* Reprocess the last codepoint on the next iteration */
			wchar_buf[0] = wchar_buf[out_len-1];
			buf_offset = 1;
		}

emit_converted_kana:
		/* The final argument tells the encoder whether this is the last batch */
		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
	}

	return mb_convert_buf_result(&buf);
}
|
||||
|
||||
/* Option letters accepted in the mode string of mb_convert_kana.
 * NOTE(review): there are 17 entries and 17 mode bits (0x00001 through 0x10000) defined in
 * translit_kana_jisx0201_jisx0208.h, so entry i presumably names the letter for bit i -
 * confirm against the code which reads this table (not visible here). */
char mb_convert_kana_flags[17] = {
|
||||
'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
|
||||
'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
|
||||
'V'
|
||||
};
|
||||
|
||||
/* {{{ Conversion between full-width character and half-width character (Japanese) */
|
||||
/* Conversion between full-width characters and half-width characters (Japanese) */
|
||||
PHP_FUNCTION(mb_convert_kana)
|
||||
{
|
||||
int opt;
|
||||
mbfl_string string, result, *ret;
|
||||
char *optstr = NULL, *string_val;
|
||||
unsigned int opt;
|
||||
char *optstr = NULL;
|
||||
size_t optstr_len;
|
||||
zend_string *encname = NULL;
|
||||
zend_string *encname = NULL, *str;
|
||||
|
||||
ZEND_PARSE_PARAMETERS_START(1, 3)
|
||||
Z_PARAM_STRING(string_val, string.len)
|
||||
Z_PARAM_STR(str)
|
||||
Z_PARAM_OPTIONAL
|
||||
Z_PARAM_STRING(optstr, optstr_len)
|
||||
Z_PARAM_STR_OR_NULL(encname)
|
||||
ZEND_PARSE_PARAMETERS_END();
|
||||
|
||||
string.val = (unsigned char*)string_val;
|
||||
|
||||
if (optstr != NULL) {
|
||||
char *p = optstr, *e = p + optstr_len;
|
||||
opt = 0;
|
||||
next_option:
|
||||
while (p < e) {
|
||||
/* Walk through option string and convert to bit vector
|
||||
* See mbfilter_tl_jisx0201_jisx0208.h for the values used */
|
||||
* See translit_kana_jisx0201_jisx0208.h for the values used */
|
||||
char c = *p++;
|
||||
if (c == 'A') {
|
||||
opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
|
||||
@ -2936,19 +3190,13 @@ next_option:
|
||||
opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
|
||||
}
|
||||
|
||||
/* encoding */
|
||||
string.encoding = php_mb_get_encoding(encname, 3);
|
||||
if (!string.encoding) {
|
||||
const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
|
||||
if (!enc) {
|
||||
RETURN_THROWS();
|
||||
}
|
||||
|
||||
ret = mbfl_ja_jp_hantozen(&string, &result, opt);
|
||||
ZEND_ASSERT(ret != NULL);
|
||||
// TODO: avoid reallocation ???
|
||||
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
|
||||
efree(ret->val);
|
||||
RETVAL_STR(jp_kana_convert(str, enc, opt));
|
||||
}
|
||||
/* }}} */
|
||||
|
||||
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
|
||||
{
|
||||
|
@ -132,6 +132,17 @@ try {
|
||||
echo $e->getMessage() . "\n";
|
||||
}
|
||||
|
||||
// Regression test: Two codepoints collapsed into one, just one position
|
||||
// before the end of the string
|
||||
$converted = mb_convert_kana("\xb9\xde\xde", 'HV', 'JIS');
|
||||
if ($converted !== "\x1b\$B\$2!+\x1b(B")
|
||||
echo "Failed! Expected " . bin2hex("\x1b\$B\$2!+\x1b(B") . ", got: " . bin2hex($converted) . "\n";
|
||||
|
||||
// Regression test: the old implementation of mb_convert_kana would swallow
|
||||
// zero bytes in some cases
|
||||
if (mb_convert_kana("abc\x00abc", 'c', 'ASCII') !== "abc\x00abc")
|
||||
echo "mb_convert_kana is swallowing zero bytes!\n";
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
'A': ァアィイゥウェエォオカガキギク => ァアィイゥウェエォオカガキギク
|
||||
|
Loading…
Reference in New Issue
Block a user