New implementation of mb_convert_kana

mb_convert_kana now uses the new text encoding conversion filters. Microbenchmarking shows speed gains of 50%-150% across various text encodings and input string lengths. The behavior is the same as the old mb_convert_kana except for one fix: if the 'zero codepoint' U+0000 appeared in the input, the old implementation would sometimes drop it, not passing it through to the output. This is now fixed.
2024-11-27 03:44:07 +08:00 · 2022-07-12 21:00:35 +02:00 · 2022-07-12 21:00:35 +02:00 · 9ac49c0dd3
commit 9ac49c0dd3
parent 840423dffa
10 changed files with 308 additions and 421 deletions
--- a/ext/mbstring/config.m4
+++ b/ext/mbstring/config.m4
@ -118,7 +118,6 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [
    libmbfl/filters/mbfilter_sjis_mobile.c
    libmbfl/filters/mbfilter_sjis_mac.c
    libmbfl/filters/mbfilter_sjis_2004.c
-    libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c
    libmbfl/filters/mbfilter_ucs2.c
    libmbfl/filters/mbfilter_ucs4.c
    libmbfl/filters/mbfilter_uhc.c
--- a/ext/mbstring/config.w32
+++ b/ext/mbstring/config.w32
@ -28,8 +28,7 @@ if (PHP_MBSTRING != "no") {
 			mbfilter_utf8_mobile.c mbfilter_uuencode.c \
 			mbfilter_cp5022x.c mbfilter_sjis_mobile.c \
 			mbfilter_sjis_mac.c \
-			mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c \
-			mbfilter_tl_jisx0201_jisx0208.c", "mbstring");
+			mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c", "mbstring");

 		ADD_SOURCES("ext/mbstring/libmbfl/mbfl", "mbfilter.c mbfilter_8bit.c \
 			mbfilter_pass.c mbfilter_wchar.c mbfl_convert.c mbfl_encoding.c \
--- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
@ -25,11 +25,11 @@
 #include "mbfilter.h"
 #include "mbfilter_cp5022x.h"
 #include "mbfilter_jis.h"
-#include "mbfilter_tl_jisx0201_jisx0208.h"

 #include "unicode_table_cp932_ext.h"
 #include "unicode_table_jis.h"
 #include "cp932_table.h"
+#include "translit_kana_jisx0201_jisx0208.h"

 static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
 static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter);
@ -40,6 +40,9 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
 static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
 static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);

+/* Defined in mbstring.c; this prototype must stay identical to the definition there */
+uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode);
+
 /* Previously, a dubious 'encoding' called 'cp50220raw' was supported
 * This was just CP50220, but the implementation was less strict regarding
 * invalid characters; it would silently pass some through
@ -336,7 +339,7 @@ static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
 	bool consumed = false;

 	if (filter->cache) {
-		int s = mbfl_convert_kana(filter->cache, c, &consumed, NULL, mode);
+		int s = mb_convert_kana_codepoint(filter->cache, c, &consumed, NULL, mode);
 		filter->cache = consumed ? 0 : c;
 		/* Terrible hack to get CP50220 to emit error markers in the proper
 		 * position, not reordering them with subsequent characters */
@ -359,7 +362,7 @@ static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
 	int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;

 	if (filter->cache) {
-		int s = mbfl_convert_kana(filter->cache, 0, NULL, NULL, mode);
+		int s = mb_convert_kana_codepoint(filter->cache, 0, NULL, NULL, mode);
 		mbfl_filt_conv_wchar_cp50221(s, filter);
 		filter->cache = 0;
 	}
@ -866,7 +869,7 @@ reprocess_codepoint:
 			buf->state |= w << 8;
 			break;
 		} else {
-			w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
+			w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
 		}

 		if (consumed) {
--- a/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c
@ -1,252 +0,0 @@
-/*
- * "streamable kanji code filter and converter"
- * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
- *
- * LICENSE NOTICES
- *
- * This file is part of "streamable kanji code filter and converter",
- * which is distributed under the terms of GNU Lesser General Public
- * License (version 2) as published by the Free Software Foundation.
- *
- * This software is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with "streamable kanji code filter and converter";
- * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
- * Suite 330, Boston, MA  02111-1307  USA
- *
- * The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
- *
- */
-
-#include <stdint.h>
-#include <stdbool.h>
-#include "mbfilter_tl_jisx0201_jisx0208.h"
-#include "translit_kana_jisx0201_jisx0208.h"
-
-/* Apply various transforms to input codepoint, such as converting halfwidth katakana
- * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
- * actually performed. The bit values are defined in mbfilter_tl_jisx0201_jix0208.h.
- * `mode` must not call for transforms which are inverses (i.e. which would cancel
- * each other out).
- *
- * In some cases, successive input codepoints may be merged into one output codepoint.
- * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
- * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
- * will not be modified. If there is no following codepoint, `next` should be zero.
- *
- * Again, in some cases, one input codepoint may convert to two output codepoints.
- * If so, the second output codepoint will be stored in `*second`.
- *
- * Return the resulting codepoint. If none of the requested transforms apply, return
- * the input codepoint unchanged.
- */
-int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
-{
-	if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7d && c != '"' && c != '\'' && c != '\\') {
-		return c + 0xfee0;
-	} else if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
-		return c + 0xfee0;
-	} else if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
-		return c + 0xfee0;
-	} else if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
-		return 0x3000;
-	}
-
-	if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
-		/* Convert Hankaku kana to Zenkaku kana
-		 * Either all Hankaku kana (including katakana and hiragana) will be converted
-		 * to Zenkaku katakana, or to Zenkaku hiragana */
-		if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
-			if (c >= 0xff61 && c <= 0xff9f) {
-				int n = c - 0xff60;
-				if (next >= 0xff61 && next <= 0xff9f) {
-					if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
-						*consumed = true;
-						return 0x3001 + hankana2zenkana_table[n];
-					} else if (next == 0xff9e && n == 19) {
-						*consumed = true;
-						return 0x30f4;
-					} else if (next == 0xff9f && n >= 42 && n <= 46) {
-						*consumed = true;
-						return 0x3002 + hankana2zenkana_table[n];
-					}
-				}
-
-				return 0x3000 + hankana2zenkana_table[n];
-			}
-		} else if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
-			if (c >= 0xff61 && c <= 0xff9f) {
-				int n = c - 0xff60;
-				if (next >= 0xff61 && next <= 0xff9f) {
-					if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
-						*consumed = true;
-						return 0x3001 + hankana2zenhira_table[n];
-					} else if (next == 0xff9f && n >= 42 && n <= 46) {
-						*consumed = true;
-						return 0x3002 + hankana2zenhira_table[n];
-					}
-				}
-
-				return 0x3000 + hankana2zenhira_table[n];
-			}
-		} else if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xff61 && c <= 0xff9f) {
-			return 0x3000 + hankana2zenkana_table[c - 0xff60];
-		} else if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xff61 && c <= 0xff9f) {
-			return 0x3000 + hankana2zenhira_table[c - 0xff60];
-		}
-	}
-
-	if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
-		if (c == 0x5c) {
-			return 0xffe5; /* FULLWIDTH YEN SIGN */
-		} else if (c == 0xa5) { /* YEN SIGN */
-			return 0xffe5; /* FULLWIDTH YEN SIGN */
-		} else if (c == 0x7e) {
-			return 0xffe3; /* FULLWIDTH MACRON */
-		} else if (c == 0x203e) { /* OVERLINE */
-			return 0xffe3; /* FULLWIDTH MACRON */
-		} else if (c == 0x27) {
-			return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
-		} else if (c == 0x22) {
-			return 0x201d; /* RIGHT DOUBLE QUOTATION MARK */
-		}
-	}
-
-	if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
-		/* Zenkaku to Hankaku */
-		if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xff01 && c <= 0xff5d && c != 0xff02 && c != 0xff07 && c != 0xff3c) {
-			/* all except " ' \ ~ */
-			return c - 0xfee0;
-		} else if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xff21 && c <= 0xff3a) || (c >= 0xff41 && c <= 0xff5a))) {
-			return c - 0xfee0;
-		} else if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xff10 && c <= 0xff19)) {
-			return c - 0xfee0;
-		} else if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
-			return 0x20;
-		} else if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
-			return 0x2d;
-		}
-	}
-
-	if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
-		/* Zenkaku kana to hankaku kana */
-		if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30a1 && c <= 0x30f4) {
-			/* Zenkaku katakana to hankaku kana */
-			int n = c - 0x30a1;
-			if (zenkana2hankana_table[n][1]) {
-				*second = 0xff00 + zenkana2hankana_table[n][1];
-			}
-			return 0xff00 + zenkana2hankana_table[n][0];
-		} else if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
-			/* Zenkaku hiragana to hankaku kana */
-			int n = c - 0x3041;
-			if (zenkana2hankana_table[n][1]) {
-				*second = 0xff00 + zenkana2hankana_table[n][1];
-			}
-			return 0xff00 + zenkana2hankana_table[n][0];
-		} else if (c == 0x3001) {
-			return 0xff64; /* HALFWIDTH IDEOGRAPHIC COMMA */
-		} else if (c == 0x3002) {
-			return 0xff61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
-		} else if (c == 0x300c) {
-			return 0xff62; /* HALFWIDTH LEFT CORNER BRACKET */
-		} else if (c == 0x300d) {
-			return 0xff63; /* HALFWIDTH RIGHT CORNER BRACKET */
-		} else if (c == 0x309b) {
-			return 0xff9e; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
-		} else if (c == 0x309c) {
-			return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
-		} else if (c == 0x30fc) {
-			return 0xff70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
-		} else if (c == 0x30fb) {
-			return 0xff65; /* HALFWIDTH KATAKANA MIDDLE DOT */
-		}
-	} else if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
-		if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309d || c == 0x309e)) {
-			/* Zenkaku hiragana to Zenkaku katakana */
-			return c + 0x60;
-		} else if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30a1 && c <= 0x30f3) || c == 0x30fd || c == 0x30fe)) {
-			/* Zenkaku katakana to Zenkaku hiragana */
-			return c - 0x60;
-		}
-	}
-
-	if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
-		if (c == 0xffe5) { /* FULLWIDTH YEN SIGN */
-			return 0x5c;
-		} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
-			return 0x5c;
-		} else if (c == 0xffe3) { /* FULLWIDTH MACRON */
-			return 0x7e;
-		} else if (c == 0x203e) { /* OVERLINE */
-			return 0x7e;
-		} else if (c == 0x2018) { /* LEFT SINGLE QUOTATION MARK*/
-			return 0x27;
-		} else if (c == 0x2019) { /* RIGHT SINGLE QUOTATION MARK */
-			return 0x27;
-		} else if (c == 0x201c) { /* LEFT DOUBLE QUOTATION MARK */
-			return 0x22;
-		} else if (c == 0x201d) { /* RIGHT DOUBLE QUOTATION MARK */
-			return 0x22;
-		}
-	}
-
-	return c;
-}
-
-int mbfl_filt_tl_jisx0201_jisx0208(int c, mbfl_convert_filter *filt)
-{
-	int mode = (intptr_t)filt->opaque, second = 0;
-	bool consumed = false;
-
-	if (filt->cache) {
-		int s = mbfl_convert_kana(filt->cache, c, &consumed, &second, mode);
-		filt->cache = consumed ? 0 : c;
-		(*filt->output_function)(s, filt->data);
-		if (second) {
-			(*filt->output_function)(second, filt->data);
-		}
-	} else if (c == 0) {
-		/* This case has to be handled separately, since `filt->cache == 0` means no
-		 * codepoint is cached */
-		(*filt->output_function)(0, filt->data);
-	} else {
-		filt->cache = c;
-	}
-
-	return 0;
-}
-
-int mbfl_filt_tl_jisx0201_jisx0208_flush(mbfl_convert_filter *filt)
-{
-	int mode = (intptr_t)filt->opaque, second = 0;
-
-	if (filt->cache) {
-		int s = mbfl_convert_kana(filt->cache, 0, NULL, &second, mode);
-		(*filt->output_function)(s, filt->data);
-		if (second) {
-			(*filt->output_function)(second, filt->data);
-		}
-		filt->cache = 0;
-	}
-
-	if (filt->flush_function) {
-		return (*filt->flush_function)(filt->data);
-	}
-
-	return 0;
-}
-
-const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208 = {
-	mbfl_no_encoding_wchar,
-	mbfl_no_encoding_wchar,
-	mbfl_filt_conv_common_ctor,
-	NULL,
-	mbfl_filt_tl_jisx0201_jisx0208,
-	mbfl_filt_tl_jisx0201_jisx0208_flush,
-	NULL,
-};
--- a/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h
+++ b/ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h
@ -1,56 +0,0 @@
-/*
- * "streamable kanji code filter and converter"
- * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
- *
- * LICENSE NOTICES
- *
- * This file is part of "streamable kanji code filter and converter",
- * which is distributed under the terms of GNU Lesser General Public
- * License (version 2) as published by the Free Software Foundation.
- *
- * This software is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with "streamable kanji code filter and converter";
- * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
- * Suite 330, Boston, MA  02111-1307  USA
- *
- * The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
- *
- */
-
-#ifndef MBFILTER_TL_KANA_JISX0201_JISX0208_H
-#define MBFILTER_TL_KANA_JISX0201_JISX0208_H
-
-#include "mbfl_convert.h"
-
-/* "Zen" is 全, or "full"; "Han" is 半, or "half"
- * This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */
-#define MBFL_HAN2ZEN_ALL       0x00001
-#define MBFL_HAN2ZEN_ALPHA     0x00002
-#define MBFL_HAN2ZEN_NUMERIC   0x00004
-#define MBFL_HAN2ZEN_SPACE     0x00008
-#define MBFL_HAN2ZEN_KATAKANA  0x00010
-#define MBFL_HAN2ZEN_HIRAGANA  0x00020
-#define MBFL_HAN2ZEN_SPECIAL   0x00040
-#define MBFL_ZENKAKU_HIRA2KATA 0x00080
-
-#define MBFL_ZEN2HAN_ALL       0x00100
-#define MBFL_ZEN2HAN_ALPHA     0x00200
-#define MBFL_ZEN2HAN_NUMERIC   0x00400
-#define MBFL_ZEN2HAN_SPACE     0x00800
-#define MBFL_ZEN2HAN_KATAKANA  0x01000
-#define MBFL_ZEN2HAN_HIRAGANA  0x02000
-#define MBFL_ZEN2HAN_SPECIAL   0x04000
-#define MBFL_ZENKAKU_KATA2HIRA 0x08000
-
-#define MBFL_HAN2ZEN_GLUE      0x10000
-
-extern const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208;
-
-int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode);
-
-#endif /* MBFILTER_TL_KANA_JISX0201_JISX0208_H */
--- a/ext/mbstring/libmbfl/filters/translit_kana_jisx0201_jisx0208.h
+++ b/ext/mbstring/libmbfl/filters/translit_kana_jisx0201_jisx0208.h
@ -25,6 +25,28 @@
 #ifndef TRANSLIT_KANA_JISX0201_JISX0208_H
 #define TRANSLIT_KANA_JISX0201_JISX0208_H

+/* "Zen" is 全, or "full"; "Han" is 半, or "half"
+ * This refers to "fullwidth" or "halfwidth" variants of characters used for writing Japanese */
+#define MBFL_HAN2ZEN_ALL       0x00001
+#define MBFL_HAN2ZEN_ALPHA     0x00002
+#define MBFL_HAN2ZEN_NUMERIC   0x00004
+#define MBFL_HAN2ZEN_SPACE     0x00008
+#define MBFL_HAN2ZEN_KATAKANA  0x00010
+#define MBFL_HAN2ZEN_HIRAGANA  0x00020
+#define MBFL_HAN2ZEN_SPECIAL   0x00040
+#define MBFL_ZENKAKU_HIRA2KATA 0x00080
+
+#define MBFL_ZEN2HAN_ALL       0x00100
+#define MBFL_ZEN2HAN_ALPHA     0x00200
+#define MBFL_ZEN2HAN_NUMERIC   0x00400
+#define MBFL_ZEN2HAN_SPACE     0x00800
+#define MBFL_ZEN2HAN_KATAKANA  0x01000
+#define MBFL_ZEN2HAN_HIRAGANA  0x02000
+#define MBFL_ZEN2HAN_SPECIAL   0x04000
+#define MBFL_ZENKAKU_KATA2HIRA 0x08000
+
+#define MBFL_HAN2ZEN_GLUE      0x10000
+
 static const unsigned char hankana2zenkana_table[64] = {
 	0x00,0x02,0x0C,0x0D,0x01,0xFB,0xF2,0xA1,0xA3,0xA5,
 	0xA7,0xA9,0xE3,0xE5,0xE7,0xC3,0xFC,0xA2,0xA4,0xA6,
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c
@ -91,7 +91,6 @@
 #include "filters/mbfilter_base64.h"
 #include "filters/mbfilter_qprint.h"
 #include "filters/mbfilter_singlebyte.h"
-#include "filters/mbfilter_tl_jisx0201_jisx0208.h"
 #include "filters/mbfilter_utf8.h"

 #include "eaw_table.h"
@ -1391,86 +1390,6 @@ mbfl_strimwidth(
 	return result;
 }

-mbfl_string *
-mbfl_ja_jp_hantozen(
-    mbfl_string *string,
-    mbfl_string *result,
-    int mode)
-{
-	size_t n;
-	unsigned char *p;
-	mbfl_memory_device device;
-	mbfl_convert_filter *decoder = NULL;
-	mbfl_convert_filter *encoder = NULL;
-	mbfl_convert_filter *tl_filter = NULL;
-	mbfl_convert_filter *next_filter = NULL;
-
-	mbfl_memory_device_init(&device, string->len, 0);
-	mbfl_string_init(result);
-
-	result->encoding = string->encoding;
-
-	decoder = mbfl_convert_filter_new(
-		&mbfl_encoding_wchar,
-		string->encoding,
-		mbfl_memory_device_output, 0, &device);
-	if (decoder == NULL) {
-		goto out;
-	}
-	next_filter = decoder;
-
-	tl_filter = mbfl_convert_filter_new2(
-		&vtbl_tl_jisx0201_jisx0208,
-		(int(*)(int, void*))next_filter->filter_function,
-		(flush_function_t)next_filter->filter_flush,
-		next_filter);
-	if (tl_filter == NULL) {
-		goto out;
-	}
-
-	tl_filter->opaque = (void*)((intptr_t)mode);
-	next_filter = tl_filter;
-
-	encoder = mbfl_convert_filter_new(
-		string->encoding,
-		&mbfl_encoding_wchar,
-		(int(*)(int, void*))next_filter->filter_function,
-		(flush_function_t)next_filter->filter_flush,
-		next_filter);
-	if (encoder == NULL) {
-		goto out;
-	}
-
-	/* feed data */
-	p = string->val;
-	n = string->len;
-	if (p != NULL) {
-		while (n > 0) {
-			if ((*encoder->filter_function)(*p++, encoder) < 0) {
-				break;
-			}
-			n--;
-		}
-	}
-
-	mbfl_convert_filter_flush(encoder);
-	result = mbfl_memory_device_result(&device, result);
-out:
-	if (tl_filter != NULL) {
-		mbfl_convert_filter_delete(tl_filter);
-	}
-
-	if (decoder != NULL) {
-		mbfl_convert_filter_delete(decoder);
-	}
-
-	if (encoder != NULL) {
-		mbfl_convert_filter_delete(encoder);
-	}
-
-	return result;
-}
-

 /*
 *  MIME header encode
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.h
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.h
@ -288,10 +288,4 @@ mbfl_mime_header_decode(
    mbfl_string *result,
    const mbfl_encoding *outcode);

-/*
- * convert of halfwidth and fullwidth for japanese
- */
-MBFLAPI extern mbfl_string *
-mbfl_ja_jp_hantozen(mbfl_string *string, mbfl_string *result, int mode);
-
 #endif	/* MBFL_MBFILTER_H */
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@ -40,8 +40,8 @@
 #include "libmbfl/filters/mbfilter_uuencode.h"
 #include "libmbfl/filters/mbfilter_ucs4.h"
 #include "libmbfl/filters/mbfilter_utf8.h"
-#include "libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h"
 #include "libmbfl/filters/mbfilter_singlebyte.h"
+#include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"

 #include "php_variables.h"
 #include "php_globals.h"
@ -2838,37 +2838,291 @@ PHP_FUNCTION(mb_decode_mimeheader)
 }
 /* }}} */

+/* Apply various transforms to input codepoint, such as converting halfwidth katakana
+ * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
+ * actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
+ * `mode` must not call for transforms which are inverses (i.e. which would cancel
+ * each other out).
+ *
+ * With MBFL_HAN2ZEN_GLUE set alongside MBFL_HAN2ZEN_KATAKANA/HIRAGANA, a halfwidth kana
+ * followed by a (semi-)voiced sound mark in `next` may be merged into one codepoint;
+ * `*consumed` is then set to true and the caller must skip `next`. Otherwise `*consumed`
+ * is not modified (nor dereferenced). If there is no following codepoint, `next` should be zero.
+ *
+ * With MBFL_ZEN2HAN_KATAKANA or MBFL_ZEN2HAN_HIRAGANA, one input codepoint may convert
+ * to two; the second is stored in `*second`, which is not written (nor dereferenced) otherwise.
+ *
+ * Return the resulting codepoint. If none of the requested transforms apply, return
+ * the input codepoint unchanged.
+ */
+uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
+{
+	if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
+		return c + 0xFEE0;
+	}
+	if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
+		return c + 0xFEE0;
+	}
+	if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
+		return c + 0xFEE0;
+	}
+	if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
+		return 0x3000;
+	}
+
+	if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
+		/* Convert Hankaku kana to Zenkaku kana
+		 * Either all Hankaku kana (including katakana and hiragana) will be converted
+		 * to Zenkaku katakana, or to Zenkaku hiragana */
+		if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
+			if (c >= 0xFF61 && c <= 0xFF9F) {
+				int n = c - 0xFF60;
+
+				if (next >= 0xFF61 && next <= 0xFF9F) {
+					if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
+						*consumed = true;
+						return 0x3001 + hankana2zenkana_table[n];
+					}
+					if (next == 0xFF9E && n == 19) {
+						*consumed = true;
+						return 0x30F4;
+					}
+					if (next == 0xFF9F && n >= 42 && n <= 46) {
+						*consumed = true;
+						return 0x3002 + hankana2zenkana_table[n];
+					}
+				}
+
+				return 0x3000 + hankana2zenkana_table[n];
+			}
+		}
+		if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
+			if (c >= 0xFF61 && c <= 0xFF9F) {
+				int n = c - 0xFF60;
+
+				if (next >= 0xFF61 && next <= 0xFF9F) {
+					if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
+						*consumed = true;
+						return 0x3001 + hankana2zenhira_table[n];
+					}
+					if (next == 0xFF9F && n >= 42 && n <= 46) {
+						*consumed = true;
+						return 0x3002 + hankana2zenhira_table[n];
+					}
+				}
+
+				return 0x3000 + hankana2zenhira_table[n];
+			}
+		}
+		if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
+			return 0x3000 + hankana2zenkana_table[c - 0xFF60];
+		}
+		if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
+			return 0x3000 + hankana2zenhira_table[c - 0xFF60];
+		}
+	}
+
+	if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
+		if (c == '\\' || c == 0xA5) { /* YEN SIGN */
+			return 0xFFE5; /* FULLWIDTH YEN SIGN */
+		}
+		if (c == 0x7E || c == 0x203E) {
+			return 0xFFE3; /* FULLWIDTH MACRON */
+		}
+		if (c == '\'') {
+			return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
+		}
+		if (c == '"') {
+			return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
+		}
+	}
+
+	if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
+		/* Zenkaku to Hankaku */
+		if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
+			/* all except " ' \ ~ */
+			return c - 0xFEE0;
+		}
+		if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
+			return c - 0xFEE0;
+		}
+		if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
+			return c - 0xFEE0;
+		}
+		if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
+			return ' ';
+		}
+		if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
+			return '-';
+		}
+	}
+
+	if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
+		/* Zenkaku kana to hankaku kana */
+		if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
+			/* Zenkaku katakana to hankaku kana */
+			int n = c - 0x30A1;
+			if (zenkana2hankana_table[n][1]) {
+				*second = 0xFF00 + zenkana2hankana_table[n][1];
+			}
+			return 0xFF00 + zenkana2hankana_table[n][0];
+		}
+		if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
+			/* Zenkaku hiragana to hankaku kana */
+			int n = c - 0x3041;
+			if (zenkana2hankana_table[n][1]) {
+				*second = 0xFF00 + zenkana2hankana_table[n][1];
+			}
+			return 0xFF00 + zenkana2hankana_table[n][0];
+		}
+		if (c == 0x3001) {
+			return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
+		}
+		if (c == 0x3002) {
+			return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
+		}
+		if (c == 0x300C) {
+			return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
+		}
+		if (c == 0x300D) {
+			return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
+		}
+		if (c == 0x309B) {
+			return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
+		}
+		if (c == 0x309C) {
+			return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
+		}
+		if (c == 0x30FC) {
+			return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+		}
+		if (c == 0x30FB) {
+			return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
+		}
+	}
+
+	if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
+		if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
+			/* Zenkaku hiragana to Zenkaku katakana */
+			return c + 0x60;
+		}
+		if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
+			/* Zenkaku katakana to Zenkaku hiragana */
+			return c - 0x60;
+		}
+	}
+
+	if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
+		if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
+			return '\\';
+		}
+		if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
+			return '~';
+		}
+		if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
+			return '\'';
+		}
+		if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
+			return '"';
+		}
+	}
+
+	return c;
+}
+
+static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
+{
+	/* Decode `input` from `encoding` in chunks of up to 64 codepoints, apply the kana
+	 * transforms selected by `mode`, and re-encode the result in the same encoding.
+	 * Zenkaku to hankaku kana conversion may expand one codepoint into two, so
+	 * converted_buf is twice the size of wchar_buf and never needs bounds checks */
+	uint32_t wchar_buf[64], converted_buf[64 * 2];
+	unsigned int buf_offset = 0;
+	unsigned int state = 0;
+	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
+	size_t in_len = ZSTR_LEN(input);
+
+	mb_convert_buf buf;
+	mb_convert_buf_init(&buf, in_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
+
+	while (in_len) {
+		uint32_t *converted = converted_buf;
+		/* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
+		 * previous iteration, don't overwrite it */
+		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
+		out_len += buf_offset;
+		ZEND_ASSERT(out_len <= 64);
+
+		if (!out_len) {
+			goto emit_converted_kana; /* Nothing decoded, but from_wchar must still see `end` */
+		}
+
+		for (size_t i = 0; i < out_len-1; i++) {
+			uint32_t second = 0;
+			bool consumed = false;
+			*converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
+			if (second) {
+				*converted++ = second;
+			}
+			if (consumed) {
+				i++;
+				if (i == out_len-1) {
+					/* We consumed two codepoints at the very end of the wchar buffer
+					 * So there is nothing remaining to reprocess on the next iteration */
+					buf_offset = 0;
+					goto emit_converted_kana;
+				}
+			}
+		}
+
+		if (!in_len) {
+			/* This is the last iteration, so we need to process the final codepoint now */
+			uint32_t second = 0;
+			*converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
+			if (second) {
+				*converted++ = second;
+			}
+		} else {
+			/* Reprocess the last codepoint on the next iteration */
+			wchar_buf[0] = wchar_buf[out_len-1];
+			buf_offset = 1;
+		}
+
+emit_converted_kana:
+		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
+	}
+
+	return mb_convert_buf_result(&buf);
+}
+
 char mb_convert_kana_flags[17] = {
 	'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
 	'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
 	'V'
 };

-/* {{{ Conversion between full-width character and half-width character (Japanese) */
+/* Conversion between full-width characters and half-width characters (Japanese) */
 PHP_FUNCTION(mb_convert_kana)
 {
-	int opt;
-	mbfl_string string, result, *ret;
-	char *optstr = NULL, *string_val;
+	unsigned int opt;
+	char *optstr = NULL;
 	size_t optstr_len;
-	zend_string *encname = NULL;
+	zend_string *encname = NULL, *str;

 	ZEND_PARSE_PARAMETERS_START(1, 3)
-		Z_PARAM_STRING(string_val, string.len)
+		Z_PARAM_STR(str)
 		Z_PARAM_OPTIONAL
 		Z_PARAM_STRING(optstr, optstr_len)
 		Z_PARAM_STR_OR_NULL(encname)
 	ZEND_PARSE_PARAMETERS_END();

-	string.val = (unsigned char*)string_val;
-
 	if (optstr != NULL) {
 		char *p = optstr, *e = p + optstr_len;
 		opt = 0;
 next_option:
 		while (p < e) {
 			/* Walk through option string and convert to bit vector
-			 * See mbfilter_tl_jisx0201_jisx0208.h for the values used */
+			 * See translit_kana_jisx0201_jisx0208.h for the values used */
 			char c = *p++;
 			if (c == 'A') {
 				opt |= MBFL_HAN2ZEN_ALL | MBFL_HAN2ZEN_ALPHA | MBFL_HAN2ZEN_NUMERIC;
@ -2936,19 +3190,13 @@ next_option:
 		opt = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
 	}

-	/* encoding */
-	string.encoding = php_mb_get_encoding(encname, 3);
-	if (!string.encoding) {
+	const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
+	if (!enc) {
 		RETURN_THROWS();
 	}

-	ret = mbfl_ja_jp_hantozen(&string, &result, opt);
-	ZEND_ASSERT(ret != NULL);
-	// TODO: avoid reallocation ???
-	RETVAL_STRINGL((char *)ret->val, ret->len);		/* the string is already strdup()'ed */
-	efree(ret->val);
+	RETVAL_STR(jp_kana_convert(str, enc, opt));
 }
-/* }}} */

 static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
 {
--- a/ext/mbstring/tests/mb_convert_kana.phpt
+++ b/ext/mbstring/tests/mb_convert_kana.phpt
@ -132,6 +132,17 @@ try {
  echo $e->getMessage() . "\n";
 }

+// Regression test: Two codepoints collapsed into one, just one position
+// before the end of the string
+$converted = mb_convert_kana("\xb9\xde\xde", 'HV', 'JIS');
+if ($converted !== "\x1b\$B\$2!+\x1b(B")
+  echo "Failed! Expected " . bin2hex("\x1b\$B\$2!+\x1b(B") . ", got: " . bin2hex($converted) . "\n";
+
+// Regression test: the old implementation of mb_convert_kana would swallow
+// zero bytes in some cases
+if (mb_convert_kana("abc\x00abc", 'c', 'ASCII') !== "abc\x00abc")
+  echo "mb_convert_kana is swallowing zero bytes!\n";
+
 ?>
 --EXPECT--
 'A': ァアィイゥウェエォオカガキギク => ァアィイゥウェエォオカガキギク