From 0bdbc4e356d81fd2fd7e1f2391e24e76c2f3b8f6 Mon Sep 17 00:00:00 2001
From: Scott MacVicar
Date: Fri, 2 Jan 2009 03:02:22 +0000
Subject: [PATCH] MFH Fix bug #46944 - UTF-8 characters outside the BMP aren't encoded correctly.

---
 ext/json/tests/bug46944.phpt | 32 ++++++++++++++++++++++++++++++++
 ext/json/utf8_decode.c | 2 +-
 ext/json/utf8_to_utf16.c | 2 +-
 3 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 ext/json/tests/bug46944.phpt

diff --git a/ext/json/tests/bug46944.phpt b/ext/json/tests/bug46944.phpt
new file mode 100644
index 00000000000..735de044357
--- /dev/null
+++ b/ext/json/tests/bug46944.phpt
@@ -0,0 +1,32 @@
+--TEST--
+Bug #46944 (json_encode() doesn't handle 3 byte utf8 correctly)
+--SKIPIF--
+
+--FILE--
+> 2)) . (0x8f|($i & 3) << 4) . "\xbf\xbdzz") . "\n";
+}
+
+
+echo "Done\n";
+?>
+--EXPECT--
+"aa\ud83f\udffdzz"
+"aa\ud87f\udffdzz"
+"aa\ud8bf\udffdzz"
+"aa\ud8ff\udffdzz"
+"aa\ud93f\udffdzz"
+"aa\ud97f\udffdzz"
+"aa\ud9bf\udffdzz"
+"aa\ud9ff\udffdzz"
+"aa\uda3f\udffdzz"
+"aa\uda7f\udffdzz"
+"aa\udabf\udffdzz"
+"aa\udaff\udffdzz"
+"aa\udb3f\udffdzz"
+"aa\udb7f\udffdzz"
+"aa\udbbf\udffdzz"
+"aa\udbff\udffdzz"
+Done
diff --git a/ext/json/utf8_decode.c b/ext/json/utf8_decode.c
index cea1f8cec8d..2d0422bedb6 100644
--- a/ext/json/utf8_decode.c
+++ b/ext/json/utf8_decode.c
@@ -165,7 +165,7 @@ utf8_decode_next(json_utf8_decode *utf8)
 /*
 Three continuation (65536 to 1114111)
 */
- if ((c & 0xF1) == 0xF0) {
+ if ((c & 0xF8) == 0xF0) {
 int c1 = cont(utf8);
 int c2 = cont(utf8);
 int c3 = cont(utf8);
diff --git a/ext/json/utf8_to_utf16.c b/ext/json/utf8_to_utf16.c
index 42ea9e5d8eb..599f0e13b48 100644
--- a/ext/json/utf8_to_utf16.c
+++ b/ext/json/utf8_to_utf16.c
@@ -46,7 +46,7 @@ utf8_to_utf16(unsigned short w[], char p[], int length)
 w[the_index] = (unsigned short)c;
 the_index += 1;
 } else {
- c &= 0xFFFF;
+ c -= 0x10000;
 w[the_index] = (unsigned short)(0xD800 | (c >> 10));
 the_index += 1;
 w[the_index] = (unsigned short)(0xDC00 | (c & 0x3FF));