mirror of
https://github.com/php/php-src.git
synced 2024-11-25 10:54:15 +08:00
Test behavior of 'long' illegal character markers
After mb_substitute_character("long"), mbstring responds to erroneous input by inserting 'long' error markers into the output. Depending on the situation, these error markers look like BAD+XXXX (for general bad input), U+XXXX (when the input is valid, but it converts to Unicode codepoints which cannot be represented in the output encoding), or an encoding-specific marker such as JISX+XXXX or W932+XXXX. We have almost no tests for this feature. Add a bunch of tests to ensure that all our legacy encoding handlers work in a reasonable way when 'long' error markers are enabled.
This commit is contained in:
parent
f6f0506c84
commit
51b9d7a5e1
@ -151,9 +151,7 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
|
||||
filter->status = 1;
|
||||
filter->cache = c;
|
||||
} else {
|
||||
w = c & MBFL_WCSGROUP_MASK;
|
||||
w |= MBFL_WCSGROUP_THROUGH;
|
||||
CK((*filter->output_function)(w, filter->data));
|
||||
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
}
|
||||
break;
|
||||
|
||||
@ -215,7 +213,7 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
|
||||
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
|
||||
{
|
||||
if (filter->status) {
|
||||
(*filter->filter_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter);
|
||||
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
|
||||
if (filter->flush_function) {
|
||||
|
@ -148,7 +148,7 @@ int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
|
||||
filter->cache = (c1 << 8) + c - 0xA1;
|
||||
} else {
|
||||
filter->status = filter->cache = 0;
|
||||
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
|
||||
w = 0x8E0000 | ((c1 + 0xA1) << 8) | c | MBFL_WCSGROUP_THROUGH;
|
||||
CK((*filter->output_function)(w, filter->data));
|
||||
}
|
||||
break;
|
||||
@ -179,7 +179,7 @@ int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
|
||||
CK((*filter->output_function)(w, filter->data));
|
||||
} else {
|
||||
filter->status = filter->cache = 0;
|
||||
w = (c1 << 8) | c | 0x8e0000 | MBFL_WCSGROUP_THROUGH;
|
||||
w = ((c1 + 0xA1A1) << 8) | c | MBFL_WCSGROUP_THROUGH;
|
||||
CK((*filter->output_function)(w, filter->data));
|
||||
}
|
||||
break;
|
||||
|
@ -174,9 +174,17 @@ int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
|
||||
static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter)
|
||||
{
|
||||
if (filter->status & 0xF) {
|
||||
/* 2-byte character or escape sequence was truncated */
|
||||
if (filter->status == 2) {
|
||||
CK((*filter->output_function)(0x1B | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
} else if (filter->status == 3) {
|
||||
CK((*filter->output_function)(0x1B24 | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
} else if (filter->status == 4) {
|
||||
CK((*filter->output_function)(0x1B2429 | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
} else {
|
||||
/* 2-byte character was truncated */
|
||||
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
}
|
||||
}
|
||||
|
||||
if (filter->flush_function) {
|
||||
(*filter->flush_function)(filter->data);
|
||||
|
@ -241,7 +241,13 @@ int mbfl_filt_conv_2022jp_mobile_wchar(int c, mbfl_convert_filter *filter)
|
||||
static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter)
|
||||
{
|
||||
if (filter->status & 0xF) {
|
||||
mbfl_filt_conv_illegal_output(filter->cache, filter);
|
||||
if ((filter->status & 0xF) == 2) {
|
||||
(*filter->output_function)(0x1B | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
} else if ((filter->status & 0xF) == 3) {
|
||||
(*filter->output_function)(0x1B24 | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
} else {
|
||||
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
}
|
||||
|
||||
if (filter->flush_function) {
|
||||
|
@ -155,9 +155,7 @@ retry:
|
||||
w = 0;
|
||||
}
|
||||
if (w <= 0) {
|
||||
w = (c1 << 8) | c;
|
||||
w &= MBFL_WCSPLANE_MASK;
|
||||
w |= MBFL_WCSPLANE_JIS0208;
|
||||
w = (c1 << 8) | c | MBFL_WCSPLANE_JIS0208;
|
||||
}
|
||||
} else {
|
||||
if (s >= 0 && s < jisx0212_ucs_table_size) {
|
||||
@ -166,16 +164,12 @@ retry:
|
||||
w = 0;
|
||||
}
|
||||
if (w <= 0) {
|
||||
w = (c1 << 8) | c;
|
||||
w &= MBFL_WCSPLANE_MASK;
|
||||
w |= MBFL_WCSPLANE_JIS0212;
|
||||
w = (c1 << 8) | c | MBFL_WCSPLANE_JIS0212;
|
||||
}
|
||||
}
|
||||
CK((*filter->output_function)(w, filter->data));
|
||||
} else {
|
||||
w = (c1 << 8) | c;
|
||||
w &= MBFL_WCSGROUP_MASK;
|
||||
w |= MBFL_WCSGROUP_THROUGH;
|
||||
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
|
||||
CK((*filter->output_function)(w, filter->data));
|
||||
}
|
||||
break;
|
||||
|
@ -188,14 +188,14 @@ int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
|
||||
s1 = c1 - 0x80;
|
||||
s2 = c - 0x80;
|
||||
} else {
|
||||
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
break;
|
||||
}
|
||||
} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
|
||||
if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
|
||||
SJIS_DECODE(c1, c, s1, s2);
|
||||
} else {
|
||||
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
break;
|
||||
}
|
||||
} else { /* ISO-2022-JP-2004 */
|
||||
@ -203,7 +203,7 @@ int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
|
||||
s1 = c1;
|
||||
s2 = c;
|
||||
} else {
|
||||
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -195,7 +195,7 @@ int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
|
||||
n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
|
||||
if (n >= 0xD800 && n <= 0xDBFF) {
|
||||
/* Wrong; that's the first half of a surrogate pair, not the second */
|
||||
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)(0xD800 | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
filter->cache = n & 0x3FF;
|
||||
filter->status = 2;
|
||||
} else if (n >= 0xDC00 && n <= 0xDFFF) {
|
||||
@ -203,7 +203,7 @@ int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
} else {
|
||||
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)(0xD800 | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
}
|
||||
@ -269,7 +269,7 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
|
||||
case 3:
|
||||
n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
|
||||
if (n >= 0xD800 && n <= 0xDBFF) {
|
||||
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)(0xD800 | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
filter->cache = n & 0x3FF;
|
||||
filter->status = 2;
|
||||
} else if (n >= 0xDC00 && n <= 0xDFFF) {
|
||||
@ -277,7 +277,7 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
} else {
|
||||
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)(0xD800 | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
CK((*filter->output_function)(n, filter->data));
|
||||
filter->status = 0;
|
||||
}
|
||||
@ -316,7 +316,11 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
|
||||
|
||||
if (status) {
|
||||
/* Input string was truncated */
|
||||
if (status == 1) {
|
||||
CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
} else if (status == 2) {
|
||||
CK((*filter->output_function)(0xD800 | cache | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
}
|
||||
}
|
||||
|
||||
if (filter->flush_function) {
|
||||
|
@ -107,8 +107,12 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
|
||||
if (filter->cache) {
|
||||
/* Either we were expecting the 2nd half of a surrogate pair which
|
||||
* never came, or else the last Base64 data was not padded with zeroes */
|
||||
if (filter->cache & 0xfff0000) {
|
||||
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
} else {
|
||||
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
}
|
||||
if (c == '-') {
|
||||
if (filter->status == 1) { /* "+-" -> "+" */
|
||||
CK((*filter->output_function)('+', filter->data));
|
||||
@ -150,13 +154,16 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
|
||||
n = (n & 0x3) << 14;
|
||||
filter->status = 5;
|
||||
if (s >= 0xd800 && s < 0xdc00) {
|
||||
/* 1st part of surrogate pair */
|
||||
if (filter->cache & 0xfff0000) {
|
||||
/* We were waiting for the 2nd part of a surrogate pair */
|
||||
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
s = (((s & 0x3ff) << 16) + 0x400000) | n;
|
||||
filter->cache = s;
|
||||
} else if (s >= 0xdc00 && s < 0xe000) {
|
||||
/* 2nd part of surrogate pair */
|
||||
if (filter->cache & 0xfff0000) {
|
||||
s &= 0x3ff;
|
||||
s |= (filter->cache & 0xfff0000) >> 6;
|
||||
filter->cache = n;
|
||||
@ -165,10 +172,14 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
|
||||
} else { /* illegal character */
|
||||
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
}
|
||||
} else {
|
||||
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
filter->cache = n;
|
||||
}
|
||||
} else {
|
||||
if (filter->cache & 0xfff0000) {
|
||||
/* We were waiting for the 2nd part of a surrogate pair */
|
||||
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
filter->cache = n;
|
||||
CK((*filter->output_function)(s, filter->data));
|
||||
@ -190,11 +201,13 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
|
||||
if (s >= 0xd800 && s < 0xdc00) {
|
||||
if (filter->cache & 0xfff0000) {
|
||||
/* We were waiting for the 2nd part of a surrogate pair */
|
||||
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
s = (((s & 0x3ff) << 16) + 0x400000) | n;
|
||||
filter->cache = s;
|
||||
} else if (s >= 0xdc00 && s < 0xe000) {
|
||||
/* 2nd part of surrogate pair */
|
||||
if (filter->cache & 0xfff0000) {
|
||||
s &= 0x3ff;
|
||||
s |= (filter->cache & 0xfff0000) >> 6;
|
||||
filter->cache = n;
|
||||
@ -203,10 +216,14 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
|
||||
} else { /* illegal character */
|
||||
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
}
|
||||
} else {
|
||||
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
filter->cache = n;
|
||||
}
|
||||
} else {
|
||||
if (filter->cache & 0xfff0000) {
|
||||
/* We were waiting for the 2nd part of a surrogate pair */
|
||||
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
filter->cache = n;
|
||||
CK((*filter->output_function)(s, filter->data));
|
||||
@ -223,25 +240,28 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
|
||||
if (s >= 0xd800 && s < 0xdc00) {
|
||||
if (filter->cache & 0xfff0000) {
|
||||
/* We were waiting for the 2nd part of a surrogate pair */
|
||||
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
s = (((s & 0x3ff) << 16) + 0x400000);
|
||||
filter->cache = s;
|
||||
} else if (s >= 0xdc00 && s < 0xe000) {
|
||||
if (filter->cache & 0xfff0000) {
|
||||
s &= 0x3ff;
|
||||
s |= (filter->cache & 0xfff0000) >> 6;
|
||||
filter->cache = 0;
|
||||
if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
|
||||
CK((*filter->output_function)(s, filter->data));
|
||||
} else { /* illegal character */
|
||||
s &= MBFL_WCSGROUP_MASK;
|
||||
s |= MBFL_WCSGROUP_THROUGH;
|
||||
CK((*filter->output_function)(s, filter->data));
|
||||
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
}
|
||||
} else {
|
||||
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
|
||||
filter->cache = 0;
|
||||
}
|
||||
} else {
|
||||
if (filter->cache & 0xfff0000) {
|
||||
/* We were waiting for the 2nd part of a surrogate pair */
|
||||
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
filter->cache = 0;
|
||||
CK((*filter->output_function)(s, filter->data));
|
||||
@ -261,8 +281,12 @@ static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter)
|
||||
if (filter->cache) {
|
||||
/* Either we were expecting the 2nd half of a surrogate pair which
|
||||
* never came, or else the last Base64 data was not padded with zeroes */
|
||||
if (filter->cache & 0xfff0000) {
|
||||
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
} else {
|
||||
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
|
||||
}
|
||||
}
|
||||
|
||||
if (filter->flush_function) {
|
||||
(*filter->flush_function)(filter->data);
|
||||
|
@ -96,7 +96,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
|
||||
int s, c1;
|
||||
|
||||
retry:
|
||||
switch (filter->status & 0xff) {
|
||||
switch (filter->status) {
|
||||
case 0x00:
|
||||
if (c < 0x80) {
|
||||
CK((*filter->output_function)(c, filter->data));
|
||||
@ -116,15 +116,31 @@ retry:
|
||||
case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
|
||||
case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
|
||||
case 0x32: /* 4byte code 4th char: 0x80-0xbf */
|
||||
filter->status = 0;
|
||||
if (c >= 0x80 && c <= 0xbf) {
|
||||
s = (filter->cache<<6) | (c & 0x3f);
|
||||
filter->cache = 0;
|
||||
filter->status = filter->cache = 0;
|
||||
CK((*filter->output_function)(s, filter->data));
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
|
||||
int status = filter->status;
|
||||
filter->status = 0;
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
if (status == 0x10) {
|
||||
CK(mbfl_filt_put_invalid_char(0xC0 | filter->cache, filter));
|
||||
} else if (status == 0x21) {
|
||||
CK(mbfl_filt_put_invalid_char(0xE080 | ((filter->cache & ~0x3F) << 2) | (filter->cache & 0x3F), filter));
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(0xF08080 | ((filter->cache & ~0xFFF) << 4) | ((filter->cache & 0xFC0) << 2) | (filter->cache & 0x3F), filter));
|
||||
}
|
||||
goto retry;
|
||||
} else {
|
||||
if (status == 0x10) {
|
||||
CK(mbfl_filt_put_invalid_char(0xC000 | (filter->cache << 8) | c, filter));
|
||||
} else if (status == 0x21) {
|
||||
CK(mbfl_filt_put_invalid_char(0xE08000 | ((filter->cache & ~0x3F) << 10) | ((filter->cache & 0x3F) << 8) | c, filter));
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(0x808000 | ((filter->cache & 0xFC0) << 10) | ((filter->cache & 0x3F) << 8) | c, filter));
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
|
||||
@ -138,9 +154,13 @@ retry:
|
||||
filter->cache = s;
|
||||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
CK(mbfl_filt_put_invalid_char(0xE0 | filter->cache, filter));
|
||||
goto retry;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(0xE000 | (filter->cache << 8) | c, filter));
|
||||
filter->status = 0;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
|
||||
@ -154,9 +174,13 @@ retry:
|
||||
filter->cache = s;
|
||||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
CK(mbfl_filt_put_invalid_char(0xF0 | filter->cache, filter));
|
||||
goto retry;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(0xF000 | (filter->cache << 8) | c, filter));
|
||||
filter->status = 0;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
|
||||
@ -164,9 +188,13 @@ retry:
|
||||
filter->cache = (filter->cache<<6) | (c & 0x3f);
|
||||
filter->status++;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
|
||||
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
|
||||
CK(mbfl_filt_put_invalid_char(0xF080 | ((filter->cache & ~0x3F) << 2) | (filter->cache & 0x3F), filter));
|
||||
goto retry;
|
||||
} else {
|
||||
CK(mbfl_filt_put_invalid_char(0xF000 | (filter->cache << 8) | c, filter));
|
||||
filter->status = 0;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@ -184,7 +212,19 @@ int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
|
||||
filter->status = filter->cache = 0;
|
||||
|
||||
if (status) {
|
||||
CK(mbfl_filt_put_invalid_char(cache, filter));
|
||||
if (status == 0x10) {
|
||||
CK(mbfl_filt_put_invalid_char(0xC0 | cache, filter));
|
||||
} else if (status == 0x20) {
|
||||
CK(mbfl_filt_put_invalid_char(0xE0 | cache, filter));
|
||||
} else if (status == 0x21) {
|
||||
CK(mbfl_filt_put_invalid_char(0xE080 | ((cache & ~0x3F) << 2) | (cache & 0x3F), filter));
|
||||
} else if (status == 0x30) {
|
||||
CK(mbfl_filt_put_invalid_char(0xF0 | cache, filter));
|
||||
} else if (status == 0x31) {
|
||||
CK(mbfl_filt_put_invalid_char(0xF080 | ((cache & ~0x3F) << 2) | (cache & 0x3F), filter));
|
||||
} else if (status == 0x32) {
|
||||
CK(mbfl_filt_put_invalid_char(0xF08080 | ((cache & ~0xFFF) << 4) | ((cache & 0xFC0) << 2) | (cache & 0x3F), filter));
|
||||
}
|
||||
}
|
||||
|
||||
if (filter->flush_function) {
|
||||
|
@ -27,7 +27,15 @@ echo "Tested ARMSCII-8 -> UTF-16BE\n";
|
||||
findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
|
||||
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'ARMSCII-8', '%');
|
||||
echo "Tested UTF-16BE -> ARMSCII-8\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xA1", "BAD+A1", "ARMSCII-8", "UTF-8");
|
||||
convertInvalidString("\xFF", "BAD+FF", "ARMSCII-8", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested ARMSCII-8 -> UTF-16BE
|
||||
Tested UTF-16BE -> ARMSCII-8
|
||||
Done!
|
||||
|
@ -33,7 +33,17 @@ testAllValidChars($fromUnicode, 'UTF-16BE', 'BIG5', false);
|
||||
findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
|
||||
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'BIG5', '%');
|
||||
echo "Tested UTF-16BE -> BIG5\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "BIG5", "UTF-8");
|
||||
convertInvalidString("\xB0\x9F", "BAD+B09F", "BIG5", "UTF-8");
|
||||
convertInvalidString("\xA3\xED", "?+A3ED", "BIG5", "UTF-8");
|
||||
convertInvalidString("\x76\x54", "U+7654", "UTF-16BE", "BIG5");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested BIG5 -> UTF-16BE
|
||||
Tested UTF-16BE -> BIG5
|
||||
Done!
|
||||
|
@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
||||
<?php
|
||||
include('encoding_tests.inc');
|
||||
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1251.txt', 'CP1251');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x98", "BAD+98", "CP1251", "UTF-8");
|
||||
convertInvalidString("\x12\x34", "U+1234", "UTF-16BE", "CP1251");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested CP1251 -> UTF-16BE
|
||||
Tested UTF-16BE -> CP1251
|
||||
Done!
|
||||
|
@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
||||
<?php
|
||||
include('encoding_tests.inc');
|
||||
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1252.txt', 'CP1252');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x81", "BAD+81", "CP1252", "UTF-8");
|
||||
convertInvalidString("\x9D", "BAD+9D", "CP1252", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested CP1252 -> UTF-16BE
|
||||
Tested UTF-16BE -> CP1252
|
||||
Done!
|
||||
|
@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
||||
<?php
|
||||
include('encoding_tests.inc');
|
||||
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1254.txt', 'CP1254');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x81", "BAD+81", "CP1254", "UTF-8");
|
||||
convertInvalidString("\x9E", "BAD+9E", "CP1254", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested CP1254 -> UTF-16BE
|
||||
Tested UTF-16BE -> CP1254
|
||||
Done!
|
||||
|
@ -292,6 +292,16 @@ testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50222');
|
||||
|
||||
echo "Invalid Unicode is flagged when converting to CP5022x\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "CP50220", "UTF-8");
|
||||
convertInvalidString("\x80", "BAD+80", "CP50221", "UTF-8");
|
||||
convertInvalidString("\x80", "BAD+80", "CP50222", "UTF-8");
|
||||
convertInvalidString("\x1B\$B1", "BAD+31", "CP50220", "UTF-8");
|
||||
convertInvalidString("\x1B\$B1", "BAD+31", "CP50221", "UTF-8");
|
||||
convertInvalidString("\x1B\$B1", "BAD+31", "CP50222", "UTF-8");
|
||||
|
||||
echo "Long error markers OK\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
ASCII support OK
|
||||
@ -299,3 +309,4 @@ JIS X 0201 support OK
|
||||
CP932 support OK
|
||||
Folding of fullwidth katakana for CP50220 OK
|
||||
Invalid Unicode is flagged when converting to CP5022x
|
||||
Long error markers OK
|
||||
|
@ -108,8 +108,16 @@ echo "CP51932 verification and conversion works on all invalid characters\n";
|
||||
findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
|
||||
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
|
||||
echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "CP51932", "UTF-8");
|
||||
convertInvalidString("\xFE\xFF", "BAD+FEFF", "CP51932", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
CP51932 verification and conversion works on all valid characters
|
||||
CP51932 verification and conversion works on all invalid characters
|
||||
Unicode -> CP51932 conversion works on all invalid codepoints
|
||||
Done!
|
||||
|
@ -105,8 +105,18 @@ echo "CP932 verification and conversion works on all invalid characters\n";
|
||||
|
||||
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
|
||||
echo "Unicode -> CP932 conversion works on all invalid codepoints\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "CP932", "UTF-8");
|
||||
convertInvalidString("\xEA", "BAD+EA", "CP932", "UTF-8");
|
||||
convertInvalidString("\x81\x20", "BAD+8120", "CP932", "UTF-8");
|
||||
convertInvalidString("\xEA\xA9", "W932+742B", "CP932", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
CP932 verification and conversion works on all valid characters
|
||||
CP932 verification and conversion works on all invalid characters
|
||||
Unicode -> CP932 conversion works on all invalid codepoints
|
||||
Done!
|
||||
|
@ -324,7 +324,15 @@ findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF),
|
||||
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'CP936', '%');
|
||||
echo "Tested UTF-16BE -> CP936\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x81\x20", "BAD+8120", "CP936", "UTF-8");
|
||||
convertInvalidString("\x81\x7F", "BAD+817F", "CP936", "UTF-8");
|
||||
convertInvalidString("\xFE\xFF", "BAD+FEFF", "CP936", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested CP936 -> UTF-16BE
|
||||
Tested UTF-16BE -> CP936
|
||||
Done!
|
||||
|
@ -75,7 +75,15 @@ testAllValidChars($fromUnicode, 'UTF-16BE', 'CP950', false);
|
||||
findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
|
||||
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'CP950', '%');
|
||||
echo "Tested UTF-16BE -> CP950\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "CP950", "UTF-8");
|
||||
convertInvalidString("\x26\x09", "U+2609", "UTF-16BE", "CP950");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested CP950 -> UTF-16BE
|
||||
Tested UTF-16BE -> CP950
|
||||
Done!
|
||||
|
@ -10,7 +10,16 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
||||
<?php
|
||||
include('encoding_tests.inc');
|
||||
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/EUC-CN.txt', 'EUC-CN');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "EUC-CN", "UTF-8");
|
||||
convertInvalidString("\xA1\x50", "BAD+A150", "EUC-CN", "UTF-8");
|
||||
convertInvalidString("\xF7\xFF", "BAD+F7FF", "EUC-CN", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested EUC-CN -> UTF-16BE
|
||||
Tested UTF-16BE -> EUC-CN
|
||||
Done!
|
||||
|
@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
||||
<?php
|
||||
include('encoding_tests.inc');
|
||||
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/EUC-KR.txt', 'EUC-KR');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "EUC-KR", "UTF-8");
|
||||
convertInvalidString("\xA7\xF0", "?+A7F0", "EUC-KR", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested EUC-KR -> UTF-16BE
|
||||
Tested UTF-16BE -> EUC-KR
|
||||
Done!
|
||||
|
@ -25,7 +25,20 @@ findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF),
|
||||
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'EUC-TW', '%');
|
||||
echo "Tested UTF-16BE -> EUC-TW\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "EUC-TW", "UTF-8");
|
||||
convertInvalidString("\x8E\x20", "BAD+8E20", "EUC-TW", "UTF-8");
|
||||
convertInvalidString("\x8E\xA1\x20", "BAD+8EA120", "EUC-TW", "UTF-8");
|
||||
convertInvalidString("\x8E\xA1\xA1\x20", "BAD+A1A120", "EUC-TW", "UTF-8");
|
||||
convertInvalidString("\x8E\xA2\xA3\x20", "BAD+A2A320", "EUC-TW", "UTF-8");
|
||||
convertInvalidString("\x8F", "BAD+8F", "EUC-TW", "UTF-8");
|
||||
convertInvalidString("\xA1\x50", "BAD+A150", "EUC-TW", "UTF-8");
|
||||
convertInvalidString("\xFD\xCC", "?+FDCC", "EUC-TW", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested EUC-TW -> UTF-16BE
|
||||
Tested UTF-16BE -> EUC-TW
|
||||
Done!
|
||||
|
@ -69,9 +69,17 @@ echo "Unicode -> EUC-JP-2004 conversion works on all valid characters\n";
|
||||
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
|
||||
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'EUC-JP-2004', '%');
|
||||
echo "Unicode -> EUC-JP-2004 conversion works on all invalid characters\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "EUC-JP-2004", "UTF-8");
|
||||
convertInvalidString("\xFE\xFF", "BAD+FEFF", "EUC-JP-2004", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
EUC-JP-2004 verification and conversion works for all valid characters
|
||||
EUC-JP-2004 verification and conversion rejects all invalid characters
|
||||
Unicode -> EUC-JP-2004 conversion works on all valid characters
|
||||
Unicode -> EUC-JP-2004 conversion works on all invalid characters
|
||||
Done!
|
||||
|
@ -76,6 +76,13 @@ for ($cp = 0; $cp <= 0xFFFF; $cp++) {
|
||||
}
|
||||
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-32BE', 'EUC-JP', '%');
|
||||
echo "Unicode -> EUC-JP conversion works on all invalid characters\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "EUC-JP", "UTF-8");
|
||||
convertInvalidString("\xFE\xFF", "BAD+FEFF", "EUC-JP", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Encoding verification and conversion work for all valid characters
|
||||
@ -83,3 +90,4 @@ Encoding verification and conversion work for all invalid characters
|
||||
Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly
|
||||
Unicode -> EUC-JP conversion works on all valid characters
|
||||
Unicode -> EUC-JP conversion works on all invalid characters
|
||||
Done!
|
||||
|
@ -300,8 +300,15 @@ echo "Tested GB18030 4-byte characters <-> UTF-16BE\n";
|
||||
testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030', false);
|
||||
echo "Tested UTF-16BE -> GB18030 (1 and 2 byte characters)\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x81\x30\x81\xFF", "BAD+3081FF", "GB18030", "UTF-8");
|
||||
convertInvalidString("\xE3\x32\x9A\x36", "BAD+329A36", "GB18030", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested GB18030 (1 and 2 byte characters) -> UTF-16BE
|
||||
Tested GB18030 4-byte characters <-> UTF-16BE
|
||||
Tested UTF-16BE -> GB18030 (1 and 2 byte characters)
|
||||
Done!
|
||||
|
@ -118,6 +118,13 @@ while (!empty($badChars)) {
|
||||
|
||||
echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("~A", "BAD+41", "HZ", "UTF-8");
|
||||
convertInvalidString("\x80", "BAD+80", "HZ", "UTF-8");
|
||||
convertInvalidString("~{\x22\x21", "?+2221", "HZ", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested ASCII -> HZ
|
||||
@ -127,3 +134,4 @@ Tested valid ~ escapes
|
||||
Tested all invalid ~ escapes
|
||||
Tested HZ -> UTF-16BE (for all GB2312 characters)
|
||||
Tested UTF-16BE -> HZ (for all GB2312 characters)
|
||||
Done!
|
||||
|
@ -314,6 +314,12 @@ for ($i = 0; $i < 100; $i++) {
|
||||
testValid($testString, $convertsTo, false);
|
||||
}
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-2004", "UTF-8");
|
||||
convertInvalidString("\x1B\$(X", "BAD+1B2428", "ISO-2022-JP-2004", "UTF-8"); // Invalid escape
|
||||
convertInvalidString("\x1B\$B!", "BAD+21", "ISO-2022-JP-2004", "UTF-8"); // Truncated character
|
||||
|
||||
echo "All done!\n";
|
||||
|
||||
?>
|
||||
|
@ -187,6 +187,16 @@ for ($i = 0; $i <= 0xFF; $i++) {
|
||||
|
||||
echo "All escape sequences work as expected\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xE0", "BAD+E0", "JIS", "UTF-8");
|
||||
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP", "UTF-8");
|
||||
convertInvalidString("\x1B\$(X", "BAD+1B\$(X", "JIS", "UTF-8"); // Invalid escape
|
||||
convertInvalidString("\x1B\$(X", "BAD+1B\$(X", "ISO-2022-JP", "UTF-8"); // Invalid escape
|
||||
convertInvalidString("\x1B\$B!", "BAD+21", "JIS", "UTF-8"); // Truncated character
|
||||
convertInvalidString("\x1B\$B!", "BAD+21", "ISO-2022-JP", "UTF-8"); // Truncated character
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
ASCII support OK
|
||||
@ -194,3 +204,4 @@ JIS X 0201 support OK
|
||||
JIS X 0208 support OK
|
||||
JIS X 0212 support OK
|
||||
All escape sequences work as expected
|
||||
Done!
|
||||
|
@ -199,8 +199,21 @@ foreach (array_keys($truncatedChars) as $truncated)
|
||||
|
||||
echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-KDDI", "UTF-8");
|
||||
// Invalid escapes:
|
||||
convertInvalidString("\x1B", "BAD+1B", "ISO-2022-JP-KDDI", "UTF-8");
|
||||
convertInvalidString("\x1B.", "BAD+1B2E", "ISO-2022-JP-KDDI", "UTF-8");
|
||||
convertInvalidString("\x1B\$", "BAD+1B24", "ISO-2022-JP-KDDI", "UTF-8");
|
||||
convertInvalidString("\x1B\$.", "BAD+1B242E", "ISO-2022-JP-KDDI", "UTF-8");
|
||||
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-KDDI", "UTF-8");
|
||||
convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-KDDI", "UTF-8"); // 0x9F does not start any 2-byte character
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
ASCII support OK
|
||||
JIS X 0201 support OK
|
||||
JIS X 0208 (with MS extensions) and KDDI emoji support OK
|
||||
Done!
|
||||
|
@ -198,9 +198,17 @@ foreach (array_keys($truncatedChars) as $truncated)
|
||||
|
||||
echo "UDC support OK\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-MS", "UTF-8");
|
||||
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8"); // Invalid escape
|
||||
convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-MS", "UTF-8"); // 0x9F does not start any 2-byte character
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
ASCII support OK
|
||||
JIS X 0201 support OK
|
||||
JIS X 0208 (with MS extensions) support OK
|
||||
UDC support OK
|
||||
Done!
|
||||
|
@ -96,9 +96,18 @@ testValid("\x0E\x0E\x0F\x0E\x0Fabc", "\x00a\x00b\x00c", false);
|
||||
|
||||
echo "Escapes behave as expected\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x1B", "BAD+1B", "ISO-2022-KR", "UTF-8");
|
||||
convertInvalidString("\x1B$", "BAD+1B24", "ISO-2022-KR", "UTF-8");
|
||||
convertInvalidString("\x1B$)", "BAD+1B2429", "ISO-2022-KR", "UTF-8");
|
||||
convertInvalidString("\x1B$)C\x0E\x7C\x84", "BAD+7C84", "ISO-2022-KR", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Empty string OK
|
||||
ASCII support OK
|
||||
KS X 1001 support OK
|
||||
Escapes behave as expected
|
||||
Done!
|
||||
|
@ -15,6 +15,13 @@ for ($n = 1; $n <= 16; $n++) {
|
||||
continue;
|
||||
testEncodingFromUTF16ConversionTable(__DIR__ . "/data/8859-$n.txt", "ISO-8859-{$n}");
|
||||
}
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xAE", "BAD+AE", "ISO8859-7", "UTF-8");
|
||||
convertInvalidString("\xFF", "BAD+FF", "ISO8859-8", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested ISO-8859-1 -> UTF-16BE
|
||||
@ -45,3 +52,4 @@ Tested ISO-8859-15 -> UTF-16BE
|
||||
Tested UTF-16BE -> ISO-8859-15
|
||||
Tested ISO-8859-16 -> UTF-16BE
|
||||
Tested UTF-16BE -> ISO-8859-16
|
||||
Done!
|
||||
|
@ -60,9 +60,18 @@ echo "Unicode -> SJIS-2004 conversion works on all valid characters\n";
|
||||
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
|
||||
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-2004', '%');
|
||||
echo "Unicode -> SJIS-2004 conversion works on all invalid characters\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "SJIS-2004", "UTF-8");
|
||||
convertInvalidString("\x81\x20", "BAD+8120", "SJIS-2004", "UTF-8");
|
||||
convertInvalidString("\xFC\xF5", "BAD+FCF5", "SJIS-2004", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
SJIS-2004 verification and conversion works for all valid characters
|
||||
SJIS-2004 verification and conversion rejects all invalid characters
|
||||
Unicode -> SJIS-2004 conversion works on all valid characters
|
||||
Unicode -> SJIS-2004 conversion works on all invalid characters
|
||||
Done!
|
||||
|
@ -59,9 +59,18 @@ echo "Unicode -> SJIS conversion works on all valid characters\n";
|
||||
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
|
||||
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'Shift-JIS', '%');
|
||||
echo "Unicode -> SJIS conversion works on all invalid characters\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "Shift-JIS", "UTF-8");
|
||||
convertInvalidString("\x81\x20", "BAD+8120", "Shift-JIS", "UTF-8");
|
||||
convertInvalidString("\xEA\xA9", "JIS+742B", "Shift-JIS", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
SJIS verification and conversion works on all valid characters
|
||||
SJIS verification and conversion works on all invalid characters
|
||||
Unicode -> SJIS conversion works on all valid characters
|
||||
Unicode -> SJIS conversion works on all invalid characters
|
||||
Done!
|
||||
|
@ -277,6 +277,13 @@ function testSJISVariant($validChars, $nonInvertible, $encoding) {
|
||||
|
||||
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-32BE', $encoding, '%');
|
||||
echo "Unicode -> $encoding conversion works on all invalid codepoints\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", $encoding, "UTF-8");
|
||||
convertInvalidString("\x81\x20", "BAD+8120", $encoding, "UTF-8");
|
||||
convertInvalidString("\xEA\xA9", "W932+742B", $encoding, "UTF-8");
|
||||
mb_substitute_character(0x25); // '%'
|
||||
}
|
||||
|
||||
testSJISVariant($docomo, $nonInvertibleDocomo, 'SJIS-Mobile#DOCOMO');
|
||||
|
@ -86,9 +86,18 @@ echo "Unicode -> SJIS-mac conversion works on all valid characters\n";
|
||||
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
|
||||
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-mac', '%');
|
||||
echo "Unicode -> SJIS-mac conversion works on all invalid characters\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x81", "BAD+81", "SJIS-mac", "UTF-8");
|
||||
convertInvalidString("\x81\x20", "BAD+8120", "SJIS-mac", "UTF-8");
|
||||
convertInvalidString("\xED\x9F", "W932+7A21", "SJIS-mac", "UTF-8");
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
MacJapanese verification and conversion works on all valid characters
|
||||
MacJapanese verification and conversion rejects all invalid characters
|
||||
Unicode -> SJIS-mac conversion works on all valid characters
|
||||
Unicode -> SJIS-mac conversion works on all invalid characters
|
||||
Done!
|
||||
|
21
ext/mbstring/tests/ucs2_encoding.phpt
Normal file
21
ext/mbstring/tests/ucs2_encoding.phpt
Normal file
@ -0,0 +1,21 @@
|
||||
--TEST--
|
||||
Test verification and conversion of UCS-2 text
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
include('encoding_tests.inc'); // supplies convertInvalidString(); judging by the calls below its arguments are (input, expected output, source encoding, target encoding) - TODO confirm in encoding_tests.inc
|
||||
|
||||
// Test "long" illegal character markers: in this mode the expected outputs below contain textual markers ("U+XXXX", "BAD+XXXX") in place of input that could not be converted
|
||||
mb_substitute_character("long");
|
||||
|
||||
convertInvalidString("\x00\x01\x02\x03", "\x00U\x00+\x001\x000\x002\x000\x003", "UTF-32BE", "UCS-2BE"); // input is codepoint 0x10203 as UTF-32BE; the expected output spells "U+10203" in UCS-2BE
|
||||
convertInvalidString("\x11", "BAD+1100", "UCS-2BE", "UTF-8"); // a lone byte (UCS-2 code units are two bytes); expected marker is BAD+1100
|
||||
|
||||
convertInvalidString("\x00\x01\x02\x03", "U\x00+\x001\x000\x002\x000\x003\x00", "UTF-32BE", "UCS-2LE"); // same codepoint as above; the expected "U+10203" marker is spelled in UCS-2LE this time
|
||||
convertInvalidString("\x11", "BAD+11", "UCS-2LE", "UTF-8"); // a lone byte again; for UCS-2LE the expected marker is BAD+11
|
||||
|
||||
echo "Done!"; // the only output compared against the EXPECT section of this test
|
||||
?>
|
||||
--EXPECT--
|
||||
Done!
|
25
ext/mbstring/tests/ucs4_encoding.phpt
Normal file
25
ext/mbstring/tests/ucs4_encoding.phpt
Normal file
@ -0,0 +1,25 @@
|
||||
--TEST--
|
||||
Test verification and conversion of UCS-4 text
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
include('encoding_tests.inc'); // supplies convertInvalidString(); judging by the calls below its arguments are (input, expected output, source encoding, target encoding) - TODO confirm in encoding_tests.inc
|
||||
|
||||
// Test "long" illegal character markers: in this mode the expected outputs below contain textual markers ("U+XXXX", "BAD+XXXX", "?+X") in place of input that could not be converted
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x6F\x00\x00\x00", "U+6F000000", "UCS-4BE", "UTF-8"); // bytes read big-endian give 0x6F000000; expected marker is U+6F000000
|
||||
convertInvalidString("\x70\x00\x00\x00", "?+0", "UCS-4BE", "UTF-8"); // 0x70000000; expected marker is "?+0" (why this prefix is chosen is not visible in this test)
|
||||
convertInvalidString("\x78\x00\x00\x01", "BAD+1", "UCS-4BE", "UTF-8"); // 0x78000001; expected marker is BAD+1
|
||||
convertInvalidString("\x80\x01\x02\x03", "BAD+10203", "UCS-4BE", "UTF-8"); // 0x80010203; expected marker is BAD+10203
|
||||
convertInvalidString("\x00\x01\x02", "BAD+10200", "UCS-4BE", "UTF-8"); // only three bytes of a four-byte unit; expected marker is BAD+10200
|
||||
|
||||
convertInvalidString("\x00\x00\x00\x6F", "U+6F000000", "UCS-4LE", "UTF-8"); // little-endian byte order of the 0x6F000000 case
|
||||
convertInvalidString("\x00\x00\x00\x70", "?+0", "UCS-4LE", "UTF-8"); // little-endian byte order of the 0x70000000 case
|
||||
convertInvalidString("\x01\x00\x00\x78", "BAD+1", "UCS-4LE", "UTF-8"); // little-endian byte order of the 0x78000001 case
|
||||
convertInvalidString("\x02\x01\x00", "BAD+102", "UCS-4LE", "UTF-8"); // only three bytes of a four-byte unit; expected marker is BAD+102
|
||||
|
||||
echo "Done!"; // the only output compared against the EXPECT section of this test
|
||||
?>
|
||||
--EXPECT--
|
||||
Done!
|
@ -10,6 +10,11 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
|
||||
<?php
|
||||
include('encoding_tests.inc');
|
||||
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP949.txt', 'UHC');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x80", "BAD+80", "UHC", "UTF-8");
|
||||
convertInvalidString("\xA7\xF0", "?+A7F0", "UHC", "UTF-8");
|
||||
?>
|
||||
--EXPECT--
|
||||
Tested UHC -> UTF-16BE
|
||||
|
@ -192,6 +192,14 @@ testValid("123&" . mBase64(utf16BE("123")) . "-abc&" . mBase64(utf16BE("
|
||||
|
||||
echo "Identification and conversion of valid text is working... perfect!\n";
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x10", "BAD+10", "UTF7-IMAP", "UTF-8");
|
||||
convertInvalidString("\x80", "BAD+80", "UTF7-IMAP", "UTF-8");
|
||||
convertInvalidString("abc&", "abcBAD+0", "UTF7-IMAP", "UTF-8"); // The & starts a Base-64 coded section, which is OK... but there's no data in it, so the 'bad character' is 'zero'
|
||||
convertInvalidString("&**-", "BAD+2A*-", "UTF7-IMAP", "UTF-8"); // When we hit the first bad byte in a Base-64 coded section, it drops us back into the default mode, so the following characters are literal
|
||||
|
||||
echo "Done!\n";
|
||||
?>
|
||||
--EXPECT--
|
||||
Identification passes on empty string... good start!
|
||||
@ -204,3 +212,4 @@ Testing valid strings which use '&-' for '&'... good!
|
||||
Identification fails when Base64 sections contain non-Base64 bytes... right!
|
||||
Identification fails when UTF-16 text is invalid... no sweat!
|
||||
Identification and conversion of valid text is working... perfect!
|
||||
Done!
|
||||
|
@ -799,6 +799,44 @@ $invalid = array(
|
||||
|
||||
testInvalidCodepoints($invalid, 'UTF-8');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xF4\x90\x80\x80", "BAD+F490BAD+80BAD+80", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF7\x80\x80\x80", "BAD+F7BAD+80BAD+80BAD+80", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xED\xA0\x80", "BAD+EDA0BAD+80", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xED\xBF\xBF", "BAD+EDBFBAD+BF", "UTF-8", "ASCII");
|
||||
// Truncated:
|
||||
convertInvalidString("\xDF", "BAD+DF", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xEF", "BAD+EF", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xEF\xBF", "BAD+EFBF", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0", "BAD+F0", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0\xBF", "BAD+F0BF", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0\xBF\xBF", "BAD+F0BFBF", "UTF-8", "ASCII");
|
||||
// Multi-byte character ends too early and goes back to ASCII:
|
||||
convertInvalidString("\xDFA", "BAD+DFA", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xEFA", "BAD+EFA", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xEF\xBFA", "BAD+EFBFA", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0A", "BAD+F0A", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0\xBFA", "BAD+F0BFA", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0\xBF\xBFA", "BAD+F0BFBFA", "UTF-8", "ASCII");
|
||||
// Multi-byte character ends too early and goes to a byte which is not ASCII, nor could
|
||||
// it possibly start a valid multi-byte character
|
||||
convertInvalidString("\xEF\xBF\xC0", "BAD+EFBFC0", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0\xBF\xBF\xC0", "BAD+BFBFC0", "UTF-8", "ASCII");
|
||||
|
||||
convertInvalidString("\xDF\xDF\xBF", "BAD+DFU+7FF", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xEF\xBF\xDF\xBF", "BAD+EFBFU+7FF", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0\xBF\xBF\xDF\xBF", "BAD+F0BFBFU+7FF", "UTF-8", "ASCII");
|
||||
|
||||
convertInvalidString("\x80", "BAD+80", "UTF-8", "ASCII");
|
||||
convertInvalidString(".\x80", ".BAD+80", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xDF\xBF\x80", "U+7FFBAD+80", "UTF-8", "ASCII");
|
||||
|
||||
convertInvalidString("\xC1\xBF", "BAD+C1BAD+BF", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xE0\x9F\xBF", "BAD+E09FBAD+BF", "UTF-8", "ASCII");
|
||||
convertInvalidString("\xF0\x8F\xBF\xBF", "BAD+F08FBAD+BFBAD+BF", "UTF-8", "ASCII");
|
||||
mb_substitute_character(0x25); // '%'
|
||||
|
||||
echo "== UTF-16 ==\n";
|
||||
|
||||
testValidCodepoints("UTF-16");
|
||||
@ -849,6 +887,29 @@ testInvalidCodepoints($invalid, 'UTF-16LE');
|
||||
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
|
||||
testInvalidString("A\x00\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\xDC\x01\xD8\x02", "BAD+DC01BAD+D802", "UTF-16", "ASCII");
|
||||
convertInvalidString("\xDC\x01\xD8\x02", "BAD+DC01BAD+D802", "UTF-16BE", "ASCII");
|
||||
convertInvalidString("\x01\xDC\x02\xD8", "BAD+DC01BAD+D802", "UTF-16LE", "ASCII");
|
||||
convertInvalidString("\xDD\x11\xD9\x13", "BAD+DD11BAD+D913", "UTF-16BE", "ASCII");
|
||||
|
||||
convertInvalidString("\xD8\x01\x00A", "BAD+D801A", "UTF-16", "ASCII");
|
||||
convertInvalidString("\xD8\x01\x00A", "BAD+D801A", "UTF-16BE", "ASCII");
|
||||
convertInvalidString("\x01\xD8A\x00", "BAD+D801A", "UTF-16LE", "ASCII");
|
||||
|
||||
convertInvalidString("\xD8\x01", "BAD+D801", "UTF-16", "ASCII");
|
||||
convertInvalidString("\xD8\x01", "BAD+D801", "UTF-16BE", "ASCII");
|
||||
convertInvalidString("\x01\xD8", "BAD+D801", "UTF-16LE", "ASCII");
|
||||
|
||||
convertInvalidString("\x00", "BAD+0", 'UTF-16', 'ASCII');
|
||||
convertInvalidString("\x00", "BAD+0", 'UTF-16BE', 'ASCII');
|
||||
convertInvalidString("\x00", "BAD+0", 'UTF-16LE', 'ASCII');
|
||||
convertInvalidString("\x00A\x01", "ABAD+1", 'UTF-16', 'ASCII');
|
||||
convertInvalidString("\x00A\x01", "ABAD+1", 'UTF-16BE', 'ASCII');
|
||||
convertInvalidString("A\x00\x01", "ABAD+1", 'UTF-16LE', 'ASCII');
|
||||
mb_substitute_character(0x25); // '%'
|
||||
|
||||
// TODO: test handling of UTF-16 BOM
|
||||
|
||||
echo "== UTF-32 ==\n";
|
||||
@ -905,6 +966,24 @@ testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
|
||||
testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
|
||||
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
|
||||
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString("\x00\x01\x01", "BAD+101", "UTF-32", "ASCII");
|
||||
convertInvalidString("\x00\x01\x01", "BAD+101", "UTF-32BE", "ASCII");
|
||||
convertInvalidString("\x01\x01\x00", "BAD+101", "UTF-32LE", "ASCII");
|
||||
|
||||
convertInvalidString("\x01", "BAD+1", "UTF-32", "ASCII");
|
||||
convertInvalidString("\x01", "BAD+1", "UTF-32BE", "ASCII");
|
||||
convertInvalidString("\x01", "BAD+1", "UTF-32LE", "ASCII");
|
||||
|
||||
convertInvalidString("\x00\x11\x00\x00", "BAD+110000", "UTF-32", "ASCII");
|
||||
convertInvalidString("\x00\x11\x00\x00", "BAD+110000", "UTF-32BE", "ASCII");
|
||||
convertInvalidString("\x00\x00\x11\x00", "BAD+110000", "UTF-32LE", "ASCII");
|
||||
|
||||
convertInvalidString("\x00\x00\xd8\x00", "BAD+D800", "UTF-32", "ASCII");
|
||||
convertInvalidString("\x00\x00\xd8\x00", "BAD+D800", "UTF-32BE", "ASCII");
|
||||
convertInvalidString("\x00\xd8\x00\x00", "BAD+D800", "UTF-32LE", "ASCII");
|
||||
mb_substitute_character(0x25); // '%'
|
||||
|
||||
// TODO: test handling of UTF-32 BOM
|
||||
|
||||
echo "== UTF-7 ==\n";
|
||||
@ -1012,6 +1091,28 @@ $encoded = encode("\x12\x34", 'UTF-16BE'); // 3 Base64 bytes, 2 bits of padding.
|
||||
$corrupted = substr($encoded, 0, 2) . chr(ord($encoded[2]) + 1);
|
||||
testInvalidString('+' . $corrupted . '-', "\x00\x00\x12\x34\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
|
||||
|
||||
// Test "long" illegal character markers
|
||||
mb_substitute_character("long");
|
||||
convertInvalidString('+' . rawEncode("\xDC\x01\xD8\x02") . '-', "BAD+DC01BAD+D802", "UTF-7", "UTF-8");
|
||||
convertInvalidString('+' . rawEncode("\xDC\x01\xD8\x02"), "BAD+DC01BAD+D802", "UTF-7", "UTF-8");
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\xDC\x01\xD8\x02") . '-', ".BAD+DC01BAD+D802", "UTF-7", "UTF-8");
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\xDC\x01\xD8\x02"), ".BAD+DC01BAD+D802", "UTF-7", "UTF-8");
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xDC\x01\xD8\x02") . '-', "..BAD+DC01BAD+D802", "UTF-7", "UTF-8");
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xDC\x01\xD8\x02"), "..BAD+DC01BAD+D802", "UTF-7", "UTF-8");
|
||||
|
||||
convertInvalidString('+' . rawEncode("\xD8\x01\x00A") . '-', "BAD+D801A", 'UTF-7', 'UTF-8');
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\x00A") . '-', ".BAD+D801A", 'UTF-7', 'UTF-8');
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\x00A") . '-', "..BAD+D801A", 'UTF-7', 'UTF-8');
|
||||
|
||||
convertInvalidString('+' . rawEncode("\xD8\x01\xD9\x02") . '-', "BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
|
||||
convertInvalidString('+' . rawEncode("\xD8\x01\xD9\x02"), "BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\xD9\x02") . '-', ".BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\xD9\x02"), ".BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\xD9\x02") . '-', "..BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
|
||||
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\xD9\x02"), "..BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
|
||||
|
||||
convertInvalidString('+' . rawEncode("\x01") . '-', "BAD+100", 'UTF-7', 'UTF-8');
|
||||
|
||||
echo "Done!\n";
|
||||
|
||||
?>
|
||||
|
Loading…
Reference in New Issue
Block a user