Test behavior of 'long' illegal character markers

After mb_substitute_character("long"), mbstring will respond to
erroneous input by inserting 'long' error markers into the output.
Depending on the situation, these error markers take one of the
following forms: BAD+XXXX (for general bad input), U+XXXX (when the
input is OK, but it converts to Unicode codepoints which cannot be
represented in the output encoding), or an encoding-specific marker
like JISX+XXXX or W932+XXXX.

We have almost no tests for this feature. Add a bunch of tests to
ensure that all our legacy encoding handlers work in a reasonable
way when 'long' error markers are enabled.
This commit is contained in:
Alex Dowad 2021-07-27 13:21:48 +02:00
parent f6f0506c84
commit 51b9d7a5e1
41 changed files with 539 additions and 67 deletions

View File

@ -151,9 +151,7 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
filter->status = 1;
filter->cache = c;
} else {
w = c & MBFL_WCSGROUP_MASK;
w |= MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(w, filter->data));
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
}
break;
@ -215,7 +213,7 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
{
if (filter->status) {
(*filter->filter_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter);
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
}
if (filter->flush_function) {

View File

@ -148,7 +148,7 @@ int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
filter->cache = (c1 << 8) + c - 0xA1;
} else {
filter->status = filter->cache = 0;
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
w = 0x8E0000 | ((c1 + 0xA1) << 8) | c | MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(w, filter->data));
}
break;
@ -179,7 +179,7 @@ int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
CK((*filter->output_function)(w, filter->data));
} else {
filter->status = filter->cache = 0;
w = (c1 << 8) | c | 0x8e0000 | MBFL_WCSGROUP_THROUGH;
w = ((c1 + 0xA1A1) << 8) | c | MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(w, filter->data));
}
break;

View File

@ -174,8 +174,16 @@ int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter)
{
if (filter->status & 0xF) {
/* 2-byte character or escape sequence was truncated */
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
if (filter->status == 2) {
CK((*filter->output_function)(0x1B | MBFL_WCSGROUP_THROUGH, filter->data));
} else if (filter->status == 3) {
CK((*filter->output_function)(0x1B24 | MBFL_WCSGROUP_THROUGH, filter->data));
} else if (filter->status == 4) {
CK((*filter->output_function)(0x1B2429 | MBFL_WCSGROUP_THROUGH, filter->data));
} else {
/* 2-byte character was truncated */
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
}
}
if (filter->flush_function) {

View File

@ -241,7 +241,13 @@ int mbfl_filt_conv_2022jp_mobile_wchar(int c, mbfl_convert_filter *filter)
static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter)
{
if (filter->status & 0xF) {
mbfl_filt_conv_illegal_output(filter->cache, filter);
if ((filter->status & 0xF) == 2) {
(*filter->output_function)(0x1B | MBFL_WCSGROUP_THROUGH, filter->data);
} else if ((filter->status & 0xF) == 3) {
(*filter->output_function)(0x1B24 | MBFL_WCSGROUP_THROUGH, filter->data);
} else {
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
}
}
if (filter->flush_function) {

View File

@ -155,9 +155,7 @@ retry:
w = 0;
}
if (w <= 0) {
w = (c1 << 8) | c;
w &= MBFL_WCSPLANE_MASK;
w |= MBFL_WCSPLANE_JIS0208;
w = (c1 << 8) | c | MBFL_WCSPLANE_JIS0208;
}
} else {
if (s >= 0 && s < jisx0212_ucs_table_size) {
@ -166,16 +164,12 @@ retry:
w = 0;
}
if (w <= 0) {
w = (c1 << 8) | c;
w &= MBFL_WCSPLANE_MASK;
w |= MBFL_WCSPLANE_JIS0212;
w = (c1 << 8) | c | MBFL_WCSPLANE_JIS0212;
}
}
CK((*filter->output_function)(w, filter->data));
} else {
w = (c1 << 8) | c;
w &= MBFL_WCSGROUP_MASK;
w |= MBFL_WCSGROUP_THROUGH;
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(w, filter->data));
}
break;

View File

@ -188,14 +188,14 @@ int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
s1 = c1 - 0x80;
s2 = c - 0x80;
} else {
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
break;
}
} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
SJIS_DECODE(c1, c, s1, s2);
} else {
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
break;
}
} else { /* ISO-2022-JP-2004 */
@ -203,7 +203,7 @@ int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
s1 = c1;
s2 = c;
} else {
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
break;
}
}

View File

@ -195,7 +195,7 @@ int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
if (n >= 0xD800 && n <= 0xDBFF) {
/* Wrong; that's the first half of a surrogate pair, not the second */
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)(0xD800 | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
filter->cache = n & 0x3FF;
filter->status = 2;
} else if (n >= 0xDC00 && n <= 0xDFFF) {
@ -203,7 +203,7 @@ int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
} else {
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)(0xD800 | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
}
@ -269,7 +269,7 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
case 3:
n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
if (n >= 0xD800 && n <= 0xDBFF) {
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)(0xD800 | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
filter->cache = n & 0x3FF;
filter->status = 2;
} else if (n >= 0xDC00 && n <= 0xDFFF) {
@ -277,7 +277,7 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
} else {
CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)(0xD800 | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
CK((*filter->output_function)(n, filter->data));
filter->status = 0;
}
@ -316,7 +316,11 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
if (status) {
/* Input string was truncated */
CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
if (status == 1) {
CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
} else if (status == 2) {
CK((*filter->output_function)(0xD800 | cache | MBFL_WCSGROUP_THROUGH, filter->data));
}
}
if (filter->flush_function) {

View File

@ -107,7 +107,11 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
if (filter->cache) {
/* Either we were expecting the 2nd half of a surrogate pair which
* never came, or else the last Base64 data was not padded with zeroes */
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
if (filter->cache & 0xfff0000) {
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
} else {
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
}
}
if (c == '-') {
if (filter->status == 1) { /* "+-" -> "+" */
@ -150,25 +154,32 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
n = (n & 0x3) << 14;
filter->status = 5;
if (s >= 0xd800 && s < 0xdc00) {
/* 1st part of surrogate pair */
if (filter->cache & 0xfff0000) {
/* We were waiting for the 2nd part of a surrogate pair */
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
}
s = (((s & 0x3ff) << 16) + 0x400000) | n;
filter->cache = s;
} else if (s >= 0xdc00 && s < 0xe000) {
s &= 0x3ff;
s |= (filter->cache & 0xfff0000) >> 6;
filter->cache = n;
if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
CK((*filter->output_function)(s, filter->data));
} else { /* illegal character */
/* 2nd part of surrogate pair */
if (filter->cache & 0xfff0000) {
s &= 0x3ff;
s |= (filter->cache & 0xfff0000) >> 6;
filter->cache = n;
if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
CK((*filter->output_function)(s, filter->data));
} else { /* illegal character */
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
}
} else {
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
filter->cache = n;
}
} else {
if (filter->cache & 0xfff0000) {
/* We were waiting for the 2nd part of a surrogate pair */
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
}
filter->cache = n;
CK((*filter->output_function)(s, filter->data));
@ -190,23 +201,29 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
if (s >= 0xd800 && s < 0xdc00) {
if (filter->cache & 0xfff0000) {
/* We were waiting for the 2nd part of a surrogate pair */
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
}
s = (((s & 0x3ff) << 16) + 0x400000) | n;
filter->cache = s;
} else if (s >= 0xdc00 && s < 0xe000) {
s &= 0x3ff;
s |= (filter->cache & 0xfff0000) >> 6;
filter->cache = n;
if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
CK((*filter->output_function)(s, filter->data));
} else { /* illegal character */
/* 2nd part of surrogate pair */
if (filter->cache & 0xfff0000) {
s &= 0x3ff;
s |= (filter->cache & 0xfff0000) >> 6;
filter->cache = n;
if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
CK((*filter->output_function)(s, filter->data));
} else { /* illegal character */
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
}
} else {
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
filter->cache = n;
}
} else {
if (filter->cache & 0xfff0000) {
/* We were waiting for the 2nd part of a surrogate pair */
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
}
filter->cache = n;
CK((*filter->output_function)(s, filter->data));
@ -223,25 +240,28 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
if (s >= 0xd800 && s < 0xdc00) {
if (filter->cache & 0xfff0000) {
/* We were waiting for the 2nd part of a surrogate pair */
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
}
s = (((s & 0x3ff) << 16) + 0x400000);
filter->cache = s;
} else if (s >= 0xdc00 && s < 0xe000) {
s &= 0x3ff;
s |= (filter->cache & 0xfff0000) >> 6;
filter->cache = 0;
if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
CK((*filter->output_function)(s, filter->data));
} else { /* illegal character */
s &= MBFL_WCSGROUP_MASK;
s |= MBFL_WCSGROUP_THROUGH;
CK((*filter->output_function)(s, filter->data));
if (filter->cache & 0xfff0000) {
s &= 0x3ff;
s |= (filter->cache & 0xfff0000) >> 6;
filter->cache = 0;
if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
CK((*filter->output_function)(s, filter->data));
} else { /* illegal character */
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
}
} else {
CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
filter->cache = 0;
}
} else {
if (filter->cache & 0xfff0000) {
/* We were waiting for the 2nd part of a surrogate pair */
(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
}
filter->cache = 0;
CK((*filter->output_function)(s, filter->data));
@ -261,7 +281,11 @@ static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter)
if (filter->cache) {
/* Either we were expecting the 2nd half of a surrogate pair which
* never came, or else the last Base64 data was not padded with zeroes */
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
if (filter->cache & 0xfff0000) {
(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
} else {
(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
}
}
if (filter->flush_function) {

View File

@ -96,7 +96,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
int s, c1;
retry:
switch (filter->status & 0xff) {
switch (filter->status) {
case 0x00:
if (c < 0x80) {
CK((*filter->output_function)(c, filter->data));
@ -116,15 +116,31 @@ retry:
case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
case 0x32: /* 4byte code 4th char: 0x80-0xbf */
filter->status = 0;
if (c >= 0x80 && c <= 0xbf) {
s = (filter->cache<<6) | (c & 0x3f);
filter->cache = 0;
filter->status = filter->cache = 0;
CK((*filter->output_function)(s, filter->data));
} else {
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
int status = filter->status;
filter->status = 0;
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
if (status == 0x10) {
CK(mbfl_filt_put_invalid_char(0xC0 | filter->cache, filter));
} else if (status == 0x21) {
CK(mbfl_filt_put_invalid_char(0xE080 | ((filter->cache & ~0x3F) << 2) | (filter->cache & 0x3F), filter));
} else {
CK(mbfl_filt_put_invalid_char(0xF08080 | ((filter->cache & ~0xFFF) << 4) | ((filter->cache & 0xFC0) << 2) | (filter->cache & 0x3F), filter));
}
goto retry;
} else {
if (status == 0x10) {
CK(mbfl_filt_put_invalid_char(0xC000 | (filter->cache << 8) | c, filter));
} else if (status == 0x21) {
CK(mbfl_filt_put_invalid_char(0xE08000 | ((filter->cache & ~0x3F) << 10) | ((filter->cache & 0x3F) << 8) | c, filter));
} else {
CK(mbfl_filt_put_invalid_char(0x808000 | ((filter->cache & 0xFC0) << 10) | ((filter->cache & 0x3F) << 8) | c, filter));
}
}
}
break;
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
@ -138,9 +154,13 @@ retry:
filter->cache = s;
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
CK(mbfl_filt_put_invalid_char(0xE0 | filter->cache, filter));
goto retry;
} else {
CK(mbfl_filt_put_invalid_char(0xE000 | (filter->cache << 8) | c, filter));
filter->status = 0;
}
}
break;
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@ -154,9 +174,13 @@ retry:
filter->cache = s;
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
CK(mbfl_filt_put_invalid_char(0xF0 | filter->cache, filter));
goto retry;
} else {
CK(mbfl_filt_put_invalid_char(0xF000 | (filter->cache << 8) | c, filter));
filter->status = 0;
}
}
break;
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
@ -164,9 +188,13 @@ retry:
filter->cache = (filter->cache<<6) | (c & 0x3f);
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
CK(mbfl_filt_put_invalid_char(0xF080 | ((filter->cache & ~0x3F) << 2) | (filter->cache & 0x3F), filter));
goto retry;
} else {
CK(mbfl_filt_put_invalid_char(0xF000 | (filter->cache << 8) | c, filter));
filter->status = 0;
}
}
break;
default:
@ -184,7 +212,19 @@ int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
filter->status = filter->cache = 0;
if (status) {
CK(mbfl_filt_put_invalid_char(cache, filter));
if (status == 0x10) {
CK(mbfl_filt_put_invalid_char(0xC0 | cache, filter));
} else if (status == 0x20) {
CK(mbfl_filt_put_invalid_char(0xE0 | cache, filter));
} else if (status == 0x21) {
CK(mbfl_filt_put_invalid_char(0xE080 | ((cache & ~0x3F) << 2) | (cache & 0x3F), filter));
} else if (status == 0x30) {
CK(mbfl_filt_put_invalid_char(0xF0 | cache, filter));
} else if (status == 0x31) {
CK(mbfl_filt_put_invalid_char(0xF080 | ((cache & ~0x3F) << 2) | (cache & 0x3F), filter));
} else if (status == 0x32) {
CK(mbfl_filt_put_invalid_char(0xF08080 | ((cache & ~0xFFF) << 4) | ((cache & 0xFC0) << 2) | (cache & 0x3F), filter));
}
}
if (filter->flush_function) {

View File

@ -27,7 +27,15 @@ echo "Tested ARMSCII-8 -> UTF-16BE\n";
findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'ARMSCII-8', '%');
echo "Tested UTF-16BE -> ARMSCII-8\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xA1", "BAD+A1", "ARMSCII-8", "UTF-8");
convertInvalidString("\xFF", "BAD+FF", "ARMSCII-8", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested ARMSCII-8 -> UTF-16BE
Tested UTF-16BE -> ARMSCII-8
Done!

View File

@ -33,7 +33,17 @@ testAllValidChars($fromUnicode, 'UTF-16BE', 'BIG5', false);
findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'BIG5', '%');
echo "Tested UTF-16BE -> BIG5\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "BIG5", "UTF-8");
convertInvalidString("\xB0\x9F", "BAD+B09F", "BIG5", "UTF-8");
convertInvalidString("\xA3\xED", "?+A3ED", "BIG5", "UTF-8");
convertInvalidString("\x76\x54", "U+7654", "UTF-16BE", "BIG5");
echo "Done!\n";
?>
--EXPECT--
Tested BIG5 -> UTF-16BE
Tested UTF-16BE -> BIG5
Done!

View File

@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
<?php
include('encoding_tests.inc');
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1251.txt', 'CP1251');
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x98", "BAD+98", "CP1251", "UTF-8");
convertInvalidString("\x12\x34", "U+1234", "UTF-16BE", "CP1251");
echo "Done!\n";
?>
--EXPECT--
Tested CP1251 -> UTF-16BE
Tested UTF-16BE -> CP1251
Done!

View File

@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
<?php
include('encoding_tests.inc');
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1252.txt', 'CP1252');
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81", "BAD+81", "CP1252", "UTF-8");
convertInvalidString("\x9D", "BAD+9D", "CP1252", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested CP1252 -> UTF-16BE
Tested UTF-16BE -> CP1252
Done!

View File

@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
<?php
include('encoding_tests.inc');
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1254.txt', 'CP1254');
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81", "BAD+81", "CP1254", "UTF-8");
convertInvalidString("\x9E", "BAD+9E", "CP1254", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested CP1254 -> UTF-16BE
Tested UTF-16BE -> CP1254
Done!

View File

@ -292,6 +292,16 @@ testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50222');
echo "Invalid Unicode is flagged when converting to CP5022x\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "CP50220", "UTF-8");
convertInvalidString("\x80", "BAD+80", "CP50221", "UTF-8");
convertInvalidString("\x80", "BAD+80", "CP50222", "UTF-8");
convertInvalidString("\x1B\$B1", "BAD+31", "CP50220", "UTF-8");
convertInvalidString("\x1B\$B1", "BAD+31", "CP50221", "UTF-8");
convertInvalidString("\x1B\$B1", "BAD+31", "CP50222", "UTF-8");
echo "Long error markers OK\n";
?>
--EXPECT--
ASCII support OK
@ -299,3 +309,4 @@ JIS X 0201 support OK
CP932 support OK
Folding of fullwidth katakana for CP50220 OK
Invalid Unicode is flagged when converting to CP5022x
Long error markers OK

View File

@ -108,8 +108,16 @@ echo "CP51932 verification and conversion works on all invalid characters\n";
findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "CP51932", "UTF-8");
convertInvalidString("\xFE\xFF", "BAD+FEFF", "CP51932", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
CP51932 verification and conversion works on all valid characters
CP51932 verification and conversion works on all invalid characters
Unicode -> CP51932 conversion works on all invalid codepoints
Done!

View File

@ -105,8 +105,18 @@ echo "CP932 verification and conversion works on all invalid characters\n";
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
echo "Unicode -> CP932 conversion works on all invalid codepoints\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "CP932", "UTF-8");
convertInvalidString("\xEA", "BAD+EA", "CP932", "UTF-8");
convertInvalidString("\x81\x20", "BAD+8120", "CP932", "UTF-8");
convertInvalidString("\xEA\xA9", "W932+742B", "CP932", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
CP932 verification and conversion works on all valid characters
CP932 verification and conversion works on all invalid characters
Unicode -> CP932 conversion works on all invalid codepoints
Done!

View File

@ -324,7 +324,15 @@ findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF),
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'CP936', '%');
echo "Tested UTF-16BE -> CP936\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81\x20", "BAD+8120", "CP936", "UTF-8");
convertInvalidString("\x81\x7F", "BAD+817F", "CP936", "UTF-8");
convertInvalidString("\xFE\xFF", "BAD+FEFF", "CP936", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested CP936 -> UTF-16BE
Tested UTF-16BE -> CP936
Done!

View File

@ -75,7 +75,15 @@ testAllValidChars($fromUnicode, 'UTF-16BE', 'CP950', false);
findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'CP950', '%');
echo "Tested UTF-16BE -> CP950\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "CP950", "UTF-8");
convertInvalidString("\x26\x09", "U+2609", "UTF-16BE", "CP950");
echo "Done!\n";
?>
--EXPECT--
Tested CP950 -> UTF-16BE
Tested UTF-16BE -> CP950
Done!

View File

@ -10,7 +10,16 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
<?php
include('encoding_tests.inc');
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/EUC-CN.txt', 'EUC-CN');
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "EUC-CN", "UTF-8");
convertInvalidString("\xA1\x50", "BAD+A150", "EUC-CN", "UTF-8");
convertInvalidString("\xF7\xFF", "BAD+F7FF", "EUC-CN", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested EUC-CN -> UTF-16BE
Tested UTF-16BE -> EUC-CN
Done!

View File

@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
<?php
include('encoding_tests.inc');
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/EUC-KR.txt', 'EUC-KR');
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "EUC-KR", "UTF-8");
convertInvalidString("\xA7\xF0", "?+A7F0", "EUC-KR", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested EUC-KR -> UTF-16BE
Tested UTF-16BE -> EUC-KR
Done!

View File

@ -25,7 +25,20 @@ findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF),
convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'EUC-TW', '%');
echo "Tested UTF-16BE -> EUC-TW\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "EUC-TW", "UTF-8");
convertInvalidString("\x8E\x20", "BAD+8E20", "EUC-TW", "UTF-8");
convertInvalidString("\x8E\xA1\x20", "BAD+8EA120", "EUC-TW", "UTF-8");
convertInvalidString("\x8E\xA1\xA1\x20", "BAD+A1A120", "EUC-TW", "UTF-8");
convertInvalidString("\x8E\xA2\xA3\x20", "BAD+A2A320", "EUC-TW", "UTF-8");
convertInvalidString("\x8F", "BAD+8F", "EUC-TW", "UTF-8");
convertInvalidString("\xA1\x50", "BAD+A150", "EUC-TW", "UTF-8");
convertInvalidString("\xFD\xCC", "?+FDCC", "EUC-TW", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested EUC-TW -> UTF-16BE
Tested UTF-16BE -> EUC-TW
Done!

View File

@ -69,9 +69,17 @@ echo "Unicode -> EUC-JP-2004 conversion works on all valid characters\n";
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'EUC-JP-2004', '%');
echo "Unicode -> EUC-JP-2004 conversion works on all invalid characters\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "EUC-JP-2004", "UTF-8");
convertInvalidString("\xFE\xFF", "BAD+FEFF", "EUC-JP-2004", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
EUC-JP-2004 verification and conversion works for all valid characters
EUC-JP-2004 verification and conversion rejects all invalid characters
Unicode -> EUC-JP-2004 conversion works on all valid characters
Unicode -> EUC-JP-2004 conversion works on all invalid characters
Done!

View File

@ -76,6 +76,13 @@ for ($cp = 0; $cp <= 0xFFFF; $cp++) {
}
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-32BE', 'EUC-JP', '%');
echo "Unicode -> EUC-JP conversion works on all invalid characters\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "BAD+80", "EUC-JP", "UTF-8");
convertInvalidString("\xFE\xFF", "BAD+FEFF", "EUC-JP", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Encoding verification and conversion work for all valid characters
@ -83,3 +90,4 @@ Encoding verification and conversion work for all invalid characters
Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly
Unicode -> EUC-JP conversion works on all valid characters
Unicode -> EUC-JP conversion works on all invalid characters
Done!

View File

@ -300,8 +300,15 @@ echo "Tested GB18030 4-byte characters <-> UTF-16BE\n";
testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030', false);
echo "Tested UTF-16BE -> GB18030 (1 and 2 byte characters)\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81\x30\x81\xFF", "BAD+3081FF", "GB18030", "UTF-8");
convertInvalidString("\xE3\x32\x9A\x36", "BAD+329A36", "GB18030", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested GB18030 (1 and 2 byte characters) -> UTF-16BE
Tested GB18030 4-byte characters <-> UTF-16BE
Tested UTF-16BE -> GB18030 (1 and 2 byte characters)
Done!

View File

@ -118,6 +118,13 @@ while (!empty($badChars)) {
echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("~A", "BAD+41", "HZ", "UTF-8");
convertInvalidString("\x80", "BAD+80", "HZ", "UTF-8");
convertInvalidString("~{\x22\x21", "?+2221", "HZ", "UTF-8");
echo "Done!\n";
?>
--EXPECT--
Tested ASCII -> HZ
@ -127,3 +134,4 @@ Tested valid ~ escapes
Tested all invalid ~ escapes
Tested HZ -> UTF-16BE (for all GB2312 characters)
Tested UTF-16BE -> HZ (for all GB2312 characters)
Done!

View File

@ -314,6 +314,12 @@ for ($i = 0; $i < 100; $i++) {
testValid($testString, $convertsTo, false);
}
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-2004", "UTF-8");
convertInvalidString("\x1B\$(X", "BAD+1B2428", "ISO-2022-JP-2004", "UTF-8"); // Invalid escape
convertInvalidString("\x1B\$B!", "BAD+21", "ISO-2022-JP-2004", "UTF-8"); // Truncated character
echo "All done!\n";
?>

View File

@ -187,6 +187,16 @@ for ($i = 0; $i <= 0xFF; $i++) {
echo "All escape sequences work as expected\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "BAD+E0", "JIS", "UTF-8");
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP", "UTF-8");
convertInvalidString("\x1B\$(X", "BAD+1B\$(X", "JIS", "UTF-8"); // Invalid escape
convertInvalidString("\x1B\$(X", "BAD+1B\$(X", "ISO-2022-JP", "UTF-8"); // Invalid escape
convertInvalidString("\x1B\$B!", "BAD+21", "JIS", "UTF-8"); // Truncated character
convertInvalidString("\x1B\$B!", "BAD+21", "ISO-2022-JP", "UTF-8"); // Truncated character
echo "Done!\n";
?>
--EXPECT--
ASCII support OK
@ -194,3 +204,4 @@ JIS X 0201 support OK
JIS X 0208 support OK
JIS X 0212 support OK
All escape sequences work as expected
Done!

View File

@ -199,8 +199,21 @@ foreach (array_keys($truncatedChars) as $truncated)
echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-KDDI", "UTF-8");
// Invalid escapes:
convertInvalidString("\x1B", "BAD+1B", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B.", "BAD+1B2E", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B\$", "BAD+1B24", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B\$.", "BAD+1B242E", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-KDDI", "UTF-8"); // 0x9F does not start any 2-byte character
echo "Done!\n";
?>
--EXPECT--
ASCII support OK
JIS X 0201 support OK
JIS X 0208 (with MS extensions) and KDDI emoji support OK
Done!

View File

@ -198,9 +198,17 @@ foreach (array_keys($truncatedChars) as $truncated)
echo "UDC support OK\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8"); // Invalid escape
convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-MS", "UTF-8"); // 0x9F does not start any 2-byte character
echo "Done!\n";
?>
--EXPECT--
ASCII support OK
JIS X 0201 support OK
JIS X 0208 (with MS extensions) support OK
UDC support OK
Done!

View File

@ -96,9 +96,18 @@ testValid("\x0E\x0E\x0F\x0E\x0Fabc", "\x00a\x00b\x00c", false);
echo "Escapes behave as expected\n";
// "Long" substitution mode for ISO-2022-KR. The first three inputs are
// successively longer prefixes of the escape sequence used in the last case;
// the last case follows the complete escape and \x0E with a bad byte pair.
mb_substitute_character("long");
$longMarkerCases = [
    // [invalid input, expected UTF-8 output]
    ["\x1B", "BAD+1B"],
    ["\x1B$", "BAD+1B24"],
    ["\x1B$)", "BAD+1B2429"],
    ["\x1B$)C\x0E\x7C\x84", "BAD+7C84"],
];
foreach ($longMarkerCases as [$badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, "ISO-2022-KR", "UTF-8");
}
echo "Done!\n";
?>
--EXPECT--
Empty string OK
ASCII support OK
KS X 1001 support OK
Escapes behave as expected
Done!

View File

@ -15,6 +15,13 @@ for ($n = 1; $n <= 16; $n++) {
continue;
testEncodingFromUTF16ConversionTable(__DIR__ . "/data/8859-$n.txt", "ISO-8859-{$n}");
}
// "Long" substitution mode: one rejected byte for each of two ISO-8859
// variants, each expected to be reported as BAD+ plus the byte in hex
mb_substitute_character("long");
$longMarkerCases = [
    // [source encoding, invalid input, expected UTF-8 output]
    ["ISO8859-7", "\xAE", "BAD+AE"],
    ["ISO8859-8", "\xFF", "BAD+FF"],
];
foreach ($longMarkerCases as [$sourceEncoding, $badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, $sourceEncoding, "UTF-8");
}
echo "Done!\n";
?>
--EXPECT--
Tested ISO-8859-1 -> UTF-16BE
@ -45,3 +52,4 @@ Tested ISO-8859-15 -> UTF-16BE
Tested UTF-16BE -> ISO-8859-15
Tested ISO-8859-16 -> UTF-16BE
Tested UTF-16BE -> ISO-8859-16
Done!

View File

@ -60,9 +60,18 @@ echo "Unicode -> SJIS-2004 conversion works on all valid characters\n";
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-2004', '%');
echo "Unicode -> SJIS-2004 conversion works on all invalid characters\n";
// "Long" substitution mode for SJIS-2004: a lone invalid byte and two
// invalid 2-byte sequences, all expected as BAD+ markers
mb_substitute_character("long");
$longMarkerCases = [
    // [invalid input, expected UTF-8 output]
    ["\x80", "BAD+80"],
    ["\x81\x20", "BAD+8120"],
    ["\xFC\xF5", "BAD+FCF5"],
];
foreach ($longMarkerCases as [$badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, "SJIS-2004", "UTF-8");
}
echo "Done!\n";
?>
--EXPECT--
SJIS-2004 verification and conversion works for all valid characters
SJIS-2004 verification and conversion rejects all invalid characters
Unicode -> SJIS-2004 conversion works on all valid characters
Unicode -> SJIS-2004 conversion works on all invalid characters
Done!

View File

@ -59,9 +59,18 @@ echo "Unicode -> SJIS conversion works on all valid characters\n";
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'Shift-JIS', '%');
echo "Unicode -> SJIS conversion works on all invalid characters\n";
// "Long" substitution mode for Shift-JIS. The first two inputs are expected
// as generic BAD+ markers; the last one is expected as an encoding-specific
// JIS+ marker instead.
mb_substitute_character("long");
$longMarkerCases = [
    // [invalid input, expected UTF-8 output]
    ["\x80", "BAD+80"],
    ["\x81\x20", "BAD+8120"],
    ["\xEA\xA9", "JIS+742B"],
];
foreach ($longMarkerCases as [$badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, "Shift-JIS", "UTF-8");
}
echo "Done!\n";
?>
--EXPECT--
SJIS verification and conversion works on all valid characters
SJIS verification and conversion works on all invalid characters
Unicode -> SJIS conversion works on all valid characters
Unicode -> SJIS conversion works on all invalid characters
Done!

View File

@ -277,6 +277,13 @@ function testSJISVariant($validChars, $nonInvertible, $encoding) {
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-32BE', $encoding, '%');
echo "Unicode -> $encoding conversion works on all invalid codepoints\n";
// "Long" substitution mode for the variant under test ($encoding). The last
// input is expected as an encoding-specific W932+ marker rather than BAD+.
mb_substitute_character("long");
$longMarkerCases = [
    // [invalid input, expected UTF-8 output]
    ["\x80", "BAD+80"],
    ["\x81\x20", "BAD+8120"],
    ["\xEA\xA9", "W932+742B"],
];
foreach ($longMarkerCases as [$badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, $encoding, "UTF-8");
}
// Put the substitute character back to '%' before returning
mb_substitute_character(0x25); // '%'
}
testSJISVariant($docomo, $nonInvertibleDocomo, 'SJIS-Mobile#DOCOMO');

View File

@ -86,9 +86,18 @@ echo "Unicode -> SJIS-mac conversion works on all valid characters\n";
findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-mac', '%');
echo "Unicode -> SJIS-mac conversion works on all invalid characters\n";
// "Long" substitution mode for SJIS-mac. The first two inputs are expected
// as generic BAD+ markers; the last one as an encoding-specific W932+ marker.
mb_substitute_character("long");
$longMarkerCases = [
    // [invalid input, expected UTF-8 output]
    ["\x81", "BAD+81"],
    ["\x81\x20", "BAD+8120"],
    ["\xED\x9F", "W932+7A21"],
];
foreach ($longMarkerCases as [$badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, "SJIS-mac", "UTF-8");
}
echo "Done!\n";
?>
--EXPECT--
MacJapanese verification and conversion works on all valid characters
MacJapanese verification and conversion rejects all invalid characters
Unicode -> SJIS-mac conversion works on all valid characters
Unicode -> SJIS-mac conversion works on all invalid characters
Done!

View File

@ -0,0 +1,21 @@
--TEST--
Test verification and conversion of UCS-2 text
--EXTENSIONS--
mbstring
--FILE--
<?php
include('encoding_tests.inc');
// "Long" substitution mode for UCS-2, in both byte orders.
mb_substitute_character("long");
$longMarkerCases = [
    // [input, expected output, source encoding, target encoding]
    // The UTF-32BE input 00 01 02 03 is expected to appear in the UCS-2 output
    // as the text "U+10203", written as 16-bit units in the target byte order
    ["\x00\x01\x02\x03", "\x00U\x00+\x001\x000\x002\x000\x003", "UTF-32BE", "UCS-2BE"],
    // A single byte of UCS-2 input is expected to be reported as a BAD+ marker
    ["\x11", "BAD+1100", "UCS-2BE", "UTF-8"],
    ["\x00\x01\x02\x03", "U\x00+\x001\x000\x002\x000\x003\x00", "UTF-32BE", "UCS-2LE"],
    ["\x11", "BAD+11", "UCS-2LE", "UTF-8"],
];
foreach ($longMarkerCases as [$badInput, $expectedOutput, $sourceEncoding, $targetEncoding]) {
    convertInvalidString($badInput, $expectedOutput, $sourceEncoding, $targetEncoding);
}
echo "Done!";
?>
--EXPECT--
Done!

View File

@ -0,0 +1,25 @@
--TEST--
Test verification and conversion of UCS-4 text
--EXTENSIONS--
mbstring
--FILE--
<?php
include('encoding_tests.inc');
// "Long" substitution mode for UCS-4. The big-endian cases come first; the
// little-endian cases that follow are byte-reversed counterparts of them.
mb_substitute_character("long");
$longMarkerCases = [
    // [source encoding, invalid input, expected UTF-8 output]
    ["UCS-4BE", "\x6F\x00\x00\x00", "U+6F000000"],
    ["UCS-4BE", "\x70\x00\x00\x00", "?+0"],
    ["UCS-4BE", "\x78\x00\x00\x01", "BAD+1"],
    ["UCS-4BE", "\x80\x01\x02\x03", "BAD+10203"],
    // Only three bytes of input
    ["UCS-4BE", "\x00\x01\x02", "BAD+10200"],
    ["UCS-4LE", "\x00\x00\x00\x6F", "U+6F000000"],
    ["UCS-4LE", "\x00\x00\x00\x70", "?+0"],
    ["UCS-4LE", "\x01\x00\x00\x78", "BAD+1"],
    // Only three bytes of input
    ["UCS-4LE", "\x02\x01\x00", "BAD+102"],
];
foreach ($longMarkerCases as [$sourceEncoding, $badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, $sourceEncoding, "UTF-8");
}
echo "Done!";
?>
--EXPECT--
Done!

View File

@ -10,6 +10,11 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
<?php
include('encoding_tests.inc');
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP949.txt', 'UHC');
// "Long" substitution mode for UHC: a lone invalid byte is expected as a
// BAD+ marker, and the 2-byte input A7 F0 as a "?+A7F0" marker
mb_substitute_character("long");
$longMarkerCases = [
    // [invalid input, expected UTF-8 output]
    ["\x80", "BAD+80"],
    ["\xA7\xF0", "?+A7F0"],
];
foreach ($longMarkerCases as [$badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, "UHC", "UTF-8");
}
?>
--EXPECT--
Tested UHC -> UTF-16BE

View File

@ -192,6 +192,14 @@ testValid("123&" . mBase64(utf16BE("")) . "-abc&" . mBase64(utf16BE("
echo "Identification and conversion of valid text is working... perfect!\n";
// "Long" substitution mode for UTF7-IMAP.
mb_substitute_character("long");
$longMarkerCases = [
    // [invalid input, expected UTF-8 output]
    ["\x10", "BAD+10"],
    ["\x80", "BAD+80"],
    // The & opens a Base-64 coded section, which is fine in itself, but the
    // section holds no data, so the 'bad character' reported is zero
    ["abc&", "abcBAD+0"],
    // The first bad byte inside a Base-64 coded section drops the decoder back
    // into the default mode, so the characters after it are taken literally
    ["&**-", "BAD+2A*-"],
];
foreach ($longMarkerCases as [$badInput, $expectedOutput]) {
    convertInvalidString($badInput, $expectedOutput, "UTF7-IMAP", "UTF-8");
}
echo "Done!\n";
?>
--EXPECT--
Identification passes on empty string... good start!
@ -204,3 +212,4 @@ Testing valid strings which use '&-' for '&'... good!
Identification fails when Base64 sections contain non-Base64 bytes... right!
Identification fails when UTF-16 text is invalid... no sweat!
Identification and conversion of valid text is working... perfect!
Done!

View File

@ -799,6 +799,44 @@ $invalid = array(
testInvalidCodepoints($invalid, 'UTF-8');
// Test "long" illegal character markers
// In this mode the expected output spells each rejected piece of input as
// BAD+ followed by its bytes in hex; valid codepoints which ASCII cannot
// represent are expected as U+XXXX instead
mb_substitute_character("long");
// Well-formed looking sequences whose value is not acceptable. The first two
// bytes are expected in one marker and every remaining byte in its own marker.
// (F4 90 80 80 would encode a value above U+10FFFF; ED A0 80 and ED BF BF
// would encode the surrogates U+D800 and U+DFFF)
convertInvalidString("\xF4\x90\x80\x80", "BAD+F490BAD+80BAD+80", "UTF-8", "ASCII");
// Here the lead byte 0xF7 is expected in a marker of its own
convertInvalidString("\xF7\x80\x80\x80", "BAD+F7BAD+80BAD+80BAD+80", "UTF-8", "ASCII");
convertInvalidString("\xED\xA0\x80", "BAD+EDA0BAD+80", "UTF-8", "ASCII");
convertInvalidString("\xED\xBF\xBF", "BAD+EDBFBAD+BF", "UTF-8", "ASCII");
// Truncated:
// (input ends in the middle of a multi-byte character; all bytes seen so far
// are expected in a single marker)
convertInvalidString("\xDF", "BAD+DF", "UTF-8", "ASCII");
convertInvalidString("\xEF", "BAD+EF", "UTF-8", "ASCII");
convertInvalidString("\xEF\xBF", "BAD+EFBF", "UTF-8", "ASCII");
convertInvalidString("\xF0", "BAD+F0", "UTF-8", "ASCII");
convertInvalidString("\xF0\xBF", "BAD+F0BF", "UTF-8", "ASCII");
convertInvalidString("\xF0\xBF\xBF", "BAD+F0BFBF", "UTF-8", "ASCII");
// Multi-byte character ends too early and goes back to ASCII:
// (the trailing 'A' in each expected string is the literal ASCII character
// following the marker, not part of the marker)
convertInvalidString("\xDFA", "BAD+DFA", "UTF-8", "ASCII");
convertInvalidString("\xEFA", "BAD+EFA", "UTF-8", "ASCII");
convertInvalidString("\xEF\xBFA", "BAD+EFBFA", "UTF-8", "ASCII");
convertInvalidString("\xF0A", "BAD+F0A", "UTF-8", "ASCII");
convertInvalidString("\xF0\xBFA", "BAD+F0BFA", "UTF-8", "ASCII");
convertInvalidString("\xF0\xBF\xBFA", "BAD+F0BFBFA", "UTF-8", "ASCII");
// Multi-byte character ends too early and goes to a byte which is not ASCII, nor could
// it possibly start a valid multi-byte character
// (the offending byte 0xC0 is expected inside the same marker as the bytes before it)
convertInvalidString("\xEF\xBF\xC0", "BAD+EFBFC0", "UTF-8", "ASCII");
// NOTE(review): the expected marker lists only the last three bytes; the
// leading F0 is absent. Presumably the marker value holds at most 24 bits -- confirm.
convertInvalidString("\xF0\xBF\xBF\xC0", "BAD+BFBFC0", "UTF-8", "ASCII");
// Multi-byte character ends too early and is followed by the valid 2-byte
// sequence DF BF (U+07FF), which ASCII cannot represent: expect a BAD+ marker
// for the truncated part, then a U+7FF marker
convertInvalidString("\xDF\xDF\xBF", "BAD+DFU+7FF", "UTF-8", "ASCII");
convertInvalidString("\xEF\xBF\xDF\xBF", "BAD+EFBFU+7FF", "UTF-8", "ASCII");
convertInvalidString("\xF0\xBF\xBF\xDF\xBF", "BAD+F0BFBFU+7FF", "UTF-8", "ASCII");
// Stray byte 0x80 with no multi-byte character in progress: at the start of
// input, after an ASCII character, and after a complete 2-byte character
convertInvalidString("\x80", "BAD+80", "UTF-8", "ASCII");
convertInvalidString(".\x80", ".BAD+80", "UTF-8", "ASCII");
convertInvalidString("\xDF\xBF\x80", "U+7FFBAD+80", "UTF-8", "ASCII");
// Overlong encodings (2-, 3- and 4-byte forms of values that fit in fewer
// bytes): the lead byte, plus the second byte for the 3- and 4-byte forms,
// is expected in the first marker and each remaining byte in its own marker
convertInvalidString("\xC1\xBF", "BAD+C1BAD+BF", "UTF-8", "ASCII");
convertInvalidString("\xE0\x9F\xBF", "BAD+E09FBAD+BF", "UTF-8", "ASCII");
convertInvalidString("\xF0\x8F\xBF\xBF", "BAD+F08FBAD+BFBAD+BF", "UTF-8", "ASCII");
// Restore the single-character substitute for the code that follows
mb_substitute_character(0x25); // '%'
echo "== UTF-16 ==\n";
testValidCodepoints("UTF-16");
@ -849,6 +887,29 @@ testInvalidCodepoints($invalid, 'UTF-16LE');
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
testInvalidString("A\x00\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
// Test "long" illegal character markers
// Each scenario is run for "UTF-16", "UTF-16BE" and "UTF-16LE"; the plain
// "UTF-16" inputs are byte-for-byte the same as the "UTF-16BE" ones, and the
// "UTF-16LE" inputs are the same code units with their bytes swapped
mb_substitute_character("long");
// A code unit in the low-surrogate range (DC00-DFFF) followed by one in the
// high-surrogate range (D800-DBFF): each unit is expected in its own marker
convertInvalidString("\xDC\x01\xD8\x02", "BAD+DC01BAD+D802", "UTF-16", "ASCII");
convertInvalidString("\xDC\x01\xD8\x02", "BAD+DC01BAD+D802", "UTF-16BE", "ASCII");
convertInvalidString("\x01\xDC\x02\xD8", "BAD+DC01BAD+D802", "UTF-16LE", "ASCII");
convertInvalidString("\xDD\x11\xD9\x13", "BAD+DD11BAD+D913", "UTF-16BE", "ASCII");
// High surrogate followed by an ordinary character: a marker for the
// surrogate, then the literal 'A'
convertInvalidString("\xD8\x01\x00A", "BAD+D801A", "UTF-16", "ASCII");
convertInvalidString("\xD8\x01\x00A", "BAD+D801A", "UTF-16BE", "ASCII");
convertInvalidString("\x01\xD8A\x00", "BAD+D801A", "UTF-16LE", "ASCII");
// High surrogate at the very end of the input
convertInvalidString("\xD8\x01", "BAD+D801", "UTF-16", "ASCII");
convertInvalidString("\xD8\x01", "BAD+D801", "UTF-16BE", "ASCII");
convertInvalidString("\x01\xD8", "BAD+D801", "UTF-16LE", "ASCII");
// Input consisting of a single byte (half a code unit)
convertInvalidString("\x00", "BAD+0", 'UTF-16', 'ASCII');
convertInvalidString("\x00", "BAD+0", 'UTF-16BE', 'ASCII');
convertInvalidString("\x00", "BAD+0", 'UTF-16LE', 'ASCII');
// A valid 'A' followed by a single leftover byte: 'A' passes through and the
// leftover byte is expected as a marker
convertInvalidString("\x00A\x01", "ABAD+1", 'UTF-16', 'ASCII');
convertInvalidString("\x00A\x01", "ABAD+1", 'UTF-16BE', 'ASCII');
convertInvalidString("A\x00\x01", "ABAD+1", 'UTF-16LE', 'ASCII');
// Restore the single-character substitute for the code that follows
mb_substitute_character(0x25); // '%'
// TODO: test handling of UTF-16 BOM
echo "== UTF-32 ==\n";
@ -905,6 +966,24 @@ testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
testInvalidString("\x00\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
testInvalidString("\x00", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
// Test "long" illegal character markers
// Each scenario is run for "UTF-32", "UTF-32BE" and "UTF-32LE"; the plain
// "UTF-32" inputs are byte-for-byte the same as the "UTF-32BE" ones
mb_substitute_character("long");
// Input of only three bytes (less than one 4-byte unit)
convertInvalidString("\x00\x01\x01", "BAD+101", "UTF-32", "ASCII");
convertInvalidString("\x00\x01\x01", "BAD+101", "UTF-32BE", "ASCII");
convertInvalidString("\x01\x01\x00", "BAD+101", "UTF-32LE", "ASCII");
// Input of a single byte
convertInvalidString("\x01", "BAD+1", "UTF-32", "ASCII");
convertInvalidString("\x01", "BAD+1", "UTF-32BE", "ASCII");
convertInvalidString("\x01", "BAD+1", "UTF-32LE", "ASCII");
// Complete 4-byte unit holding 0x110000, which is above U+10FFFF
convertInvalidString("\x00\x11\x00\x00", "BAD+110000", "UTF-32", "ASCII");
convertInvalidString("\x00\x11\x00\x00", "BAD+110000", "UTF-32BE", "ASCII");
convertInvalidString("\x00\x00\x11\x00", "BAD+110000", "UTF-32LE", "ASCII");
// Complete 4-byte unit holding 0xD800, a surrogate value
convertInvalidString("\x00\x00\xd8\x00", "BAD+D800", "UTF-32", "ASCII");
convertInvalidString("\x00\x00\xd8\x00", "BAD+D800", "UTF-32BE", "ASCII");
convertInvalidString("\x00\xd8\x00\x00", "BAD+D800", "UTF-32LE", "ASCII");
// Restore the single-character substitute for the code that follows
mb_substitute_character(0x25); // '%'
// TODO: test handling of UTF-32 BOM
echo "== UTF-7 ==\n";
@ -1012,6 +1091,28 @@ $encoded = encode("\x12\x34", 'UTF-16BE'); // 3 Base64 bytes, 2 bits of padding.
$corrupted = substr($encoded, 0, 2) . chr(ord($encoded[2]) + 1);
testInvalidString('+' . $corrupted . '-', "\x00\x00\x12\x34\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
// Test "long" illegal character markers
// Every input is '+' followed by rawEncode(...) of some bytes, with or without
// a closing '-'. NOTE(review): rawEncode is defined outside this hunk;
// presumably it Base64-encodes the given UTF-16BE bytes without adding the
// UTF-7 delimiters -- confirm. The byte pair 00 2E is expected as '.' in the
// output and is used as zero, one or two valid leading characters.
mb_substitute_character("long");
// Code unit DC01 followed by D802: each is expected in its own marker
convertInvalidString('+' . rawEncode("\xDC\x01\xD8\x02") . '-', "BAD+DC01BAD+D802", "UTF-7", "UTF-8");
convertInvalidString('+' . rawEncode("\xDC\x01\xD8\x02"), "BAD+DC01BAD+D802", "UTF-7", "UTF-8");
convertInvalidString('+' . rawEncode("\x00\x2E\xDC\x01\xD8\x02") . '-', ".BAD+DC01BAD+D802", "UTF-7", "UTF-8");
convertInvalidString('+' . rawEncode("\x00\x2E\xDC\x01\xD8\x02"), ".BAD+DC01BAD+D802", "UTF-7", "UTF-8");
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xDC\x01\xD8\x02") . '-', "..BAD+DC01BAD+D802", "UTF-7", "UTF-8");
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xDC\x01\xD8\x02"), "..BAD+DC01BAD+D802", "UTF-7", "UTF-8");
// Code unit D801 followed by an ordinary 'A': a marker for D801, then the literal 'A'
convertInvalidString('+' . rawEncode("\xD8\x01\x00A") . '-', "BAD+D801A", 'UTF-7', 'UTF-8');
convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\x00A") . '-', ".BAD+D801A", 'UTF-7', 'UTF-8');
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\x00A") . '-', "..BAD+D801A", 'UTF-7', 'UTF-8');
// Code unit D801 followed by D902: each is expected in its own marker
convertInvalidString('+' . rawEncode("\xD8\x01\xD9\x02") . '-', "BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
convertInvalidString('+' . rawEncode("\xD8\x01\xD9\x02"), "BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\xD9\x02") . '-', ".BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\xD9\x02"), ".BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\xD9\x02") . '-', "..BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\xD9\x02"), "..BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
// Encoded section holding only the single byte 01: expected as BAD+100
convertInvalidString('+' . rawEncode("\x01") . '-', "BAD+100", 'UTF-7', 'UTF-8');
echo "Done!\n";
?>