Test behavior of 'long' illegal character markers

After mb_substitute_character("long"), mbstring will respond to erroneous input by inserting 'long' error markers into the output. Depending on the situation, these error markers will either look like BAD+XXXX (for general bad input), U+XXXX (when the input is OK, but it converts to Unicode codepoints which cannot be represented in the output encoding), or an encoding-specific marker like JISX+XXXX or W932+XXXX. We have almost no tests for this feature. Add a bunch of tests to ensure that all our legacy encoding handlers work in a reasonable way when 'long' error markers are enabled.
2024-11-25 10:54:15 +08:00 · 2021-07-27 13:21:48 +02:00 · 2021-07-27 13:21:48 +02:00 · 51b9d7a5e1
commit 51b9d7a5e1
parent f6f0506c84
41 changed files with 539 additions and 67 deletions
--- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c
@@ -151,9 +151,7 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
 			filter->status = 1;
 			filter->cache = c;
 		} else {
-			w = c & MBFL_WCSGROUP_MASK;
-			w |= MBFL_WCSGROUP_THROUGH;
-			CK((*filter->output_function)(w, filter->data));
+			CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
 		}
 		break;

@@ -215,7 +213,7 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
 static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
 {
 	if (filter->status) {
-		(*filter->filter_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter);
+		(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
 	}

 	if (filter->flush_function) {
--- a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c
@@ -148,7 +148,7 @@ int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
 			filter->cache = (c1 << 8) + c - 0xA1;
 		} else {
 			filter->status = filter->cache = 0;
-			w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
+			w = 0x8E0000 | ((c1 + 0xA1) << 8) | c | MBFL_WCSGROUP_THROUGH;
 			CK((*filter->output_function)(w, filter->data));
 		}
 		break;
@@ -179,7 +179,7 @@ int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
 			CK((*filter->output_function)(w, filter->data));
 		} else {
 			filter->status = filter->cache = 0;
-			w = (c1 << 8) | c | 0x8e0000 | MBFL_WCSGROUP_THROUGH;
+			w = ((c1 + 0xA1A1) << 8) | c | MBFL_WCSGROUP_THROUGH;
 			CK((*filter->output_function)(w, filter->data));
 		}
 		break;
--- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c
@@ -174,8 +174,16 @@ int mbfl_filt_conv_2022kr_wchar(int c, mbfl_convert_filter *filter)
 static int mbfl_filt_conv_2022kr_wchar_flush(mbfl_convert_filter *filter)
 {
 	if (filter->status & 0xF) {
-		/* 2-byte character or escape sequence was truncated */
-		CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
+		if (filter->status == 2) {
+			CK((*filter->output_function)(0x1B | MBFL_WCSGROUP_THROUGH, filter->data));
+		} else if (filter->status == 3) {
+			CK((*filter->output_function)(0x1B24 | MBFL_WCSGROUP_THROUGH, filter->data));
+		} else if (filter->status == 4) {
+			CK((*filter->output_function)(0x1B2429 | MBFL_WCSGROUP_THROUGH, filter->data));
+		} else {
+			/* 2-byte character was truncated */
+			CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
+		}
 	}

 	if (filter->flush_function) {
--- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c
@@ -241,7 +241,13 @@ int mbfl_filt_conv_2022jp_mobile_wchar(int c, mbfl_convert_filter *filter)
 static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter)
 {
 	if (filter->status & 0xF) {
-		mbfl_filt_conv_illegal_output(filter->cache, filter);
+		if ((filter->status & 0xF) == 2) {
+			(*filter->output_function)(0x1B | MBFL_WCSGROUP_THROUGH, filter->data);
+		} else if ((filter->status & 0xF) == 3) {
+			(*filter->output_function)(0x1B24 | MBFL_WCSGROUP_THROUGH, filter->data);
+		} else {
+			(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
+		}
 	}

 	if (filter->flush_function) {
--- a/ext/mbstring/libmbfl/filters/mbfilter_jis.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_jis.c
@@ -155,9 +155,7 @@ retry:
 					w = 0;
 				}
 				if (w <= 0) {
-					w = (c1 << 8) | c;
-					w &= MBFL_WCSPLANE_MASK;
-					w |= MBFL_WCSPLANE_JIS0208;
+					w = (c1 << 8) | c | MBFL_WCSPLANE_JIS0208;
 				}
 			} else {
 				if (s >= 0 && s < jisx0212_ucs_table_size) {
@@ -166,16 +164,12 @@ retry:
 					w = 0;
 				}
 				if (w <= 0) {
-					w = (c1 << 8) | c;
-					w &= MBFL_WCSPLANE_MASK;
-					w |= MBFL_WCSPLANE_JIS0212;
+					w = (c1 << 8) | c | MBFL_WCSPLANE_JIS0212;
 				}
 			}
 			CK((*filter->output_function)(w, filter->data));
 		} else {
-			w = (c1 << 8) | c;
-			w &= MBFL_WCSGROUP_MASK;
-			w |= MBFL_WCSGROUP_THROUGH;
+			w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
 			CK((*filter->output_function)(w, filter->data));
 		}
 		break;
--- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c
@@ -188,14 +188,14 @@ int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
 				s1 = c1 - 0x80;
 				s2 = c - 0x80;
 			} else {
-				CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
+				CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
 				break;
 			}
 		} else if (filter->from->no_encoding == mbfl_no_encoding_sjis2004) {
 			if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
 				SJIS_DECODE(c1, c, s1, s2);
 			} else {
-				CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
+				CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
 				break;
 			}
 		} else { /* ISO-2022-JP-2004 */
@@ -203,7 +203,7 @@ int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
 				s1 = c1;
 				s2 = c;
 			} else {
-				CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
+				CK((*filter->output_function)((c1 << 8) | c | MBFL_WCSGROUP_THROUGH, filter->data));
 				break;
 			}
 		}
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
@@ -195,7 +195,7 @@ int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
 		n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
 		if (n >= 0xD800 && n <= 0xDBFF) {
 			/* Wrong; that's the first half of a surrogate pair, not the second */
-			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+			CK((*filter->output_function)(0xD800 | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
 			filter->cache = n & 0x3FF;
 			filter->status = 2;
 		} else if (n >= 0xDC00 && n <= 0xDFFF) {
@@ -203,7 +203,7 @@ int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
 			CK((*filter->output_function)(n, filter->data));
 			filter->status = 0;
 		} else {
-			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+			CK((*filter->output_function)(0xD800 | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
 			CK((*filter->output_function)(n, filter->data));
 			filter->status = 0;
 		}
@@ -269,7 +269,7 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
 	case 3:
 		n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
 		if (n >= 0xD800 && n <= 0xDBFF) {
-			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+			CK((*filter->output_function)(0xD800 | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
 			filter->cache = n & 0x3FF;
 			filter->status = 2;
 		} else if (n >= 0xDC00 && n <= 0xDFFF) {
@@ -277,7 +277,7 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
 			CK((*filter->output_function)(n, filter->data));
 			filter->status = 0;
 		} else {
-			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+			CK((*filter->output_function)(0xD800 | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
 			CK((*filter->output_function)(n, filter->data));
 			filter->status = 0;
 		}
@@ -316,7 +316,11 @@ static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)

 	if (status) {
 		/* Input string was truncated */
-		CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
+		if (status == 1) {
+			CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
+		} else if (status == 2) {
+			CK((*filter->output_function)(0xD800 | cache | MBFL_WCSGROUP_THROUGH, filter->data));
+		}
 	}

 	if (filter->flush_function) {
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c
@@ -107,7 +107,11 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
 			if (filter->cache) {
 				/* Either we were expecting the 2nd half of a surrogate pair which
 				 * never came, or else the last Base64 data was not padded with zeroes */
-				(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
+				if (filter->cache & 0xfff0000) {
+					(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
+				} else {
+					(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
+				}
 			}
 			if (c == '-') {
 				if (filter->status == 1) { /* "+-" -> "+" */
@@ -150,25 +154,32 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
 		n = (n & 0x3) << 14;
 		filter->status = 5;
 		if (s >= 0xd800 && s < 0xdc00) {
+			/* 1st part of surrogate pair */
 			if (filter->cache & 0xfff0000) {
 				/* We were waiting for the 2nd part of a surrogate pair */
-				(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
+				(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
 			}
 			s = (((s & 0x3ff) << 16) + 0x400000) | n;
 			filter->cache = s;
 		} else if (s >= 0xdc00 && s < 0xe000) {
-			s &= 0x3ff;
-			s |= (filter->cache & 0xfff0000) >> 6;
-			filter->cache = n;
-			if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
-				CK((*filter->output_function)(s, filter->data));
-			} else { /* illegal character */
+			/* 2nd part of surrogate pair */
+			if (filter->cache & 0xfff0000) {
+				s &= 0x3ff;
+				s |= (filter->cache & 0xfff0000) >> 6;
+				filter->cache = n;
+				if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
+					CK((*filter->output_function)(s, filter->data));
+				} else { /* illegal character */
+					CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
+				}
+			} else {
 				CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
+				filter->cache = n;
 			}
 		} else {
 			if (filter->cache & 0xfff0000) {
 				/* We were waiting for the 2nd part of a surrogate pair */
-				(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
+				(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
 			}
 			filter->cache = n;
 			CK((*filter->output_function)(s, filter->data));
@@ -190,23 +201,29 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
 		if (s >= 0xd800 && s < 0xdc00) {
 			if (filter->cache & 0xfff0000) {
 				/* We were waiting for the 2nd part of a surrogate pair */
-				(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
+				(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
 			}
 			s = (((s & 0x3ff) << 16) + 0x400000) | n;
 			filter->cache = s;
 		} else if (s >= 0xdc00 && s < 0xe000) {
-			s &= 0x3ff;
-			s |= (filter->cache & 0xfff0000) >> 6;
-			filter->cache = n;
-			if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
-				CK((*filter->output_function)(s, filter->data));
-			} else { /* illegal character */
+			/* 2nd part of surrogate pair */
+			if (filter->cache & 0xfff0000) {
+				s &= 0x3ff;
+				s |= (filter->cache & 0xfff0000) >> 6;
+				filter->cache = n;
+				if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
+					CK((*filter->output_function)(s, filter->data));
+				} else { /* illegal character */
+					CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
+				}
+			} else {
 				CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
+				filter->cache = n;
 			}
 		} else {
 			if (filter->cache & 0xfff0000) {
 				/* We were waiting for the 2nd part of a surrogate pair */
-				(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
+				(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
 			}
 			filter->cache = n;
 			CK((*filter->output_function)(s, filter->data));
@@ -223,25 +240,28 @@ int mbfl_filt_conv_utf7_wchar(int c, mbfl_convert_filter *filter)
 		if (s >= 0xd800 && s < 0xdc00) {
 			if (filter->cache & 0xfff0000) {
 				/* We were waiting for the 2nd part of a surrogate pair */
-				(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
+				(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
 			}
 			s = (((s & 0x3ff) << 16) + 0x400000);
 			filter->cache = s;
 		} else if (s >= 0xdc00 && s < 0xe000) {
-			s &= 0x3ff;
-			s |= (filter->cache & 0xfff0000) >> 6;
-			filter->cache = 0;
-			if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
-				CK((*filter->output_function)(s, filter->data));
-			} else {		/* illegal character */
-				s &= MBFL_WCSGROUP_MASK;
-				s |= MBFL_WCSGROUP_THROUGH;
-				CK((*filter->output_function)(s, filter->data));
+			if (filter->cache & 0xfff0000) {
+				s &= 0x3ff;
+				s |= (filter->cache & 0xfff0000) >> 6;
+				filter->cache = 0;
+				if (s >= MBFL_WCSPLANE_SUPMIN && s < MBFL_WCSPLANE_SUPMAX) {
+					CK((*filter->output_function)(s, filter->data));
+				} else { /* illegal character */
+					CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
+				}
+			} else {
+				CK((*filter->output_function)(s | MBFL_WCSGROUP_THROUGH, filter->data));
+				filter->cache = 0;
 			}
 		} else {
 			if (filter->cache & 0xfff0000) {
 				/* We were waiting for the 2nd part of a surrogate pair */
-				(*filter->output_function)(((filter->cache & 0xfff0000) >> 6) | MBFL_WCSGROUP_THROUGH, filter->data);
+				(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
 			}
 			filter->cache = 0;
 			CK((*filter->output_function)(s, filter->data));
@@ -261,7 +281,11 @@ static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter)
 	if (filter->cache) {
 		/* Either we were expecting the 2nd half of a surrogate pair which
 		 * never came, or else the last Base64 data was not padded with zeroes */
-		(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
+		if (filter->cache & 0xfff0000) {
+			(*filter->output_function)(0xD800 | (((filter->cache - 0x400000) >> 16) & 0x3FF) | MBFL_WCSGROUP_THROUGH, filter->data);
+		} else {
+			(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
+		}
 	}

 	if (filter->flush_function) {
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
@@ -96,7 +96,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 	int s, c1;

 retry:
-	switch (filter->status & 0xff) {
+	switch (filter->status) {
 	case 0x00:
 		if (c < 0x80) {
 			CK((*filter->output_function)(c, filter->data));
@@ -116,15 +116,31 @@ retry:
 	case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
 	case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
 	case 0x32: /* 4byte code 4th char: 0x80-0xbf */
-		filter->status = 0;
 		if (c >= 0x80 && c <= 0xbf) {
 			s = (filter->cache<<6) | (c & 0x3f);
-			filter->cache = 0;
+			filter->status = filter->cache = 0;
 			CK((*filter->output_function)(s, filter->data));
 		} else {
-			CK(mbfl_filt_put_invalid_char(filter->cache, filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
+			int status = filter->status;
+			filter->status = 0;
+			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
+				if (status == 0x10) {
+					CK(mbfl_filt_put_invalid_char(0xC0 | filter->cache, filter));
+				} else if (status == 0x21) {
+					CK(mbfl_filt_put_invalid_char(0xE080 | ((filter->cache & ~0x3F) << 2) | (filter->cache & 0x3F), filter));
+				} else {
+					CK(mbfl_filt_put_invalid_char(0xF08080 | ((filter->cache & ~0xFFF) << 4) | ((filter->cache & 0xFC0) << 2) | (filter->cache & 0x3F), filter));
+				}
 				goto retry;
+			} else {
+				if (status == 0x10) {
+					CK(mbfl_filt_put_invalid_char(0xC000 | (filter->cache << 8) | c, filter));
+				} else if (status == 0x21) {
+					CK(mbfl_filt_put_invalid_char(0xE08000 | ((filter->cache & ~0x3F) << 10) | ((filter->cache & 0x3F) << 8) | c, filter));
+				} else {
+				CK(mbfl_filt_put_invalid_char(0x808000 | ((filter->cache & 0xFC0) << 10) | ((filter->cache & 0x3F) << 8) | c, filter));
+				}
+			}
 		}
 		break;
 	case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
@@ -138,9 +154,13 @@ retry:
 			filter->cache = s;
 			filter->status++;
 		} else {
-			CK(mbfl_filt_put_invalid_char(filter->cache, filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
+			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
+				CK(mbfl_filt_put_invalid_char(0xE0 | filter->cache, filter));
 				goto retry;
+			} else {
+				CK(mbfl_filt_put_invalid_char(0xE000 | (filter->cache << 8) | c, filter));
+				filter->status = 0;
+			}
 		}
 		break;
 	case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@@ -154,9 +174,13 @@ retry:
 			filter->cache = s;
 			filter->status++;
 		} else {
-			CK(mbfl_filt_put_invalid_char(filter->cache, filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
+			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
+				CK(mbfl_filt_put_invalid_char(0xF0 | filter->cache, filter));
 				goto retry;
+			} else {
+				CK(mbfl_filt_put_invalid_char(0xF000 | (filter->cache << 8) | c, filter));
+				filter->status = 0;
+			}
 		}
 		break;
 	case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
@@ -164,9 +188,13 @@ retry:
 			filter->cache = (filter->cache<<6) | (c & 0x3f);
 			filter->status++;
 		} else {
-			CK(mbfl_filt_put_invalid_char(filter->cache, filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
+			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
+				CK(mbfl_filt_put_invalid_char(0xF080 | ((filter->cache & ~0x3F) << 2) | (filter->cache & 0x3F), filter));
 				goto retry;
+			} else {
+				CK(mbfl_filt_put_invalid_char(0xF000 | (filter->cache << 8) | c, filter));
+				filter->status = 0;
+			}
 		}
 		break;
 	default:
@@ -184,7 +212,19 @@ int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
 	filter->status = filter->cache = 0;

 	if (status) {
-		CK(mbfl_filt_put_invalid_char(cache, filter));
+		if (status == 0x10) {
+			CK(mbfl_filt_put_invalid_char(0xC0 | cache, filter));
+		} else if (status == 0x20) {
+			CK(mbfl_filt_put_invalid_char(0xE0 | cache, filter));
+		} else if (status == 0x21) {
+			CK(mbfl_filt_put_invalid_char(0xE080 | ((cache & ~0x3F) << 2) | (cache & 0x3F), filter));
+		} else if (status == 0x30) {
+			CK(mbfl_filt_put_invalid_char(0xF0 | cache, filter));
+		} else if (status == 0x31) {
+			CK(mbfl_filt_put_invalid_char(0xF080 | ((cache & ~0x3F) << 2) | (cache & 0x3F), filter));
+		} else if (status == 0x32) {
+			CK(mbfl_filt_put_invalid_char(0xF08080 | ((cache & ~0xFFF) << 4) | ((cache & 0xFC0) << 2) | (cache & 0x3F), filter));
+		}
 	}

 	if (filter->flush_function) {
--- a/ext/mbstring/tests/armscii8_encoding.phpt
+++ b/ext/mbstring/tests/armscii8_encoding.phpt
@@ -27,7 +27,15 @@ echo "Tested ARMSCII-8 -> UTF-16BE\n";
 findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
 convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'ARMSCII-8', '%');
 echo "Tested UTF-16BE -> ARMSCII-8\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\xA1", "BAD+A1", "ARMSCII-8", "UTF-8");
+convertInvalidString("\xFF", "BAD+FF", "ARMSCII-8", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested ARMSCII-8 -> UTF-16BE
 Tested UTF-16BE -> ARMSCII-8
+Done!
--- a/ext/mbstring/tests/big5_encoding.phpt
+++ b/ext/mbstring/tests/big5_encoding.phpt
@@ -33,7 +33,17 @@ testAllValidChars($fromUnicode, 'UTF-16BE', 'BIG5', false);
 findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
 convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'BIG5', '%');
 echo "Tested UTF-16BE -> BIG5\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "BIG5", "UTF-8");
+convertInvalidString("\xB0\x9F", "BAD+B09F", "BIG5", "UTF-8");
+convertInvalidString("\xA3\xED", "?+A3ED", "BIG5", "UTF-8");
+convertInvalidString("\x76\x54", "U+7654", "UTF-16BE", "BIG5");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested BIG5 -> UTF-16BE
 Tested UTF-16BE -> BIG5
+Done!
--- a/ext/mbstring/tests/cp1251_encoding.phpt
+++ b/ext/mbstring/tests/cp1251_encoding.phpt
@@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
 <?php
 include('encoding_tests.inc');
 testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1251.txt', 'CP1251');
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x98", "BAD+98", "CP1251", "UTF-8");
+convertInvalidString("\x12\x34", "U+1234", "UTF-16BE", "CP1251");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested CP1251 -> UTF-16BE
 Tested UTF-16BE -> CP1251
+Done!
--- a/ext/mbstring/tests/cp1252_encoding.phpt
+++ b/ext/mbstring/tests/cp1252_encoding.phpt
@@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
 <?php
 include('encoding_tests.inc');
 testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1252.txt', 'CP1252');
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x81", "BAD+81", "CP1252", "UTF-8");
+convertInvalidString("\x9D", "BAD+9D", "CP1252", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested CP1252 -> UTF-16BE
 Tested UTF-16BE -> CP1252
+Done!
--- a/ext/mbstring/tests/cp1254_encoding.phpt
+++ b/ext/mbstring/tests/cp1254_encoding.phpt
@@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
 <?php
 include('encoding_tests.inc');
 testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1254.txt', 'CP1254');
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x81", "BAD+81", "CP1254", "UTF-8");
+convertInvalidString("\x9E", "BAD+9E", "CP1254", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested CP1254 -> UTF-16BE
 Tested UTF-16BE -> CP1254
+Done!
--- a/ext/mbstring/tests/cp5022x_encoding.phpt
+++ b/ext/mbstring/tests/cp5022x_encoding.phpt
@@ -292,6 +292,16 @@ testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50222');

 echo "Invalid Unicode is flagged when converting to CP5022x\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "CP50220", "UTF-8");
+convertInvalidString("\x80", "BAD+80", "CP50221", "UTF-8");
+convertInvalidString("\x80", "BAD+80", "CP50222", "UTF-8");
+convertInvalidString("\x1B\$B1", "BAD+31", "CP50220", "UTF-8");
+convertInvalidString("\x1B\$B1", "BAD+31", "CP50221", "UTF-8");
+convertInvalidString("\x1B\$B1", "BAD+31", "CP50222", "UTF-8");
+
+echo "Long error markers OK\n";
 ?>
 --EXPECT--
 ASCII support OK
@@ -299,3 +309,4 @@ JIS X 0201 support OK
 CP932 support OK
 Folding of fullwidth katakana for CP50220 OK
 Invalid Unicode is flagged when converting to CP5022x
+Long error markers OK
--- a/ext/mbstring/tests/cp51932_encoding.phpt
+++ b/ext/mbstring/tests/cp51932_encoding.phpt
@@ -108,8 +108,16 @@ echo "CP51932 verification and conversion works on all invalid characters\n";
 findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
 convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
 echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "CP51932", "UTF-8");
+convertInvalidString("\xFE\xFF", "BAD+FEFF", "CP51932", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 CP51932 verification and conversion works on all valid characters
 CP51932 verification and conversion works on all invalid characters
 Unicode -> CP51932 conversion works on all invalid codepoints
+Done!
--- a/ext/mbstring/tests/cp932_encoding.phpt
+++ b/ext/mbstring/tests/cp932_encoding.phpt
@@ -105,8 +105,18 @@ echo "CP932 verification and conversion works on all invalid characters\n";

 convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
 echo "Unicode -> CP932 conversion works on all invalid codepoints\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "CP932", "UTF-8");
+convertInvalidString("\xEA", "BAD+EA", "CP932", "UTF-8");
+convertInvalidString("\x81\x20", "BAD+8120", "CP932", "UTF-8");
+convertInvalidString("\xEA\xA9", "W932+742B", "CP932", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 CP932 verification and conversion works on all valid characters
 CP932 verification and conversion works on all invalid characters
 Unicode -> CP932 conversion works on all invalid codepoints
+Done!
--- a/ext/mbstring/tests/cp936_encoding.phpt
+++ b/ext/mbstring/tests/cp936_encoding.phpt
@@ -324,7 +324,15 @@ findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF),
 convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'CP936', '%');
 echo "Tested UTF-16BE -> CP936\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x81\x20", "BAD+8120", "CP936", "UTF-8");
+convertInvalidString("\x81\x7F", "BAD+817F", "CP936", "UTF-8");
+convertInvalidString("\xFE\xFF", "BAD+FEFF", "CP936", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested CP936 -> UTF-16BE
 Tested UTF-16BE -> CP936
+Done!
--- a/ext/mbstring/tests/cp950_encoding.phpt
+++ b/ext/mbstring/tests/cp950_encoding.phpt
@@ -75,7 +75,15 @@ testAllValidChars($fromUnicode, 'UTF-16BE', 'CP950', false);
 findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
 convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'CP950', '%');
 echo "Tested UTF-16BE -> CP950\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "CP950", "UTF-8");
+convertInvalidString("\x26\x09", "U+2609", "UTF-16BE", "CP950");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested CP950 -> UTF-16BE
 Tested UTF-16BE -> CP950
+Done!
--- a/ext/mbstring/tests/euc_cn_encoding.phpt
+++ b/ext/mbstring/tests/euc_cn_encoding.phpt
@@ -10,7 +10,16 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
 <?php
 include('encoding_tests.inc');
 testEncodingFromUTF16ConversionTable(__DIR__ . '/data/EUC-CN.txt', 'EUC-CN');
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "EUC-CN", "UTF-8");
+convertInvalidString("\xA1\x50", "BAD+A150", "EUC-CN", "UTF-8");
+convertInvalidString("\xF7\xFF", "BAD+F7FF", "EUC-CN", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested EUC-CN -> UTF-16BE
 Tested UTF-16BE -> EUC-CN
+Done!
--- a/ext/mbstring/tests/euc_kr_encoding.phpt
+++ b/ext/mbstring/tests/euc_kr_encoding.phpt
@@ -10,7 +10,15 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
 <?php
 include('encoding_tests.inc');
 testEncodingFromUTF16ConversionTable(__DIR__ . '/data/EUC-KR.txt', 'EUC-KR');
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "EUC-KR", "UTF-8");
+convertInvalidString("\xA7\xF0", "?+A7F0", "EUC-KR", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested EUC-KR -> UTF-16BE
 Tested UTF-16BE -> EUC-KR
+Done!
--- a/ext/mbstring/tests/euc_tw_encoding.phpt
+++ b/ext/mbstring/tests/euc_tw_encoding.phpt
@@ -25,7 +25,20 @@ findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF),
 convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', 'EUC-TW', '%');
 echo "Tested UTF-16BE -> EUC-TW\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "EUC-TW", "UTF-8");
+convertInvalidString("\x8E\x20", "BAD+8E20", "EUC-TW", "UTF-8");
+convertInvalidString("\x8E\xA1\x20", "BAD+8EA120", "EUC-TW", "UTF-8");
+convertInvalidString("\x8E\xA1\xA1\x20", "BAD+A1A120", "EUC-TW", "UTF-8");
+convertInvalidString("\x8E\xA2\xA3\x20", "BAD+A2A320", "EUC-TW", "UTF-8");
+convertInvalidString("\x8F", "BAD+8F", "EUC-TW", "UTF-8");
+convertInvalidString("\xA1\x50", "BAD+A150", "EUC-TW", "UTF-8");
+convertInvalidString("\xFD\xCC", "?+FDCC", "EUC-TW", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested EUC-TW -> UTF-16BE
 Tested UTF-16BE -> EUC-TW
+Done!
--- a/ext/mbstring/tests/eucjp_2004_encoding.phpt
+++ b/ext/mbstring/tests/eucjp_2004_encoding.phpt
@@ -69,9 +69,17 @@ echo "Unicode -> EUC-JP-2004 conversion works on all valid characters\n";
 findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
 convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'EUC-JP-2004', '%');
 echo "Unicode -> EUC-JP-2004 conversion works on all invalid characters\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "EUC-JP-2004", "UTF-8");
+convertInvalidString("\xFE\xFF", "BAD+FEFF", "EUC-JP-2004", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 EUC-JP-2004 verification and conversion works for all valid characters
 EUC-JP-2004 verification and conversion rejects all invalid characters
 Unicode -> EUC-JP-2004 conversion works on all valid characters
 Unicode -> EUC-JP-2004 conversion works on all invalid characters
+Done!
--- a/ext/mbstring/tests/eucjp_encoding.phpt
+++ b/ext/mbstring/tests/eucjp_encoding.phpt
@@ -76,6 +76,13 @@ for ($cp = 0; $cp <= 0xFFFF; $cp++) {
 }
 convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-32BE', 'EUC-JP', '%');
 echo "Unicode -> EUC-JP conversion works on all invalid characters\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "EUC-JP", "UTF-8");
+convertInvalidString("\xFE\xFF", "BAD+FEFF", "EUC-JP", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Encoding verification and conversion work for all valid characters
@@ -83,3 +90,4 @@ Encoding verification and conversion work for all invalid characters
 Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly
 Unicode -> EUC-JP conversion works on all valid characters
 Unicode -> EUC-JP conversion works on all invalid characters
+Done!
--- a/ext/mbstring/tests/gb18030_encoding.phpt
+++ b/ext/mbstring/tests/gb18030_encoding.phpt
@ -300,8 +300,15 @@ echo "Tested GB18030 4-byte characters <-> UTF-16BE\n";
 testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030', false);
 echo "Tested UTF-16BE -> GB18030 (1 and 2 byte characters)\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x81\x30\x81\xFF", "BAD+3081FF", "GB18030", "UTF-8");
+convertInvalidString("\xE3\x32\x9A\x36", "BAD+329A36", "GB18030", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested GB18030 (1 and 2 byte characters) -> UTF-16BE
 Tested GB18030 4-byte characters <-> UTF-16BE
 Tested UTF-16BE -> GB18030 (1 and 2 byte characters)
+Done!
--- a/ext/mbstring/tests/hz_encoding.phpt
+++ b/ext/mbstring/tests/hz_encoding.phpt
@ -118,6 +118,13 @@ while (!empty($badChars)) {

 echo "Tested UTF-16BE -> HZ (for all GB2312 characters)\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("~A", "BAD+41", "HZ", "UTF-8");
+convertInvalidString("\x80", "BAD+80", "HZ", "UTF-8");
+convertInvalidString("~{\x22\x21", "?+2221", "HZ", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested ASCII -> HZ
@ -127,3 +134,4 @@ Tested valid ~ escapes
 Tested all invalid ~ escapes
 Tested HZ -> UTF-16BE (for all GB2312 characters)
 Tested UTF-16BE -> HZ (for all GB2312 characters)
+Done!
--- a/ext/mbstring/tests/iso2022jp_2004_encoding.phpt
+++ b/ext/mbstring/tests/iso2022jp_2004_encoding.phpt
@ -314,6 +314,12 @@ for ($i = 0; $i < 100; $i++) {
 	testValid($testString, $convertsTo, false);
 }

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-2004", "UTF-8");
+convertInvalidString("\x1B\$(X", "BAD+1B2428", "ISO-2022-JP-2004", "UTF-8"); // Invalid escape
+convertInvalidString("\x1B\$B!", "BAD+21", "ISO-2022-JP-2004", "UTF-8"); // Truncated character
+
 echo "All done!\n";

 ?>
--- a/ext/mbstring/tests/iso2022jp_encoding.phpt
+++ b/ext/mbstring/tests/iso2022jp_encoding.phpt
@ -187,6 +187,16 @@ for ($i = 0; $i <= 0xFF; $i++) {

 echo "All escape sequences work as expected\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\xE0", "BAD+E0", "JIS", "UTF-8");
+convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP", "UTF-8");
+convertInvalidString("\x1B\$(X", "BAD+1B\$(X", "JIS", "UTF-8"); // Invalid escape
+convertInvalidString("\x1B\$(X", "BAD+1B\$(X", "ISO-2022-JP", "UTF-8"); // Invalid escape
+convertInvalidString("\x1B\$B!", "BAD+21", "JIS", "UTF-8"); // Truncated character
+convertInvalidString("\x1B\$B!", "BAD+21", "ISO-2022-JP", "UTF-8"); // Truncated character
+
+echo "Done!\n";
 ?>
 --EXPECT--
 ASCII support OK
@ -194,3 +204,4 @@ JIS X 0201 support OK
 JIS X 0208 support OK
 JIS X 0212 support OK
 All escape sequences work as expected
+Done!
--- a/ext/mbstring/tests/iso2022jp_kddi_encoding.phpt
+++ b/ext/mbstring/tests/iso2022jp_kddi_encoding.phpt
@ -199,8 +199,21 @@ foreach (array_keys($truncatedChars) as $truncated)

 echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-KDDI", "UTF-8");
+// Invalid escapes:
+convertInvalidString("\x1B", "BAD+1B", "ISO-2022-JP-KDDI", "UTF-8");
+convertInvalidString("\x1B.", "BAD+1B2E", "ISO-2022-JP-KDDI", "UTF-8");
+convertInvalidString("\x1B\$", "BAD+1B24", "ISO-2022-JP-KDDI", "UTF-8");
+convertInvalidString("\x1B\$.", "BAD+1B242E", "ISO-2022-JP-KDDI", "UTF-8");
+convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-KDDI", "UTF-8");
+convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-KDDI", "UTF-8"); // 0x9F does not start any 2-byte character
+
+echo "Done!\n";
 ?>
 --EXPECT--
 ASCII support OK
 JIS X 0201 support OK
 JIS X 0208 (with MS extensions) and KDDI emoji support OK
+Done!
--- a/ext/mbstring/tests/iso2022jp_ms_encoding.phpt
+++ b/ext/mbstring/tests/iso2022jp_ms_encoding.phpt
@ -198,9 +198,17 @@ foreach (array_keys($truncatedChars) as $truncated)

 echo "UDC support OK\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-MS", "UTF-8");
+convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8"); // Invalid escape
+convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-MS", "UTF-8"); // 0x9F does not start any 2-byte character
+
+echo "Done!\n";
 ?>
 --EXPECT--
 ASCII support OK
 JIS X 0201 support OK
 JIS X 0208 (with MS extensions) support OK
 UDC support OK
+Done!
--- a/ext/mbstring/tests/iso2022kr_encoding.phpt
+++ b/ext/mbstring/tests/iso2022kr_encoding.phpt
@ -96,9 +96,18 @@ testValid("\x0E\x0E\x0F\x0E\x0Fabc", "\x00a\x00b\x00c", false);

 echo "Escapes behave as expected\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x1B", "BAD+1B", "ISO-2022-KR", "UTF-8");
+convertInvalidString("\x1B$", "BAD+1B24", "ISO-2022-KR", "UTF-8");
+convertInvalidString("\x1B$)", "BAD+1B2429", "ISO-2022-KR", "UTF-8");
+convertInvalidString("\x1B$)C\x0E\x7C\x84", "BAD+7C84", "ISO-2022-KR", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Empty string OK
 ASCII support OK
 KS X 1001 support OK
 Escapes behave as expected
+Done!
--- a/ext/mbstring/tests/iso8859_encodings.phpt
+++ b/ext/mbstring/tests/iso8859_encodings.phpt
@ -15,6 +15,13 @@ for ($n = 1; $n <= 16; $n++) {
        continue;
    testEncodingFromUTF16ConversionTable(__DIR__ . "/data/8859-$n.txt", "ISO-8859-{$n}");
 }
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\xAE", "BAD+AE", "ISO8859-7", "UTF-8");
+convertInvalidString("\xFF", "BAD+FF", "ISO8859-8", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Tested ISO-8859-1 -> UTF-16BE
@ -45,3 +52,4 @@ Tested ISO-8859-15 -> UTF-16BE
 Tested UTF-16BE -> ISO-8859-15
 Tested ISO-8859-16 -> UTF-16BE
 Tested UTF-16BE -> ISO-8859-16
+Done!
--- a/ext/mbstring/tests/sjis2004_encoding.phpt
+++ b/ext/mbstring/tests/sjis2004_encoding.phpt
@ -60,9 +60,18 @@ echo "Unicode -> SJIS-2004 conversion works on all valid characters\n";
 findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
 convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-2004', '%');
 echo "Unicode -> SJIS-2004 conversion works on all invalid characters\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "SJIS-2004", "UTF-8");
+convertInvalidString("\x81\x20", "BAD+8120", "SJIS-2004", "UTF-8");
+convertInvalidString("\xFC\xF5", "BAD+FCF5", "SJIS-2004", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 SJIS-2004 verification and conversion works for all valid characters
 SJIS-2004 verification and conversion rejects all invalid characters
 Unicode -> SJIS-2004 conversion works on all valid characters
 Unicode -> SJIS-2004 conversion works on all invalid characters
+Done!
--- a/ext/mbstring/tests/sjis_encoding.phpt
+++ b/ext/mbstring/tests/sjis_encoding.phpt
@ -59,9 +59,18 @@ echo "Unicode -> SJIS conversion works on all valid characters\n";
 findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
 convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'Shift-JIS', '%');
 echo "Unicode -> SJIS conversion works on all invalid characters\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "Shift-JIS", "UTF-8");
+convertInvalidString("\x81\x20", "BAD+8120", "Shift-JIS", "UTF-8");
+convertInvalidString("\xEA\xA9", "JIS+742B", "Shift-JIS", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 SJIS verification and conversion works on all valid characters
 SJIS verification and conversion works on all invalid characters
 Unicode -> SJIS conversion works on all valid characters
 Unicode -> SJIS conversion works on all invalid characters
+Done!
--- a/ext/mbstring/tests/sjis_mobile_encodings.phpt
+++ b/ext/mbstring/tests/sjis_mobile_encodings.phpt
@ -277,6 +277,13 @@ function testSJISVariant($validChars, $nonInvertible, $encoding) {

  convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-32BE', $encoding, '%');
  echo "Unicode -> $encoding conversion works on all invalid codepoints\n";
+
+  // Test "long" illegal character markers
+  mb_substitute_character("long");
+  convertInvalidString("\x80", "BAD+80", $encoding, "UTF-8");
+  convertInvalidString("\x81\x20", "BAD+8120", $encoding, "UTF-8");
+  convertInvalidString("\xEA\xA9", "W932+742B", $encoding, "UTF-8");
+  mb_substitute_character(0x25); // '%'
 }

 testSJISVariant($docomo,   $nonInvertibleDocomo,   'SJIS-Mobile#DOCOMO');
--- a/ext/mbstring/tests/sjismac_encoding.phpt
+++ b/ext/mbstring/tests/sjismac_encoding.phpt
@ -86,9 +86,18 @@ echo "Unicode -> SJIS-mac conversion works on all valid characters\n";
 findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
 convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-mac', '%');
 echo "Unicode -> SJIS-mac conversion works on all invalid characters\n";
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x81", "BAD+81", "SJIS-mac", "UTF-8");
+convertInvalidString("\x81\x20", "BAD+8120", "SJIS-mac", "UTF-8");
+convertInvalidString("\xED\x9F", "W932+7A21", "SJIS-mac", "UTF-8");
+
+echo "Done!\n";
 ?>
 --EXPECT--
 MacJapanese verification and conversion works on all valid characters
 MacJapanese verification and conversion rejects all invalid characters
 Unicode -> SJIS-mac conversion works on all valid characters
 Unicode -> SJIS-mac conversion works on all invalid characters
+Done!
--- a/ext/mbstring/tests/ucs2_encoding.phpt
+++ b/ext/mbstring/tests/ucs2_encoding.phpt
@ -0,0 +1,21 @@
+--TEST--
+Test verification and conversion of UCS-2 text
+--EXTENSIONS--
+mbstring
+--FILE--
+<?php
+include('encoding_tests.inc');
+
+// Test "long" illegal character markers with UCS-2BE and UCS-2LE used both as target and as source encoding
+mb_substitute_character("long");
+
+convertInvalidString("\x00\x01\x02\x03", "\x00U\x00+\x001\x000\x002\x000\x003", "UTF-32BE", "UCS-2BE"); // Second argument spells "U+10203" as big-endian 2-byte units
+convertInvalidString("\x11", "BAD+1100", "UCS-2BE", "UTF-8"); // Input is a single byte
+
+convertInvalidString("\x00\x01\x02\x03", "U\x00+\x001\x000\x002\x000\x003\x00", "UTF-32BE", "UCS-2LE"); // Second argument spells "U+10203" as little-endian 2-byte units
+convertInvalidString("\x11", "BAD+11", "UCS-2LE", "UTF-8"); // Input is a single byte
+
+echo "Done!";
+?>
+--EXPECT--
+Done!
--- a/ext/mbstring/tests/ucs4_encoding.phpt
+++ b/ext/mbstring/tests/ucs4_encoding.phpt
@ -0,0 +1,25 @@
+--TEST--
+Test verification and conversion of UCS-4 text
+--EXTENSIONS--
+mbstring
+--FILE--
+<?php
+include('encoding_tests.inc');
+
+// Test "long" illegal character markers for UCS-4BE, then the same cases byte-swapped for UCS-4LE
+mb_substitute_character("long");
+convertInvalidString("\x6F\x00\x00\x00", "U+6F000000", "UCS-4BE", "UTF-8");
+convertInvalidString("\x70\x00\x00\x00", "?+0", "UCS-4BE", "UTF-8"); // Marker prefix here is '?' rather than 'U' or 'BAD'
+convertInvalidString("\x78\x00\x00\x01", "BAD+1", "UCS-4BE", "UTF-8");
+convertInvalidString("\x80\x01\x02\x03", "BAD+10203", "UCS-4BE", "UTF-8");
+convertInvalidString("\x00\x01\x02", "BAD+10200", "UCS-4BE", "UTF-8"); // Input is only 3 bytes long
+
+convertInvalidString("\x00\x00\x00\x6F", "U+6F000000", "UCS-4LE", "UTF-8");
+convertInvalidString("\x00\x00\x00\x70", "?+0", "UCS-4LE", "UTF-8"); // Marker prefix here is '?' rather than 'U' or 'BAD'
+convertInvalidString("\x01\x00\x00\x78", "BAD+1", "UCS-4LE", "UTF-8");
+convertInvalidString("\x02\x01\x00", "BAD+102", "UCS-4LE", "UTF-8"); // Input is only 3 bytes long
+
+echo "Done!";
+?>
+--EXPECT--
+Done!
--- a/ext/mbstring/tests/uhc_encoding.phpt
+++ b/ext/mbstring/tests/uhc_encoding.phpt
@ -10,6 +10,11 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
 <?php
 include('encoding_tests.inc');
 testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP949.txt', 'UHC');
+
+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x80", "BAD+80", "UHC", "UTF-8");
+convertInvalidString("\xA7\xF0", "?+A7F0", "UHC", "UTF-8");
 ?>
 --EXPECT--
 Tested UHC -> UTF-16BE
--- a/ext/mbstring/tests/utf7imap_encoding.phpt
+++ b/ext/mbstring/tests/utf7imap_encoding.phpt
@ -192,6 +192,14 @@ testValid("123&" . mBase64(utf16BE("１２３")) . "-abc&" . mBase64(utf16BE("

 echo "Identification and conversion of valid text is working... perfect!\n";

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\x10", "BAD+10", "UTF7-IMAP", "UTF-8");
+convertInvalidString("\x80", "BAD+80", "UTF7-IMAP", "UTF-8");
+convertInvalidString("abc&", "abcBAD+0", "UTF7-IMAP", "UTF-8"); // The & starts a Base-64 coded section, which is OK... but there's no data in it, so the 'bad character' is 'zero'
+convertInvalidString("&**-", "BAD+2A*-", "UTF7-IMAP", "UTF-8"); // When we hit the first bad byte in a Base-64 coded section, it drops us back into the default mode, so the following characters are literal
+
+echo "Done!\n";
 ?>
 --EXPECT--
 Identification passes on empty string... good start!
@ -204,3 +212,4 @@ Testing valid strings which use '&-' for '&'... good!
 Identification fails when Base64 sections contain non-Base64 bytes... right!
 Identification fails when UTF-16 text is invalid... no sweat!
 Identification and conversion of valid text is working... perfect!
+Done!
--- a/ext/mbstring/tests/utf_encodings.phpt
+++ b/ext/mbstring/tests/utf_encodings.phpt
@ -799,6 +799,44 @@ $invalid = array(

 testInvalidCodepoints($invalid, 'UTF-8');

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\xF4\x90\x80\x80", "BAD+F490BAD+80BAD+80", "UTF-8", "ASCII");
+convertInvalidString("\xF7\x80\x80\x80", "BAD+F7BAD+80BAD+80BAD+80", "UTF-8", "ASCII");
+convertInvalidString("\xED\xA0\x80", "BAD+EDA0BAD+80", "UTF-8", "ASCII");
+convertInvalidString("\xED\xBF\xBF", "BAD+EDBFBAD+BF", "UTF-8", "ASCII");
+// Truncated:
+convertInvalidString("\xDF", "BAD+DF", "UTF-8", "ASCII");
+convertInvalidString("\xEF", "BAD+EF", "UTF-8", "ASCII");
+convertInvalidString("\xEF\xBF", "BAD+EFBF", "UTF-8", "ASCII");
+convertInvalidString("\xF0", "BAD+F0", "UTF-8", "ASCII");
+convertInvalidString("\xF0\xBF", "BAD+F0BF", "UTF-8", "ASCII");
+convertInvalidString("\xF0\xBF\xBF", "BAD+F0BFBF", "UTF-8", "ASCII");
+// Multi-byte character ends too early and goes back to ASCII:
+convertInvalidString("\xDFA", "BAD+DFA", "UTF-8", "ASCII");
+convertInvalidString("\xEFA", "BAD+EFA", "UTF-8", "ASCII");
+convertInvalidString("\xEF\xBFA", "BAD+EFBFA", "UTF-8", "ASCII");
+convertInvalidString("\xF0A", "BAD+F0A", "UTF-8", "ASCII");
+convertInvalidString("\xF0\xBFA", "BAD+F0BFA", "UTF-8", "ASCII");
+convertInvalidString("\xF0\xBF\xBFA", "BAD+F0BFBFA", "UTF-8", "ASCII");
+// Multi-byte character ends too early and is followed by a byte which is neither ASCII
+// nor able to start a valid multi-byte character
+convertInvalidString("\xEF\xBF\xC0", "BAD+EFBFC0", "UTF-8", "ASCII");
+convertInvalidString("\xF0\xBF\xBF\xC0", "BAD+BFBFC0", "UTF-8", "ASCII");
+
+convertInvalidString("\xDF\xDF\xBF", "BAD+DFU+7FF", "UTF-8", "ASCII");
+convertInvalidString("\xEF\xBF\xDF\xBF", "BAD+EFBFU+7FF", "UTF-8", "ASCII");
+convertInvalidString("\xF0\xBF\xBF\xDF\xBF", "BAD+F0BFBFU+7FF", "UTF-8", "ASCII");
+
+convertInvalidString("\x80", "BAD+80", "UTF-8", "ASCII");
+convertInvalidString(".\x80", ".BAD+80", "UTF-8", "ASCII");
+convertInvalidString("\xDF\xBF\x80", "U+7FFBAD+80", "UTF-8", "ASCII");
+
+convertInvalidString("\xC1\xBF", "BAD+C1BAD+BF", "UTF-8", "ASCII");
+convertInvalidString("\xE0\x9F\xBF", "BAD+E09FBAD+BF", "UTF-8", "ASCII");
+convertInvalidString("\xF0\x8F\xBF\xBF", "BAD+F08FBAD+BFBAD+BF", "UTF-8", "ASCII");
+mb_substitute_character(0x25); // '%'
+
 echo "== UTF-16 ==\n";

 testValidCodepoints("UTF-16");
@ -849,6 +887,29 @@ testInvalidCodepoints($invalid, 'UTF-16LE');
 testInvalidString("\x00", "\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');
 testInvalidString("A\x00\x01", "\x00\x00\x00A\x00\x00\x00%", 'UTF-16LE', 'UTF-32BE');

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString("\xDC\x01\xD8\x02", "BAD+DC01BAD+D802", "UTF-16", "ASCII");
+convertInvalidString("\xDC\x01\xD8\x02", "BAD+DC01BAD+D802", "UTF-16BE", "ASCII");
+convertInvalidString("\x01\xDC\x02\xD8", "BAD+DC01BAD+D802", "UTF-16LE", "ASCII");
+convertInvalidString("\xDD\x11\xD9\x13", "BAD+DD11BAD+D913", "UTF-16BE", "ASCII");
+
+convertInvalidString("\xD8\x01\x00A", "BAD+D801A", "UTF-16", "ASCII");
+convertInvalidString("\xD8\x01\x00A", "BAD+D801A", "UTF-16BE", "ASCII");
+convertInvalidString("\x01\xD8A\x00", "BAD+D801A", "UTF-16LE", "ASCII");
+
+convertInvalidString("\xD8\x01", "BAD+D801", "UTF-16", "ASCII");
+convertInvalidString("\xD8\x01", "BAD+D801", "UTF-16BE", "ASCII");
+convertInvalidString("\x01\xD8", "BAD+D801", "UTF-16LE", "ASCII");
+
+convertInvalidString("\x00", "BAD+0", 'UTF-16', 'ASCII');
+convertInvalidString("\x00", "BAD+0", 'UTF-16BE', 'ASCII');
+convertInvalidString("\x00", "BAD+0", 'UTF-16LE', 'ASCII');
+convertInvalidString("\x00A\x01", "ABAD+1", 'UTF-16', 'ASCII');
+convertInvalidString("\x00A\x01", "ABAD+1", 'UTF-16BE', 'ASCII');
+convertInvalidString("A\x00\x01", "ABAD+1", 'UTF-16LE', 'ASCII');
+mb_substitute_character(0x25); // '%'
+
 // TODO: test handling of UTF-16 BOM

 echo "== UTF-32 ==\n";
@ -905,6 +966,24 @@ testInvalidString("\x00\x01\x01", "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
 testInvalidString("\x00\x01",     "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');
 testInvalidString("\x00",         "\x00\x00\x00%", 'UTF-32LE', 'UTF-32BE');

+mb_substitute_character("long");
+convertInvalidString("\x00\x01\x01", "BAD+101", "UTF-32", "ASCII");
+convertInvalidString("\x00\x01\x01", "BAD+101", "UTF-32BE", "ASCII");
+convertInvalidString("\x01\x01\x00", "BAD+101", "UTF-32LE", "ASCII");
+
+convertInvalidString("\x01", "BAD+1", "UTF-32", "ASCII");
+convertInvalidString("\x01", "BAD+1", "UTF-32BE", "ASCII");
+convertInvalidString("\x01", "BAD+1", "UTF-32LE", "ASCII");
+
+convertInvalidString("\x00\x11\x00\x00", "BAD+110000", "UTF-32", "ASCII");
+convertInvalidString("\x00\x11\x00\x00", "BAD+110000", "UTF-32BE", "ASCII");
+convertInvalidString("\x00\x00\x11\x00", "BAD+110000", "UTF-32LE", "ASCII");
+
+convertInvalidString("\x00\x00\xd8\x00", "BAD+D800", "UTF-32", "ASCII");
+convertInvalidString("\x00\x00\xd8\x00", "BAD+D800", "UTF-32BE", "ASCII");
+convertInvalidString("\x00\xd8\x00\x00", "BAD+D800", "UTF-32LE", "ASCII");
+mb_substitute_character(0x25); // '%'
+
 // TODO: test handling of UTF-32 BOM

 echo "== UTF-7 ==\n";
@ -1012,6 +1091,28 @@ $encoded = encode("\x12\x34", 'UTF-16BE'); // 3 Base64 bytes, 2 bits of padding.
 $corrupted = substr($encoded, 0, 2) . chr(ord($encoded[2]) + 1);
 testInvalidString('+' . $corrupted . '-', "\x00\x00\x12\x34\x00\x00\x00%", 'UTF-7', 'UTF-32BE');

+// Test "long" illegal character markers
+mb_substitute_character("long");
+convertInvalidString('+' . rawEncode("\xDC\x01\xD8\x02") . '-', "BAD+DC01BAD+D802", "UTF-7", "UTF-8");
+convertInvalidString('+' . rawEncode("\xDC\x01\xD8\x02"), "BAD+DC01BAD+D802", "UTF-7", "UTF-8");
+convertInvalidString('+' . rawEncode("\x00\x2E\xDC\x01\xD8\x02") . '-', ".BAD+DC01BAD+D802", "UTF-7", "UTF-8");
+convertInvalidString('+' . rawEncode("\x00\x2E\xDC\x01\xD8\x02"), ".BAD+DC01BAD+D802", "UTF-7", "UTF-8");
+convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xDC\x01\xD8\x02") . '-', "..BAD+DC01BAD+D802", "UTF-7", "UTF-8");
+convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xDC\x01\xD8\x02"), "..BAD+DC01BAD+D802", "UTF-7", "UTF-8");
+
+convertInvalidString('+' . rawEncode("\xD8\x01\x00A") . '-', "BAD+D801A", 'UTF-7', 'UTF-8');
+convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\x00A") . '-', ".BAD+D801A", 'UTF-7', 'UTF-8');
+convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\x00A") . '-', "..BAD+D801A", 'UTF-7', 'UTF-8');
+
+convertInvalidString('+' . rawEncode("\xD8\x01\xD9\x02") . '-', "BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
+convertInvalidString('+' . rawEncode("\xD8\x01\xD9\x02"), "BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
+convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\xD9\x02") . '-', ".BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
+convertInvalidString('+' . rawEncode("\x00\x2E\xD8\x01\xD9\x02"), ".BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
+convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\xD9\x02") . '-', "..BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
+convertInvalidString('+' . rawEncode("\x00\x2E\x00\x2E\xD8\x01\xD9\x02"), "..BAD+D801BAD+D902", 'UTF-7', 'UTF-8');
+
+convertInvalidString('+' . rawEncode("\x01") . '-', "BAD+100", 'UTF-7', 'UTF-8');
+
 echo "Done!\n";

 ?>