Mark UTF-8 strings emitted by mbstring functions as valid UTF-8

We now have a couple of mbstring functions which have fast paths for strings marked as 'valid UTF-8'. Later, we will likely have more. So that these fast paths can be used more frequently, mark UTF-8 strings emitted by mbstring as 'valid UTF-8'. This is always a correct thing to do, because mbstring never returns invalid UTF-8 as the result of a conversion (or similar) operation. Internally, we do have a conversion mode which deliberately emits invalid UTF-8 in some cases. (This is done to prevent unwanted matches when we are converting strings to UTF-8 before performing matching operations on them.) For such strings, don't set the 'valid UTF-8' flag. It probably wouldn't hurt anything to set it, because strings generated using that special conversion mode should *never* be returned to userland, and I don't think we do anything with them that cares about the IS_STR_VALID_UTF8 flag... but still, it would likely cause confusion for developers.
2024-11-27 11:53:33 +08:00 · 2023-01-10 20:54:11 +02:00 · 2023-01-10 20:54:11 +02:00 · 4427b2e1ab
commit 4427b2e1ab
parent e7c0f4e816
5 changed files with 32 additions and 12 deletions
--- a/ext/mbstring/libmbfl/mbfl/mbfl_consts.h
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_consts.h
@@ -47,4 +47,10 @@
 #define MBFL_QPRINT_STS_MIME_HEADER 0x1000000
 #define MBFL_BASE64_STS_MIME_HEADER 0x1000000

+#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE 0
+#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR 1
+#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG 2
+#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY 3
+#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8 4 /* For internal use only; deliberately uses invalid UTF-8 byte sequence as error marker */
+
 #endif /* MBFL_CONSTS_H */
--- a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c
@@ -365,7 +365,7 @@ zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encodi
 	}

 	*num_errors = buf.errors;
-	return mb_convert_buf_result(&buf);
+	return mb_convert_buf_result(&buf, to);
 }

 static uint32_t* convert_cp_to_hex(uint32_t cp, uint32_t *out)
--- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h
@@ -32,6 +32,7 @@
 #define MBFL_ENCODING_H

 #include "mbfl_defs.h"
+#include "mbfl_consts.h"
 #include "zend.h"

 enum mbfl_no_encoding {
@@ -208,7 +209,7 @@ static inline unsigned char* mb_convert_buf_add4(unsigned char *out, char c1, ch
 	return out;
 }

-static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf)
+static inline zend_string* mb_convert_buf_result_raw(mb_convert_buf *buf)
 {
 	ZEND_ASSERT(buf->out <= buf->limit);
 	zend_string *ret = buf->str;
@@ -234,6 +235,17 @@ typedef struct {
 	mb_from_wchar_fn from_wchar;
 } mbfl_encoding;

+extern const mbfl_encoding mbfl_encoding_utf8;
+
+static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf, const mbfl_encoding *enc)
+{
+	zend_string *ret = mb_convert_buf_result_raw(buf);
+	if (enc == &mbfl_encoding_utf8 && buf->error_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
+		GC_ADD_FLAGS(ret, IS_STR_VALID_UTF8);
+	}
+	return ret;
+}
+
 MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name);
 MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding);
 MBFLAPI extern enum mbfl_no_encoding mbfl_name2no_encoding(const char *name);
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -1591,7 +1591,7 @@ PHP_FUNCTION(mb_output_handler)
 	}

 	MBSTRG(illegalchars) += buf.errors;
-	RETVAL_STR(mb_convert_buf_result(&buf));
+	RETVAL_STR(mb_convert_buf_result_raw(&buf));

 	if (last_feed) {
 		MBSTRG(outconv_enabled) = false;
@@ -1679,7 +1679,7 @@ PHP_FUNCTION(mb_str_split)
 					enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
 					i += split_len - char_count;
 					char_count = 0;
-					add_next_index_str(return_value, mb_convert_buf_result(&buf));
+					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
 				} else {
 					/* Output from this iteration is not enough to finish the next chunk;
 					 * output what we can, and leave 'buf' to be used again on next iteration */
@@ -1696,7 +1696,7 @@ PHP_FUNCTION(mb_str_split)
 				if (out_len - i >= split_len) {
 					enc->from_wchar(wchar_buf + i, split_len, &buf, true);
 					i += split_len;
-					add_next_index_str(return_value, mb_convert_buf_result(&buf));
+					add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
 				} else {
 					/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
 					 * leave them for the next iteration */
@@ -1710,7 +1710,7 @@ PHP_FUNCTION(mb_str_split)
 		if (char_count) {
 			/* The main loop above has finished processing the input string, but
 			 * has left a partial chunk in 'buf' */
-			add_next_index_str(return_value, mb_convert_buf_result(&buf));
+			add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
 		}
 	}
 }
@@ -2076,7 +2076,7 @@ static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t
 		}
 	}

-	return mb_convert_buf_result(&buf);
+	return mb_convert_buf_result(&buf, enc);
 }

 static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
@@ -2590,7 +2590,9 @@ append_trim_marker:
 		buf.out += ZSTR_LEN(marker);
 	}

-	return mb_convert_buf_result(&buf);
+	/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
+	 * we have no guarantee that the trim marker string is valid UTF-8 */
+	return mb_convert_buf_result_raw(&buf);
 }

 /* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
@@ -3298,7 +3300,7 @@ emit_converted_kana:
 		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
 	}

-	return mb_convert_buf_result(&buf);
+	return mb_convert_buf_result(&buf, encoding);
 }

 char mb_convert_kana_flags[17] = {
@@ -3697,7 +3699,7 @@ static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_en
 		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
 	}

-	return mb_convert_buf_result(&buf);
+	return mb_convert_buf_result(&buf, encoding);
 }

 /* {{{ Converts specified characters to HTML numeric entities */
@@ -3929,7 +3931,7 @@ process_converted_wchars:
 		encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
 	}

-	return mb_convert_buf_result(&buf);
+	return mb_convert_buf_result(&buf, encoding);
 }

 /* {{{ Converts HTML numeric entities to character code */
--- a/ext/mbstring/php_unicode.c
+++ b/ext/mbstring/php_unicode.c
@@ -366,5 +366,5 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
 		dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
 	}

-	return mb_convert_buf_result(&buf);
+	return mb_convert_buf_result(&buf, dst_encoding);
 }