Mark UTF-8 strings emitted by mbstring functions as valid UTF-8

We now have a couple of mbstring functions which have fast paths for
strings marked as 'valid UTF-8'. Later, we will likely have more. So
that these fast paths can be used more frequently, mark UTF-8 strings
emitted by mbstring as 'valid UTF-8'. This is always a correct thing
to do, because mbstring never returns invalid UTF-8 as the result of
a conversion (or similar) operation.

Internally, we do have a conversion mode which deliberately emits
invalid UTF-8 in some cases. (This is done to prevent unwanted matches
when we are converting strings to UTF-8 before performing matching
operations on them.) For such strings, don't set the 'valid UTF-8' flag.
It probably wouldn't hurt anything to set it, because strings generated
using that special conversion mode should *never* be returned to
userland, and I don't think we do anything with them which cares about
the IS_STR_VALID_UTF8 flag... but still, it would likely cause
confusion for developers.
This commit is contained in:
Alex Dowad 2023-01-10 20:54:11 +02:00
parent e7c0f4e816
commit 4427b2e1ab
5 changed files with 32 additions and 12 deletions

View File

@ -47,4 +47,10 @@
#define MBFL_QPRINT_STS_MIME_HEADER 0x1000000
#define MBFL_BASE64_STS_MIME_HEADER 0x1000000
/* Values for the "illegal mode" setting of an output filter.
 * NOTE(review): presumably these select how input that cannot be converted is
 * represented in the output -- confirm against the filter implementation. */
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE 0
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR 1
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG 2
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY 3
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8 4 /* For internal use only; deliberately uses invalid UTF-8 byte sequence as error marker */
#endif /* MBFL_CONSTS_H */

View File

@ -365,7 +365,7 @@ zend_string* mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encodi
}
*num_errors = buf.errors;
return mb_convert_buf_result(&buf);
return mb_convert_buf_result(&buf, to);
}
static uint32_t* convert_cp_to_hex(uint32_t cp, uint32_t *out)

View File

@ -32,6 +32,7 @@
#define MBFL_ENCODING_H
#include "mbfl_defs.h"
#include "mbfl_consts.h"
#include "zend.h"
enum mbfl_no_encoding {
@ -208,7 +209,7 @@ static inline unsigned char* mb_convert_buf_add4(unsigned char *out, char c1, ch
return out;
}
static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf)
static inline zend_string* mb_convert_buf_result_raw(mb_convert_buf *buf)
{
ZEND_ASSERT(buf->out <= buf->limit);
zend_string *ret = buf->str;
@ -234,6 +235,17 @@ typedef struct {
mb_from_wchar_fn from_wchar;
} mbfl_encoding;
extern const mbfl_encoding mbfl_encoding_utf8;
/* Obtain the output string of `buf` via mb_convert_buf_result_raw() and, when it
 * is appropriate, tag it with IS_STR_VALID_UTF8.
 *
 * The flag is added only if `enc` is the UTF-8 encoding AND the buffer's error
 * mode is not MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8; that mode deliberately
 * uses an invalid UTF-8 byte sequence as its error marker, so such output must
 * not be labelled as valid.
 *
 * buf: conversion buffer holding the finished output
 * enc: encoding which the contents of `buf` were converted to
 * Returns the string produced by mb_convert_buf_result_raw(). */
static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf, const mbfl_encoding *enc)
{
	zend_string *result = mb_convert_buf_result_raw(buf);

	if (enc != &mbfl_encoding_utf8) {
		/* Output is not UTF-8 at all; nothing to mark */
		return result;
	}
	if (buf->error_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
		/* Output may contain the intentionally invalid error marker */
		return result;
	}

	GC_ADD_FLAGS(result, IS_STR_VALID_UTF8);
	return result;
}
MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name);
MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding);
MBFLAPI extern enum mbfl_no_encoding mbfl_name2no_encoding(const char *name);

View File

@ -1591,7 +1591,7 @@ PHP_FUNCTION(mb_output_handler)
}
MBSTRG(illegalchars) += buf.errors;
RETVAL_STR(mb_convert_buf_result(&buf));
RETVAL_STR(mb_convert_buf_result_raw(&buf));
if (last_feed) {
MBSTRG(outconv_enabled) = false;
@ -1679,7 +1679,7 @@ PHP_FUNCTION(mb_str_split)
enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
i += split_len - char_count;
char_count = 0;
add_next_index_str(return_value, mb_convert_buf_result(&buf));
add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
} else {
/* Output from this iteration is not enough to finish the next chunk;
* output what we can, and leave 'buf' to be used again on next iteration */
@ -1696,7 +1696,7 @@ PHP_FUNCTION(mb_str_split)
if (out_len - i >= split_len) {
enc->from_wchar(wchar_buf + i, split_len, &buf, true);
i += split_len;
add_next_index_str(return_value, mb_convert_buf_result(&buf));
add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
} else {
/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
* leave them for the next iteration */
@ -1710,7 +1710,7 @@ PHP_FUNCTION(mb_str_split)
if (char_count) {
/* The main loop above has finished processing the input string, but
* has left a partial chunk in 'buf' */
add_next_index_str(return_value, mb_convert_buf_result(&buf));
add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
}
}
}
@ -2076,7 +2076,7 @@ static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t
}
}
return mb_convert_buf_result(&buf);
return mb_convert_buf_result(&buf, enc);
}
static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
@ -2590,7 +2590,9 @@ append_trim_marker:
buf.out += ZSTR_LEN(marker);
}
return mb_convert_buf_result(&buf);
/* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
* we have no guarantee that the trim marker string is valid UTF-8 */
return mb_convert_buf_result_raw(&buf);
}
/* Trim the string to terminal width; optional, add a 'trim marker' if it was truncated */
@ -3298,7 +3300,7 @@ emit_converted_kana:
encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
}
return mb_convert_buf_result(&buf);
return mb_convert_buf_result(&buf, encoding);
}
char mb_convert_kana_flags[17] = {
@ -3697,7 +3699,7 @@ static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_en
encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
}
return mb_convert_buf_result(&buf);
return mb_convert_buf_result(&buf, encoding);
}
/* {{{ Converts specified characters to HTML numeric entities */
@ -3929,7 +3931,7 @@ process_converted_wchars:
encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
}
return mb_convert_buf_result(&buf);
return mb_convert_buf_result(&buf, encoding);
}
/* {{{ Converts HTML numeric entities to character code */

View File

@ -366,5 +366,5 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
}
return mb_convert_buf_result(&buf);
return mb_convert_buf_result(&buf, dst_encoding);
}