php-src/ext/libxml/mime_sniff.c

/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | https://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Niels Dossche <nielsdos@php.net>                            |
   +----------------------------------------------------------------------+
*/

/* This file implements the MIME type parsing algorithm from https://mimesniff.spec.whatwg.org/#parsing-a-mime-type (Date: 2023-09-27)
 * It is a strict implementation of the algorithm, i.e. it does not accept malformed headers.
 * In particular, it exposes php_libxml_sniff_charset_from_string() and php_libxml_sniff_charset_from_stream()
 * to parse the charset from the Content-Type header.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "php.h"
#ifdef HAVE_LIBXML

#include "php_libxml.h"

/* Collection predicate: accepts every byte except the type/subtype separator '/'. */
static bool is_not_slash(char c)
{
	return !(c == '/');
}

/* Collection predicate: accepts every byte except the parameter separator ';'. */
static bool is_not_semicolon(char c)
{
	return !(c == ';');
}

/* Collection predicate: accepts every byte except ';' and '=' (the bytes that end a parameter name). */
static bool is_not_semicolon_or_equals(char c)
{
	return !(c == ';' || c == '=');
}

/* Collection predicate: accepts every byte except '"' and '\\' (the bytes with special meaning inside a quoted string). */
static bool is_not_quote_or_backslash(char c)
{
	switch (c) {
		case '"':
		case '\\':
			return false;
		default:
			return true;
	}
}

/* https://fetch.spec.whatwg.org/#http-tab-or-space
 * True for U+0009 TAB and U+0020 SPACE only. */
static bool is_http_tab_or_space(char c)
{
	switch (c) {
		case 0x09: /* TAB */
		case 0x20: /* SPACE */
			return true;
		default:
			return false;
	}
}

/* https://fetch.spec.whatwg.org/#http-whitespace
 * True for U+000A LF, U+000D CR, or an HTTP tab or space. */
static bool is_http_whitespace(char c)
{
	switch (c) {
		case 0x0A: /* LF */
		case 0x0D: /* CR */
			return true;
		default:
			return is_http_tab_or_space(c);
	}
}

/* https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point
 * True for TAB and for every byte from 0x20 upwards except DEL (0x7F).
 * Note: the parameter must stay unsigned so that bytes >= 0x80 pass the >= 0x20 comparison. */
static bool is_http_quoted_string_token(unsigned char c)
{
	if (c == 0x7F) {
		return false;
	}
	return c >= 0x20 || c == 0x09;
}

/* https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points
 * Counts how many consecutive bytes, starting at position and stopping before end,
 * satisfy condition. The caller advances its own pointer by the returned length. */
static zend_always_inline size_t collect_a_sequence_of_code_points(const char *position, const char *end, bool (*condition)(char))
{
	size_t length = 0;
	for (const char *cursor = position; cursor < end && condition(*cursor); cursor++) {
		length++;
	}
	return length;
}

/* https://fetch.spec.whatwg.org/#collect-an-http-quoted-string with extract-value always true
 *
 * position must point at the opening '"'; end is one past the last readable byte.
 * Returns a newly allocated string holding the unescaped contents of the quoted string and
 * stores the position right after the consumed input in *position_out. */
static zend_string *collect_an_http_quoted_string_with_extract_value(const char *position, const char *end, const char **position_out)
{
	/* Step 1 (remember positionStart) is skipped: it is only used when extract-value is false. */

	/* Step 2: value starts out empty. The input span is an upper bound for its size. */
	zend_string *value = zend_string_alloc(end - position, false);
	char *out = ZSTR_VAL(value);

	/* Step 3: we must be looking at the opening quote. */
	ZEND_ASSERT(*position == '"');

	/* Step 4: step over the opening quote. */
	position++;

	/* Step 5 */
	for (;;) {
		/* 5.1. Copy the run of bytes up to the next '"' or '\\' */
		size_t run = collect_a_sequence_of_code_points(position, end, is_not_quote_or_backslash);
		memcpy(out, position, run);
		out += run;
		position += run;

		/* 5.2. Input exhausted: the string is unterminated, keep what we have */
		if (position >= end) {
			break;
		}

		/* 5.3. + 5.4. Read the delimiter and move past it */
		char delimiter = *position++;

		/* 5.6. Anything but a backslash must be the closing quote */
		if (delimiter != '\\') {
			ZEND_ASSERT(delimiter == '"');
			break;
		}

		/* 5.5.1. A trailing backslash with nothing after it is kept literally */
		if (position >= end) {
			*out++ = '\\';
			break;
		}

		/* 5.5.2. + 5.5.3. Take the escaped byte as-is and move on */
		*out++ = *position++;
	}

	*out = '\0';
	ZSTR_LEN(value) = out - ZSTR_VAL(value);

	*position_out = position;

	/* Step 6: extract-value is always true, so value is the result. Step 7 is therefore unreachable. */
	return value;
}

/* https://infra.spec.whatwg.org/#ascii-alphanumeric
 * True for '0'-'9', 'a'-'z' and 'A'-'Z'. */
static bool is_ascii_alpha_numeric(char c)
{
	if (c >= 'a' && c <= 'z') {
		return true;
	}
	if (c >= 'A' && c <= 'Z') {
		return true;
	}
	return c >= '0' && c <= '9';
}

/* https://mimesniff.spec.whatwg.org/#http-token-code-point
 * True for the punctuation bytes listed below and for ASCII alphanumerics. */
static bool is_http_token(char c)
{
	switch (c) {
		case 0x21: /* ! */
		case 0x23: /* # */
		case 0x24: /* $ */
		case 0x25: /* % */
		case 0x26: /* & */
		case 0x27: /* ' */
		case 0x2A: /* * */
		case 0x2B: /* + */
		case 0x2D: /* - */
		case 0x2E: /* . */
		case 0x5E: /* ^ */
		case 0x5F: /* _ */
		case 0x60: /* ` */
		case 0x7C: /* | */
		case 0x7E: /* ~ */
			return true;
		default:
			return is_ascii_alpha_numeric(c);
	}
}

/* Rejection check used for both the type and the subtype: returns true when the len bytes
 * at start are empty or contain at least one byte that is not an HTTP token code point. */
static bool is_empty_string_or_does_not_solely_contain_http_token_code_points(const char *start, size_t len)
{
	if (len == 0) {
		return true;
	}
	for (size_t i = 0; i < len; i++) {
		if (!is_http_token(start[i])) {
			return true;
		}
	}
	return false;
}

/* Returns true when every one of the len bytes at start is an HTTP quoted-string token
 * code point. An empty range is accepted. */
static bool solely_contains_http_quoted_string_tokens(const char *start, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		if (!is_http_quoted_string_token(start[i])) {
			return false;
		}
	}
	return true;
}

/* https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
 * Note: We only care about the charset detection
 *
 * Parses the MIME type string in [start, end) and extracts its charset parameter.
 *
 * start: first byte of the input (e.g. the part of a Content-Type header after the colon).
 * end:   one past the last byte of the input. The input does not need to be NUL-terminated.
 *
 * Returns a newly allocated, non-persistent string holding the value of the first "charset"
 * parameter (matched case-insensitively) whose value solely consists of HTTP quoted-string
 * token code points. Ownership passes to the caller. Returns NULL when the type or subtype
 * is malformed or when no such parameter exists. */
PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_string(const char *start, const char *end)
{
	/* 1. Remove leading & trailing HTTP whitespace */
	while (start < end && is_http_whitespace(*start)) {
		start++;
	}
	while (start < end && is_http_whitespace(*(end - 1))) {
		end--;
	}

	/* 2. Position variable: no-op because we move the start pointer instead */

	/* 3. Collect sequence of code points that are not '/' (for type) */
	size_t type_length = collect_a_sequence_of_code_points(start, end, is_not_slash);

	/* 4. Empty string or not solely http tokens */
	if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, type_length)) {
		return NULL;
	}
	start += type_length;

	/* 5. Failure if past end of input (note: end is one past the last char; in practice this is only possible if no '/' was found) */
	if (start >= end) {
		return NULL;
	}

	/* 6. Skip '/' */
	start++;

	/* 7. Collect sequence of code points that are not ';' (for subtype) */
	size_t subtype_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);

	/* 8. Remove trailing HTTP whitespace from subtype, but we don't care about subtype, so no-op */

	/* 9. Empty string or not solely http tokens */
	if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, subtype_length)) {
		return NULL;
	}
	start += subtype_length;

	/* 10. Initialise stuff, no-op as well as we don't care about anything other than charset */

	/* 11. Loop with check: position not past end */
	while (start < end) {
		/* 11.1. Advance position (skips the ';' that ended the previous component) */
		start++;

		/* 11.2. Collect sequence that *is* HTTP whitespace */
		size_t whitespace_length = collect_a_sequence_of_code_points(start, end, is_http_whitespace);
		start += whitespace_length;

		/* 11.3. Collect a sequence of code points that are not ';' or '=' (for parameterName) */
		size_t parameter_name_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon_or_equals);
		const char *parameter_name = start;
		start += parameter_name_length;

		/* 11.4. Convert parameter_name to ASCII lowercase, no-op because we are only interested in charset which we'll match down below */

		/* 11.5. Position past input check */
		if (start < end) {
			/* A name without a value: move on to the next parameter */
			if (*start == ';') {
				continue;
			}
			/* Skip '=' */
			start++;
		} else {
			/* 11.6. */
			break;
		}

		/* 11.7. Let parameterValue be null */
		zend_string *parameter_value = NULL;

		/* 11.8. Quoted string check.
		 *       The bounds check matters when '=' was the last byte of the input: the byte at end must not be read. */
		if (start < end && *start == '"') {
			/* 11.8.1. Set parameterValue to the result of collecting an HTTP quoted string */
			parameter_value = collect_an_http_quoted_string_with_extract_value(start, end, &start);

			/* 11.8.2. Collect a sequence of code points that are not ';' */
			start += collect_a_sequence_of_code_points(start, end, is_not_semicolon);
		} else {
			/* 11.9. Otherwise */
			/* 11.9.1. Set parameterValue to the result of collecting a sequence of code points that are not ';'.
			 *         The position must advance by the length of the *value* so that the next iteration starts at
			 *         the ';' that follows it (or at the end of the input). */
			const char *parameter_value_start = start;
			size_t parameter_value_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
			start += parameter_value_length;

			/* 11.9.2. Remove trailing HTTP whitespace from parameterValue */
			while (parameter_value_length > 0 && is_http_whitespace(parameter_value_start[parameter_value_length - 1])) {
				parameter_value_length--;
			}

			/* 11.9.3. Continue if parameterValue is empty (checked before allocating anything) */
			if (parameter_value_length == 0) {
				continue;
			}

			parameter_value = zend_string_init(parameter_value_start, parameter_value_length, false);
		}

		/* 11.10. We diverge from the spec here: we're only interested in charset.
		 *        Furthermore, as only the first match matters, we can stop immediately with the loop once we set the charset. */
		if (parameter_name_length == strlen("charset")
			&& strncasecmp(parameter_name, "charset", strlen("charset")) == 0 /* Because of lowercasing in step 11.4 */
			&& solely_contains_http_quoted_string_tokens(ZSTR_VAL(parameter_value), ZSTR_LEN(parameter_value))) {
			return parameter_value;
		}

		zend_string_release_ex(parameter_value, false);
	}

	/* 12. Return mimetype, a no-op / spec divergence */
	return NULL;
}

/* Looks through the wrapper data of stream s for a "Content-Type:" header line (matched
 * case-insensitively) and sniffs the charset from the first one found.
 * Returns what php_libxml_sniff_charset_from_string() returns for the text after that prefix,
 * or NULL when the wrapper data is not an array or holds no such header. */
PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_stream(const php_stream *s)
{
	const char content_type_prefix[] = "Content-Type:";
	const size_t prefix_length = sizeof(content_type_prefix) - 1;
	zval *header;

	if (Z_TYPE(s->wrapperdata) != IS_ARRAY) {
		return NULL;
	}

	ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) {
		if (Z_TYPE_P(header) != IS_STRING) {
			continue;
		}
		if (zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), content_type_prefix, prefix_length, prefix_length) != 0) {
			continue;
		}

		/* Only the first Content-Type header is considered, whether or not it yields a charset. */
		const char *value_start = Z_STRVAL_P(header) + prefix_length;
		const char *value_end = Z_STRVAL_P(header) + Z_STRLEN_P(header);
		return php_libxml_sniff_charset_from_string(value_start, value_end);
	} ZEND_HASH_FOREACH_END();

	return NULL;
}

#endif  /* HAVE_LIBXML */
[RFC] DOM HTML5 parsing and serialization support (#12111) 2023-11-14 03:18:19 +08:00			`/*`
			`+----------------------------------------------------------------------+`
			`\| Copyright (c) The PHP Group \|`
			`+----------------------------------------------------------------------+`
			`\| This source file is subject to version 3.01 of the PHP license, \|`
			`\| that is bundled with this package in the file LICENSE, and is \|`
			`\| available through the world-wide-web at the following url: \|`
			`\| https://www.php.net/license/3_01.txt \|`
			`\| If you did not receive a copy of the PHP license and are unable to \|`
			`\| obtain it through the world-wide-web, please send a note to \|`
			`\| license@php.net so we can mail you a copy immediately. \|`
			`+----------------------------------------------------------------------+`
			`\| Authors: Niels Dossche <nielsdos@php.net> \|`
			`+----------------------------------------------------------------------+`
			`*/`

			`/* This file implements the MIME sniff algorithm from https://mimesniff.spec.whatwg.org/#parsing-a-mime-type (Date: 2023-09-27)`
			`* It is a strict implementation of the algorithm, i.e. it does not accept malformed headers.`
			`* In particular, it exposes php_dom_sniff_charset() to parse the charset from the Content-Type header.`
			`*/`

			`#ifdef HAVE_CONFIG_H`
			`#include "config.h"`
			`#endif`

			`#include "php.h"`
			`#ifdef HAVE_LIBXML`

			`#include "php_libxml.h"`

			`static bool is_not_slash(char c)`
			`{`
			`return c != '/';`
			`}`

			`static bool is_not_semicolon(char c)`
			`{`
			`return c != ';';`
			`}`

			`static bool is_not_semicolon_or_equals(char c)`
			`{`
			`return c != ';' && c != '=';`
			`}`

			`static bool is_not_quote_or_backslash(char c)`
			`{`
			`return c != '"' && c != '\\';`
			`}`

			`/* https://fetch.spec.whatwg.org/#http-tab-or-space */`
			`static bool is_http_tab_or_space(char c)`
			`{`
			`return c == 0x09 \|\| c == 0x20;`
			`}`

			`/* https://fetch.spec.whatwg.org/#http-whitespace */`
			`static bool is_http_whitespace(char c)`
			`{`
			`return c == 0x0A \|\| c == 0x0D \|\| is_http_tab_or_space(c);`
			`}`

			`/* https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point */`
			`static bool is_http_quoted_string_token(unsigned char c) /* Note: unsigned is important to let the >= 0x20 check work properly! */`
			`{`
			`return c == 0x09 \|\| (c >= 0x20 && c != 0x7F);`
			`}`

			`/* https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points`
			`* Implemented by returning the length of the sequence */`
			`static zend_always_inline size_t collect_a_sequence_of_code_points(const char position, const char end, bool (*condition)(char))`
			`{`
			`const char *start = position;`
			`while (position < end && condition(*position)) {`
			`position++;`
			`}`
			`return position - start;`
			`}`

			`/* https://fetch.spec.whatwg.org/#collect-an-http-quoted-string with extract-value always true */`
			`static zend_string collect_an_http_quoted_string_with_extract_value(const char position, const char end, const char *position_out)`
			`{`
			`/* 1. Saving positionStart is not necessary, as in the extract-value == true variant we don't use it */`

			`/* 2. Let value be the empty string */`
			`zend_string value = zend_string_alloc(end - position / can't be longer than this */, false);`
			`ZSTR_LEN(value) = 0;`

			`/* 3. Assert */`
			`ZEND_ASSERT(*position == '"');`

			`/* 4. Advance */`
			`position++;`

			`/* 5. While true */`
			`while (true) {`
			`/* 5.1. Append the result of collect a sequence of code points that are not '"' or '\\' */`
			`size_t length = collect_a_sequence_of_code_points(position, end, is_not_quote_or_backslash);`
			`memcpy(ZSTR_VAL(value) + ZSTR_LEN(value), position, length);`
			`ZSTR_LEN(value) += length;`
			`position += length;`

			`/* 5.2. Past end check */`
			`if (position >= end) {`
			`break;`
			`}`

			`/* 5.3. quoteOrBackslash is the code point at position */`
			`char quote_or_backslash = *position;`

			`/* 5.4. Advance */`
			`position++;`

			`/* 5.5. quote_or_backslash is '\\', deal with escaping */`
			`if (quote_or_backslash == '\\') {`
			`/* 5.5.1. Past end check */`
			`if (position >= end) {`
			`ZSTR_VAL(value)[ZSTR_LEN(value)] = '\\';`
			`ZSTR_LEN(value)++;`
			`break;`
			`}`

			`/* 5.5.2. Append code point at position */`
			`ZSTR_VAL(value)[ZSTR_LEN(value)] = *position;`
			`ZSTR_LEN(value)++;`

			`/* 5.5.3. Advance */`
			`position++;`
			`} else {`
			`/* 5.6. Otherwise: assert and break */`
			`ZEND_ASSERT(quote_or_backslash == '"');`
			`break;`
			`}`
			`}`

			`ZSTR_VAL(value)[ZSTR_LEN(value)] = '\0';`

			`*position_out = position;`

			`/* 6. extract-value is always true, return value */`
			`/* Step 7 is not needed because we always return here already */`
			`return value;`
			`}`

			`/* https://infra.spec.whatwg.org/#ascii-alphanumeric */`
			`static bool is_ascii_alpha_numeric(char c)`
			`{`
			`return (c >= '0' && c <= '9') \|\| (c >= 'a' && c <= 'z') \|\| (c >= 'A' && c <= 'Z');`
			`}`

			`/* https://mimesniff.spec.whatwg.org/#http-token-code-point */`
			`static bool is_http_token(char c)`
			`{`
			`return c == 0x21`
			`\|\| (c >= 0x23 && c <= 0x27)`
			`\|\| c == 0x2A \|\| c == 0x2B \|\| c == 0x2D \|\| c == 0x2E`
			`\|\| c == 0x5E \|\| c == 0x5F`
			`\|\| c == 0x60`
			`\|\| c == 0x7C \|\| c == 0x7E`
			`\|\| is_ascii_alpha_numeric(c);`
			`}`

			`static bool is_empty_string_or_does_not_solely_contain_http_token_code_points(const char *start, size_t len)`
			`{`
			`if (len == 0) {`
			`return true;`
			`}`
			`while (len > 0) {`
			`if (!is_http_token(*start)) {`
			`return true;`
			`}`
			`len--;`
			`start++;`
			`}`
			`return false;`
			`}`

			`static bool solely_contains_http_quoted_string_tokens(const char *start, size_t len)`
			`{`
			`while (len > 0) {`
			`if (!is_http_quoted_string_token(*start)) {`
			`return false;`
			`}`
			`len--;`
			`start++;`
			`}`
			`return true;`
			`}`

			`/* https://mimesniff.spec.whatwg.org/#parsing-a-mime-type`
			`* Note: We only care about the charset detection */`
			`PHP_LIBXML_API zend_string php_libxml_sniff_charset_from_string(const char start, const char *end)`
			`{`
			`/* 1. Remove leading & trailing HTTP whitespace */`
			`while (start < end && is_http_whitespace(*start)) {`
			`start++;`
			`}`
More testing of mime_sniff and fix off-by-one causing trailing whitespace to not be always stripped (#12935) 2023-12-13 00:44:36 +08:00			`while (start < end && is_http_whitespace(*(end - 1))) {`
[RFC] DOM HTML5 parsing and serialization support (#12111) 2023-11-14 03:18:19 +08:00			`end--;`
			`}`

			`/* 2. Position variable: no-op because we move the start pointer instead */`

			`/* 3. Collect sequence of code points that are not '/' (for type) */`
			`size_t type_length = collect_a_sequence_of_code_points(start, end, is_not_slash);`

			`/* 4. Empty string or not solely http tokens */`
			`if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, type_length)) {`
			`return NULL;`
			`}`
			`start += type_length;`

			`/* 5. Failure if past end of input (note: end is one past the last char; in practice this is only possible if no '/' was found) */`
			`if (start >= end) {`
			`return NULL;`
			`}`

			`/* 6. Skip '/' */`
			`start++;`

			`/* 7. Collect sequence of code points that are not ';' (for subtype) */`
			`size_t subtype_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);`

			`/* 8. Remove trailing HTTP whitespace from subtype, but we don't care about subtype, so no-op */`

			`/* 9. Empty string or not solely http tokens */`
			`if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, subtype_length)) {`
			`return NULL;`
			`}`
			`start += subtype_length;`

			`/* 10. Initialise stuff, no-op as well as we don't care about anything other than charset */`

			`/* 11. Loop with check: position not past end */`
			`while (start < end) {`
			`/* 11.1. Advance position */`
			`start++;`

			`/* 11.2. Collect sequence that is HTTP whitespace */`
			`size_t whitespace_length = collect_a_sequence_of_code_points(start, end, is_http_whitespace);`
			`start += whitespace_length;`

			`/* 11.3. Collect a sequence of code points that are not ';' or '=' (for parameterName) */`
			`size_t parameter_name_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon_or_equals);`
			`const char *parameter_name = start;`
			`start += parameter_name_length;`

			`/* 11.4. Convert parameter_name to ASCII lowercase, no-op because we are only interested in charset which we'll match down below */`

			`/* 11.5. Position past input check */`
			`if (start < end) {`
			`if (*start == ';') {`
			`continue;`
			`}`
			`start++;`
			`} else {`
			`/* 11.6. */`
			`break;`
			`}`

			`/* 11.7. Let parameterValue be null */`
			`zend_string *parameter_value = NULL;`

			`/* 11.8. Quoted string check */`
			`if (*start == '"') {`
			`/* 11.8.1. Set parameterValue to the result of collecting an HTTP quoted string */`
			`parameter_value = collect_an_http_quoted_string_with_extract_value(start, end, &start);`

			`/* 11.8.2. Collect a sequence of code points that are not ';' */`
			`start += collect_a_sequence_of_code_points(start, end, is_not_semicolon);`
			`} else {`
			`/* 11.9. Otherwise */`
			`/* 11.9.1. Set parameterValue to the result of collecting a sequence of code points that are not ';' */`
			`size_t parameter_value_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);`
			`parameter_value = zend_string_init(start, parameter_value_length, false);`
			`start += parameter_name_length;`

			`/* 11.9.2. Remove trailing HTTP whitespace from parameterValue */`
			`while (ZSTR_LEN(parameter_value) > 0 && is_http_whitespace(ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value) - 1])) {`
			`ZSTR_LEN(parameter_value)--;`
			`}`
			`ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value)] = '\0';`

			`/* 11.9.3. Continue if parameterValue is empty */`
			`if (ZSTR_LEN(parameter_value) == 0) {`
			`zend_string_release_ex(parameter_value, false);`
			`continue;`
			`}`
			`}`

			`/* 11.10. We diverge from the spec here: we're only interested in charset.`
			`* Furthermore, as only the first match matters, we can stop immediately with the loop once we set the charset. */`
			`if (parameter_name_length == strlen("charset")`
			`&& strncasecmp(parameter_name, "charset", strlen("charset")) == 0 /* Because of lowercasing in step 11.4 */`
			`&& solely_contains_http_quoted_string_tokens(ZSTR_VAL(parameter_value), ZSTR_LEN(parameter_value))) {`
			`return parameter_value;`
			`}`

			`zend_string_release_ex(parameter_value, false);`
			`}`

			`/* 12. Return mimetype, a no-op / spec divergence */`
			`return NULL;`
			`}`

			`PHP_LIBXML_API zend_string php_libxml_sniff_charset_from_stream(const php_stream s)`
			`{`
			`if (Z_TYPE(s->wrapperdata) == IS_ARRAY) {`
			`zval *header;`

			`ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) {`
			`const char buf[] = "Content-Type:";`
			`if (Z_TYPE_P(header) == IS_STRING &&`
			`!zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) {`
			`return php_libxml_sniff_charset_from_string(Z_STRVAL_P(header) + sizeof(buf) - 1, Z_STRVAL_P(header) + Z_STRLEN_P(header));`
			`}`
			`} ZEND_HASH_FOREACH_END();`
			`}`

			`return NULL;`
			`}`

			`#endif /* HAVE_LIBXML */`