2023-11-14 03:18:19 +08:00
|
|
|
/*
|
|
|
|
+----------------------------------------------------------------------+
|
|
|
|
| Copyright (c) The PHP Group |
|
|
|
|
+----------------------------------------------------------------------+
|
|
|
|
| This source file is subject to version 3.01 of the PHP license, |
|
|
|
|
| that is bundled with this package in the file LICENSE, and is |
|
|
|
|
| available through the world-wide-web at the following url: |
|
|
|
|
| https://www.php.net/license/3_01.txt |
|
|
|
|
| If you did not receive a copy of the PHP license and are unable to |
|
|
|
|
| obtain it through the world-wide-web, please send a note to |
|
|
|
|
| license@php.net so we can mail you a copy immediately. |
|
|
|
|
+----------------------------------------------------------------------+
|
|
|
|
| Authors: Niels Dossche <nielsdos@php.net> |
|
|
|
|
+----------------------------------------------------------------------+
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* This file implements the MIME sniff algorithm from https://mimesniff.spec.whatwg.org/#parsing-a-mime-type (Date: 2023-09-27)
|
|
|
|
* It is a strict implementation of the algorithm, i.e. it does not accept malformed headers.
|
|
|
|
* In particular, it exposes php_dom_sniff_charset() to parse the charset from the Content-Type header.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
|
|
#include "config.h"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "php.h"
|
|
|
|
#ifdef HAVE_LIBXML
|
|
|
|
|
|
|
|
#include "php_libxml.h"
|
|
|
|
|
|
|
|
/* Predicate for collect_a_sequence_of_code_points(): true for every byte except '/'. */
static bool is_not_slash(char c)
{
	return !(c == '/');
}
|
|
|
|
|
|
|
|
/* Predicate for collect_a_sequence_of_code_points(): true for every byte except ';'. */
static bool is_not_semicolon(char c)
{
	return !(c == ';');
}
|
|
|
|
|
|
|
|
/* Predicate for collect_a_sequence_of_code_points(): true unless the byte is ';' or '='. */
static bool is_not_semicolon_or_equals(char c)
{
	return !(c == ';' || c == '=');
}
|
|
|
|
|
|
|
|
/* Predicate for collect_a_sequence_of_code_points(): true unless the byte is '"' or '\\'. */
static bool is_not_quote_or_backslash(char c)
{
	return !(c == '"' || c == '\\');
}
|
|
|
|
|
|
|
|
/* https://fetch.spec.whatwg.org/#http-tab-or-space
 * True for U+0009 TAB and U+0020 SPACE only. */
static bool is_http_tab_or_space(char c)
{
	switch (c) {
		case 0x09:
		case 0x20:
			return true;
		default:
			return false;
	}
}
|
|
|
|
|
|
|
|
/* https://fetch.spec.whatwg.org/#http-whitespace
 * True for U+000A LF, U+000D CR, or anything is_http_tab_or_space() accepts. */
static bool is_http_whitespace(char c)
{
	switch (c) {
		case 0x0A:
		case 0x0D:
			return true;
		default:
			return is_http_tab_or_space(c);
	}
}
|
|
|
|
|
|
|
|
/* https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point
 * Accepts TAB and every byte from 0x20 upwards except DEL (0x7F).
 * Note: the parameter must stay unsigned, otherwise bytes >= 0x80 would fail the >= 0x20 comparison! */
static bool is_http_quoted_string_token(unsigned char c)
{
	if (c == 0x09) {
		return true;
	}
	return c >= 0x20 && c != 0x7F;
}
|
|
|
|
|
|
|
|
/* https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points
|
|
|
|
* Implemented by returning the length of the sequence */
|
|
|
|
static zend_always_inline size_t collect_a_sequence_of_code_points(const char *position, const char *end, bool (*condition)(char))
|
|
|
|
{
|
|
|
|
const char *start = position;
|
|
|
|
while (position < end && condition(*position)) {
|
|
|
|
position++;
|
|
|
|
}
|
|
|
|
return position - start;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* https://fetch.spec.whatwg.org/#collect-an-http-quoted-string with extract-value always true */
|
|
|
|
static zend_string *collect_an_http_quoted_string_with_extract_value(const char *position, const char *end, const char **position_out)
|
|
|
|
{
|
|
|
|
/* 1. Saving positionStart is not necessary, as in the extract-value == true variant we don't use it */
|
|
|
|
|
|
|
|
/* 2. Let value be the empty string */
|
|
|
|
zend_string *value = zend_string_alloc(end - position /* can't be longer than this */, false);
|
|
|
|
ZSTR_LEN(value) = 0;
|
|
|
|
|
|
|
|
/* 3. Assert */
|
|
|
|
ZEND_ASSERT(*position == '"');
|
|
|
|
|
|
|
|
/* 4. Advance */
|
|
|
|
position++;
|
|
|
|
|
|
|
|
/* 5. While true */
|
|
|
|
while (true) {
|
|
|
|
/* 5.1. Append the result of collect a sequence of code points that are not '"' or '\\' */
|
|
|
|
size_t length = collect_a_sequence_of_code_points(position, end, is_not_quote_or_backslash);
|
|
|
|
memcpy(ZSTR_VAL(value) + ZSTR_LEN(value), position, length);
|
|
|
|
ZSTR_LEN(value) += length;
|
|
|
|
position += length;
|
|
|
|
|
|
|
|
/* 5.2. Past end check */
|
|
|
|
if (position >= end) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 5.3. quoteOrBackslash is the code point at position */
|
|
|
|
char quote_or_backslash = *position;
|
|
|
|
|
|
|
|
/* 5.4. Advance */
|
|
|
|
position++;
|
|
|
|
|
|
|
|
/* 5.5. quote_or_backslash is '\\', deal with escaping */
|
|
|
|
if (quote_or_backslash == '\\') {
|
|
|
|
/* 5.5.1. Past end check */
|
|
|
|
if (position >= end) {
|
|
|
|
ZSTR_VAL(value)[ZSTR_LEN(value)] = '\\';
|
|
|
|
ZSTR_LEN(value)++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 5.5.2. Append code point at position */
|
|
|
|
ZSTR_VAL(value)[ZSTR_LEN(value)] = *position;
|
|
|
|
ZSTR_LEN(value)++;
|
|
|
|
|
|
|
|
/* 5.5.3. Advance */
|
|
|
|
position++;
|
|
|
|
} else {
|
|
|
|
/* 5.6. Otherwise: assert and break */
|
|
|
|
ZEND_ASSERT(quote_or_backslash == '"');
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ZSTR_VAL(value)[ZSTR_LEN(value)] = '\0';
|
|
|
|
|
|
|
|
*position_out = position;
|
|
|
|
|
|
|
|
/* 6. extract-value is always true, return value */
|
|
|
|
/* Step 7 is not needed because we always return here already */
|
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* https://infra.spec.whatwg.org/#ascii-alphanumeric
 * True for '0'-'9', 'A'-'Z' and 'a'-'z'. */
static bool is_ascii_alpha_numeric(char c)
{
	if (c >= '0' && c <= '9') {
		return true;
	}
	if (c >= 'A' && c <= 'Z') {
		return true;
	}
	return c >= 'a' && c <= 'z';
}
|
|
|
|
|
|
|
|
/* https://mimesniff.spec.whatwg.org/#http-token-code-point
 * True for ! # $ % & ' * + - . ^ _ ` | ~ and for ASCII alphanumerics. */
static bool is_http_token(char c)
{
	switch (c) {
		case 0x21: /* ! */
		case 0x23: /* # */
		case 0x24: /* $ */
		case 0x25: /* % */
		case 0x26: /* & */
		case 0x27: /* ' */
		case 0x2A: /* * */
		case 0x2B: /* + */
		case 0x2D: /* - */
		case 0x2E: /* . */
		case 0x5E: /* ^ */
		case 0x5F: /* _ */
		case 0x60: /* ` */
		case 0x7C: /* | */
		case 0x7E: /* ~ */
			return true;
		default:
			return is_ascii_alpha_numeric(c);
	}
}
|
|
|
|
|
|
|
|
/* Failure check used for the MIME type and subtype: returns true when the len bytes at start
 * are unacceptable, i.e. when len is 0 or when any byte is rejected by is_http_token(). */
static bool is_empty_string_or_does_not_solely_contain_http_token_code_points(const char *start, size_t len)
{
	if (len == 0) {
		return true;
	}
	for (const char *stop = start + len; start != stop; start++) {
		if (!is_http_token(*start)) {
			return true;
		}
	}
	return false;
}
|
|
|
|
|
|
|
|
/* Returns true when each of the len bytes at start is accepted by is_http_quoted_string_token().
 * An empty range (len == 0) yields true. */
static bool solely_contains_http_quoted_string_tokens(const char *start, size_t len)
{
	for (const char *stop = start + len; start != stop; start++) {
		if (!is_http_quoted_string_token(*start)) {
			return false;
		}
	}
	return true;
}
|
|
|
|
|
|
|
|
/* https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
|
|
|
|
* Note: We only care about the charset detection */
|
|
|
|
PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_string(const char *start, const char *end)
|
|
|
|
{
|
|
|
|
/* 1. Remove leading & trailing HTTP whitespace */
|
|
|
|
while (start < end && is_http_whitespace(*start)) {
|
|
|
|
start++;
|
|
|
|
}
|
2023-12-13 00:44:36 +08:00
|
|
|
while (start < end && is_http_whitespace(*(end - 1))) {
|
2023-11-14 03:18:19 +08:00
|
|
|
end--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 2. Position variable: no-op because we move the start pointer instead */
|
|
|
|
|
|
|
|
/* 3. Collect sequence of code points that are not '/' (for type) */
|
|
|
|
size_t type_length = collect_a_sequence_of_code_points(start, end, is_not_slash);
|
|
|
|
|
|
|
|
/* 4. Empty string or not solely http tokens */
|
|
|
|
if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, type_length)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
start += type_length;
|
|
|
|
|
|
|
|
/* 5. Failure if past end of input (note: end is one past the last char; in practice this is only possible if no '/' was found) */
|
|
|
|
if (start >= end) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 6. Skip '/' */
|
|
|
|
start++;
|
|
|
|
|
|
|
|
/* 7. Collect sequence of code points that are not ';' (for subtype) */
|
|
|
|
size_t subtype_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
|
|
|
|
|
|
|
|
/* 8. Remove trailing HTTP whitespace from subtype, but we don't care about subtype, so no-op */
|
|
|
|
|
|
|
|
/* 9. Empty string or not solely http tokens */
|
|
|
|
if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, subtype_length)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
start += subtype_length;
|
|
|
|
|
|
|
|
/* 10. Initialise stuff, no-op as well as we don't care about anything other than charset */
|
|
|
|
|
|
|
|
/* 11. Loop with check: position not past end */
|
|
|
|
while (start < end) {
|
|
|
|
/* 11.1. Advance position */
|
|
|
|
start++;
|
|
|
|
|
|
|
|
/* 11.2. Collect sequence that *is* HTTP whitespace */
|
|
|
|
size_t whitespace_length = collect_a_sequence_of_code_points(start, end, is_http_whitespace);
|
|
|
|
start += whitespace_length;
|
|
|
|
|
|
|
|
/* 11.3. Collect a sequence of code points that are not ';' or '=' (for parameterName) */
|
|
|
|
size_t parameter_name_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon_or_equals);
|
|
|
|
const char *parameter_name = start;
|
|
|
|
start += parameter_name_length;
|
|
|
|
|
|
|
|
/* 11.4. Convert parameter_name to ASCII lowercase, no-op because we are only interested in charset which we'll match down below */
|
|
|
|
|
|
|
|
/* 11.5. Position past input check */
|
|
|
|
if (start < end) {
|
|
|
|
if (*start == ';') {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
start++;
|
|
|
|
} else {
|
|
|
|
/* 11.6. */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 11.7. Let parameterValue be null */
|
|
|
|
zend_string *parameter_value = NULL;
|
|
|
|
|
|
|
|
/* 11.8. Quoted string check */
|
|
|
|
if (*start == '"') {
|
|
|
|
/* 11.8.1. Set parameterValue to the result of collecting an HTTP quoted string */
|
|
|
|
parameter_value = collect_an_http_quoted_string_with_extract_value(start, end, &start);
|
|
|
|
|
|
|
|
/* 11.8.2. Collect a sequence of code points that are not ';' */
|
|
|
|
start += collect_a_sequence_of_code_points(start, end, is_not_semicolon);
|
|
|
|
} else {
|
|
|
|
/* 11.9. Otherwise */
|
|
|
|
/* 11.9.1. Set parameterValue to the result of collecting a sequence of code points that are not ';' */
|
|
|
|
size_t parameter_value_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
|
|
|
|
parameter_value = zend_string_init(start, parameter_value_length, false);
|
|
|
|
start += parameter_name_length;
|
|
|
|
|
|
|
|
/* 11.9.2. Remove trailing HTTP whitespace from parameterValue */
|
|
|
|
while (ZSTR_LEN(parameter_value) > 0 && is_http_whitespace(ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value) - 1])) {
|
|
|
|
ZSTR_LEN(parameter_value)--;
|
|
|
|
}
|
|
|
|
ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value)] = '\0';
|
|
|
|
|
|
|
|
/* 11.9.3. Continue if parameterValue is empty */
|
|
|
|
if (ZSTR_LEN(parameter_value) == 0) {
|
|
|
|
zend_string_release_ex(parameter_value, false);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 11.10. We diverge from the spec here: we're only interested in charset.
|
|
|
|
* Furthermore, as only the first match matters, we can stop immediately with the loop once we set the charset. */
|
|
|
|
if (parameter_name_length == strlen("charset")
|
|
|
|
&& strncasecmp(parameter_name, "charset", strlen("charset")) == 0 /* Because of lowercasing in step 11.4 */
|
|
|
|
&& solely_contains_http_quoted_string_tokens(ZSTR_VAL(parameter_value), ZSTR_LEN(parameter_value))) {
|
|
|
|
return parameter_value;
|
|
|
|
}
|
|
|
|
|
|
|
|
zend_string_release_ex(parameter_value, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 12. Return mimetype, a no-op / spec divergence */
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Sniffs the charset from the headers stored in the stream's wrapper data.
 * If wrapperdata is an array, its string entries are scanned in order and the first one that
 * begins with "Content-Type:" (ASCII case-insensitive) is handed, minus that prefix, to
 * php_libxml_sniff_charset_from_string(); its result is returned as-is.
 * Returns NULL when wrapperdata is not an array or no such entry exists. */
PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_stream(const php_stream *s)
{
	static const char content_type_prefix[] = "Content-Type:";
	const size_t prefix_length = sizeof(content_type_prefix) - 1;

	if (Z_TYPE(s->wrapperdata) != IS_ARRAY) {
		return NULL;
	}

	zval *entry;
	ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), entry) {
		if (Z_TYPE_P(entry) != IS_STRING) {
			continue;
		}

		const char *line = Z_STRVAL_P(entry);
		size_t line_length = Z_STRLEN_P(entry);

		/* Only the first Content-Type header found is considered, even if it yields no charset */
		if (zend_binary_strncasecmp(line, line_length, content_type_prefix, prefix_length, prefix_length) == 0) {
			return php_libxml_sniff_charset_from_string(line + prefix_length, line + line_length);
		}
	} ZEND_HASH_FOREACH_END();

	return NULL;
}
|
|
|
|
|
|
|
|
#endif /* HAVE_LIBXML */
|