From 1a512eed449128334edf0329b72e53c5caaaa95b Mon Sep 17 00:00:00 2001 From: Andrea Faulds Date: Thu, 13 Oct 2016 23:33:33 +0100 Subject: [PATCH] Move utf8_encode and utf8_decode to ext/standard --- ext/standard/basic_functions.c | 10 ++ ext/standard/php_string.h | 2 + ext/standard/string.c | 94 +++++++++++++++++++ .../tests/strings}/bug43957.phpt | 5 - .../tests/strings}/bug49687.phpt | 5 - .../tests/strings/utf8.phpt} | 2 - .../tests/strings}/utf8_decode_error.phpt | 8 +- .../strings}/utf8_decode_variation1.phpt | 8 +- .../tests/strings}/utf8_encode_error.phpt | 8 +- .../strings}/utf8_encode_variation1.phpt | 8 +- ext/xml/xml.c | 50 ---------- 11 files changed, 110 insertions(+), 90 deletions(-) rename ext/{xml/tests => standard/tests/strings}/bug43957.phpt (58%) rename ext/{xml/tests => standard/tests/strings}/bug49687.phpt (70%) rename ext/{xml/tests/xml006.phpt => standard/tests/strings/utf8.phpt} (85%) rename ext/{xml/tests => standard/tests/strings}/utf8_decode_error.phpt (88%) rename ext/{xml/tests => standard/tests/strings}/utf8_decode_variation1.phpt (95%) rename ext/{xml/tests => standard/tests/strings}/utf8_encode_error.phpt (88%) rename ext/{xml/tests => standard/tests/strings}/utf8_encode_variation1.phpt (95%) diff --git a/ext/standard/basic_functions.c b/ext/standard/basic_functions.c index 13e8a4e6eb8..d528e519083 100644 --- a/ext/standard/basic_functions.c +++ b/ext/standard/basic_functions.c @@ -2465,6 +2465,14 @@ ZEND_BEGIN_ARG_INFO_EX(arginfo_substr_compare, 0, 0, 3) ZEND_ARG_INFO(0, length) ZEND_ARG_INFO(0, case_sensitivity) ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_utf8_encode, 0, 0, 1) + ZEND_ARG_INFO(0, data) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_utf8_decode, 0, 0, 1) + ZEND_ARG_INFO(0, data) +ZEND_END_ARG_INFO() /* }}} */ /* {{{ syslog.c */ #ifdef HAVE_SYSLOG_H @@ -2764,6 +2772,8 @@ const zend_function_entry basic_functions[] = { /* {{{ */ PHP_FE(str_split, arginfo_str_split) PHP_FE(strpbrk, arginfo_strpbrk) PHP_FE(substr_compare, arginfo_substr_compare) + PHP_FE(utf8_encode, arginfo_utf8_encode) + PHP_FE(utf8_decode, arginfo_utf8_decode) #ifdef HAVE_STRCOLL PHP_FE(strcoll, arginfo_strcoll) diff --git a/ext/standard/php_string.h b/ext/standard/php_string.h index 14b66e7e131..6fc75871216 100644 --- a/ext/standard/php_string.h +++ b/ext/standard/php_string.h @@ -93,6 +93,8 @@ PHP_FUNCTION(str_word_count); PHP_FUNCTION(str_split); PHP_FUNCTION(strpbrk); PHP_FUNCTION(substr_compare); +PHP_FUNCTION(utf8_encode); +PHP_FUNCTION(utf8_decode); #ifdef HAVE_STRCOLL PHP_FUNCTION(strcoll); #endif diff --git a/ext/standard/string.c b/ext/standard/string.c index fa59ddd06f7..4389e107026 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -64,6 +64,8 @@ /* For str_getcsv() support */ #include "ext/standard/file.h" +/* For php_next_utf8_char() */ +#include "ext/standard/html.h" #define STR_PAD_LEFT 0 #define STR_PAD_RIGHT 1 @@ -5653,6 +5655,98 @@ PHP_FUNCTION(substr_compare) } /* }}} */ +/* {{{ */ +static zend_string *php_utf8_encode(const char *s, size_t len) +{ + size_t pos = len; + zend_string *str; + unsigned char c; + + str = zend_string_safe_alloc(len, 2, 0, 0); + ZSTR_LEN(str) = 0; + while (pos > 0) { + /* The lower 256 codepoints of Unicode are identical to Latin-1, + * so we don't need to do any mapping here. */ + c = (unsigned char)(*s); + if (c < 0x80) { + ZSTR_VAL(str)[ZSTR_LEN(str)++] = (char) c; + /* We only account for the single-byte and two-byte cases because + * we're only dealing with the first 256 Unicode codepoints. */ + } else { + ZSTR_VAL(str)[ZSTR_LEN(str)++] = (0xc0 | (c >> 6)); + ZSTR_VAL(str)[ZSTR_LEN(str)++] = (0x80 | (c & 0x3f)); + } + pos--; + s++; + } + ZSTR_VAL(str)[ZSTR_LEN(str)] = '\0'; + str = zend_string_truncate(str, ZSTR_LEN(str), 0); + return str; +} +/* }}} */ + +/* {{{ */ +static zend_string *php_utf8_decode(const char *s, size_t len) +{ + size_t pos = 0; + unsigned int c; + zend_string *str; + + str = zend_string_alloc(len, 0); + ZSTR_LEN(str) = 0; + while (pos < len) { + int status = FAILURE; + c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status); + + /* The lower 256 codepoints of Unicode are identical to Latin-1, + * so we don't need to do any mapping here beyond replacing non-Latin-1 + * characters. */ + if (status == FAILURE || c > 0xFFU) { + c = '?'; + } + + ZSTR_VAL(str)[ZSTR_LEN(str)++] = c; + } + ZSTR_VAL(str)[ZSTR_LEN(str)] = '\0'; + if (ZSTR_LEN(str) < len) { + str = zend_string_truncate(str, ZSTR_LEN(str), 0); + } + + return str; +} +/* }}} */ + + +/* {{{ proto string utf8_encode(string data) + Encodes an ISO-8859-1 string to UTF-8 */ +PHP_FUNCTION(utf8_encode) +{ + char *arg; + size_t arg_len; + + if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE) { + return; + } + + RETURN_STR(php_utf8_encode(arg, arg_len)); +} +/* }}} */ + +/* {{{ proto string utf8_decode(string data) + Converts a UTF-8 encoded string to ISO-8859-1 */ +PHP_FUNCTION(utf8_decode) +{ + char *arg; + size_t arg_len; + + if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &arg, &arg_len) == FAILURE) { + return; + } + + RETURN_STR(php_utf8_decode(arg, arg_len)); +} +/* }}} */ + /* * Local variables: * tab-width: 4 diff --git a/ext/xml/tests/bug43957.phpt b/ext/standard/tests/strings/bug43957.phpt similarity index 58% rename from ext/xml/tests/bug43957.phpt rename to ext/standard/tests/strings/bug43957.phpt index f11d15627be..0380787b73c 100644 --- a/ext/xml/tests/bug43957.phpt +++ b/ext/standard/tests/strings/bug43957.phpt @@ -1,10 +1,5 @@ --TEST-- Bug #43957 (utf8_decode() bogus conversion on multibyte indicator near end of string) ---SKIPIF-- - --FILE-- --FILE-- ISO Latin 1 encoding/decoding test ---SKIPIF-- - --FILE-- %s\n", urlencode("æ"), urlencode(utf8_encode("æ"))); diff --git a/ext/xml/tests/utf8_decode_error.phpt b/ext/standard/tests/strings/utf8_decode_error.phpt similarity index 88% rename from ext/xml/tests/utf8_decode_error.phpt rename to ext/standard/tests/strings/utf8_decode_error.phpt index 8735fd82f6c..911cc15cfcb 100644 --- a/ext/xml/tests/utf8_decode_error.phpt +++ b/ext/standard/tests/strings/utf8_decode_error.phpt @@ -1,16 +1,10 @@ --TEST-- Test utf8_decode() function : error conditions ---SKIPIF-- - --FILE-- --FILE-- --FILE-- ---SKIPIF-- - --FILE--