Unicode support for str_replace() and str_ireplace().

# This was not trivial.
This commit is contained in:
Andrei Zmievski 2006-10-05 22:40:38 +00:00
parent 32c3bf91e3
commit 0decd2d4e7
2 changed files with 374 additions and 95 deletions

View File

@ -4877,6 +4877,9 @@ PHPAPI int php_char_to_str_ex(char *str, uint len, char from, char *to, int to_l
target += to_len;
p++;
s = p;
if (replace_count) {
*replace_count += 1;
}
}
if (s < e) {
memcpy(target, s, (e - s));
@ -4912,6 +4915,53 @@ PHPAPI int php_char_to_str(char *str, uint len, char from, char *to, int to_len,
}
/* }}} */
/* {{{ php_u_char_to_str_ex
 * Replace every occurrence of the code unit `from` in str[0..len) with the
 * to_len code units starting at `to`, and store the outcome in `result`.
 *
 * When `from` does not occur at all, result is filled from str through
 * ZVAL_UNICODEL (last argument 1 -- presumably "duplicate"; confirm against
 * the macro definition) and 0 is returned. Otherwise a buffer of exactly
 * len + matches * (to_len - 1) code units plus a terminator is allocated with
 * eumalloc() and result becomes an IS_UNICODE value pointing at it.
 *
 * If replace_count is non-NULL it is increased by the number of replacements.
 * Returns 1 when at least one replacement was made, 0 otherwise.
 */
PHPAPI int php_u_char_to_str_ex(UChar *str, uint len, UChar from, UChar *to, int to_len, zval *result, int *replace_count)
{
	int char_count = 0;
	UChar *target;
	UChar *p, *e, *s;

	/* First pass: count the matches so the result can be sized exactly. */
	p = str;
	e = p + len;
	while ((p = u_memchr(p, from, (e - p)))) {
		char_count++;
		p++;
	}

	if (char_count == 0) {
		ZVAL_UNICODEL(result, str, len, 1);
		return 0;
	}

	Z_USTRLEN_P(result) = len + (char_count * (to_len - 1));
	Z_USTRVAL_P(result) = target = eumalloc(Z_USTRLEN_P(result) + 1);
	Z_TYPE_P(result) = IS_UNICODE;

	/* Second pass: copy the text between matches, then the replacement. */
	p = str;
	s = str;
	while ((p = u_memchr(p, from, (e - p)))) {
		u_memcpy(target, s, (p - s));
		target += p - s;
		u_memcpy(target, to, to_len);
		target += to_len;
		p++;
		s = p;
	}

	/* Whatever follows the last match. */
	if (s < e) {
		u_memcpy(target, s, (e - s));
		target += e - s;
	}
	*target = 0;

	if (replace_count) {
		*replace_count += char_count;
	}

	/* Reaching this point means char_count > 0, i.e. something was replaced. */
	return 1;
}
/* }}} */
/* {{{ php_str_to_str_ex
*/
PHPAPI char *php_str_to_str_ex(char *haystack, int length,
@ -5067,31 +5117,234 @@ nothing_todo:
}
/* }}} */
/* {{{ php_str_to_str
*/
/* {{{ php_str_to_str */
/* Convenience wrapper: forwards all arguments to php_str_to_str_ex(), passing 1
 * and NULL for its two trailing parameters (case sensitivity and replace
 * count, judging by the other php_str_to_str_ex() call sites in this file). */
PHPAPI char *php_str_to_str(char *haystack, int length,
char *needle, int needle_len, char *str, int str_len, int *_new_length)
{
return php_str_to_str_ex(haystack, length, needle, needle_len, str, str_len, _new_length, 1, NULL);
}
/* }}}
*/
/* }}} */
/* {{{ php_u_str_to_str_ex
 * Case-sensitive replacement of every occurrence of needle[0..needle_len) in
 * haystack[0..length) with repl[0..repl_len). The input is never modified; a
 * newly allocated, zero-terminated UChar buffer is always returned.
 *
 * _new_length, when non-NULL, receives the length of the returned string in
 * code units. replace_count, when non-NULL, is incremented once per
 * replacement performed.
 */
PHPAPI UChar *php_u_str_to_str_ex(UChar *haystack, int length,
	UChar *needle, int needle_len, UChar *repl, int repl_len, int *_new_length, int *replace_count)
{
	UChar *new_str;
	int new_len = length;

	if (needle_len > length) {
		/* The needle cannot fit: hand back a plain copy. */
		new_str = eustrndup(haystack, length);
	} else if (needle_len == length) {
		/* Whole-string comparison. This must be length-based: u_strncmp()
		   stops at the first U+0000, so strings that differ only after an
		   embedded NUL would wrongly compare equal. */
		if (memcmp(haystack, needle, UBYTES(length)) == 0) {
			new_str = eustrndup(repl, repl_len);
			new_len = repl_len;
			if (replace_count) {
				(*replace_count)++;
			}
		} else {
			new_str = eustrndup(haystack, length);
		}
	} else if (needle_len == repl_len) {
		/* Same-size replacement: overwrite the matches inside a copy. */
		UChar *end, *p, *r;

		new_str = eustrndup(haystack, length);
		end = new_str + length;
		for (p = new_str; (r = zend_u_memnstr(p, needle, needle_len, end)); p = r + needle_len) {
			u_memcpy(r, repl, repl_len);
			if (replace_count) {
				(*replace_count)++;
			}
		}
	} else {
		UChar *end = haystack + length;
		UChar *e, *p, *r;

		if (repl_len < needle_len) {
			/* The result can only shrink, so the input size is enough. */
			new_str = eumalloc(length + 1);
		} else {
			/* The result may grow: count the matches to size the buffer. */
			int count = 0;

			p = haystack;
			while ((p = zend_u_memnstr(p, needle, needle_len, end))) {
				p += needle_len;
				count++;
			}
			if (count == 0) {
				/* Needle doesn't occur, shortcircuit the actual replacement. */
				new_str = eustrndup(haystack, length);
				goto done;
			}
			new_str = safe_emalloc(count, UBYTES(repl_len - needle_len), UBYTES(length + 1));
		}

		e = new_str;
		for (p = haystack; (r = zend_u_memnstr(p, needle, needle_len, end)); p = r + needle_len) {
			/* text before the match, then the replacement */
			u_memcpy(e, p, r - p);
			e += r - p;
			u_memcpy(e, repl, repl_len);
			e += repl_len;
			if (replace_count) {
				(*replace_count)++;
			}
		}
		/* text after the last match */
		if (p < end) {
			u_memcpy(e, p, end - p);
			e += end - p;
		}
		*e = 0;

		new_len = e - new_str;
		/* Trim the allocation down to what was actually written. */
		new_str = eurealloc(new_str, new_len + 1);
	}

done:
	if (_new_length) {
		*_new_length = new_len;
	}
	return new_str;
}
/* }}} */
/* {{{ php_u_str_to_str_case_ex */
PHPAPI UChar *php_u_str_to_str_case_ex(UChar *str, int str_len,
UChar *pat, int pat_len, UChar *repl, int repl_len, int *result_len, int *replace_count TSRMLS_DC)
{
UChar *str_fold, *pat_fold, *result;
int str_fold_len, pat_fold_len, alloc_len;
UChar *p, *found, *end;
int offset, found_len;
UErrorCode status = U_ZERO_ERROR;
zend_case_fold_string(&str_fold, &str_fold_len, str, str_len, U_FOLD_CASE_DEFAULT, &status);
if (str_fold_len == str_len) {
alloc_len = str_len;
result = eumalloc(alloc_len + 1);
*result_len = 0;
zend_case_fold_string(&pat_fold, &pat_fold_len, pat, pat_len, U_FOLD_CASE_DEFAULT, &status);
end = str_fold + str_fold_len;
for (p = str_fold;
(found = u_strFindFirst(p, end - p, pat_fold, pat_fold_len));
p = found + pat_fold_len) { /* we can increment by pattern length since the match in
the subject string is guaranteed to be of the same length */
if (*result_len + (found - p + repl_len) > alloc_len) {
alloc_len += (alloc_len >> 1); /* grow by 1.5x factor */
result = eurealloc(result, alloc_len + 1);
}
u_memcpy(result + *result_len, str + (p - str_fold), found - p);
*result_len += found - p;
u_memcpy(result + *result_len, repl, repl_len);
*result_len += repl_len;
if (replace_count) {
(*replace_count)++;
}
}
/* add remaining chunk, if any */
if (p < end) {
int chunk_len = end - p;
if (*result_len + chunk_len > alloc_len) {
alloc_len += *result_len + chunk_len - alloc_len;
result = eurealloc(result, alloc_len + 1);
}
u_memcpy(result + *result_len, str + (p - str_fold), chunk_len);
*result_len += chunk_len;
}
result[*result_len] = 0;
if (*result_len < alloc_len) {
result = eurealloc(result, *result_len + 1);
}
efree(pat_fold);
} else {
usearch_setText(UG(root_search), str, str_len, &status);
usearch_setPattern(UG(root_search), pat, pat_len, &status);
usearch_setOffset(UG(root_search), 0, &status);
alloc_len = str_len;
result = eumalloc(alloc_len + 1);
*result_len = 0;
p = str;
end = str + str_len;
for (offset = usearch_first(UG(root_search), &status);
offset != USEARCH_DONE;
offset = usearch_next(UG(root_search), &status)) {
found = str + offset;
/* matched length is not simply pattern length */
found_len = usearch_getMatchedLength(UG(root_search));
if (*result_len + (found - p + repl_len) > alloc_len) {
alloc_len += (alloc_len >> 1); /* grow by 1.5x factor */
result = eurealloc(result, alloc_len + 1);
}
u_memcpy(result + *result_len, p, found - p);
*result_len += found - p;
u_memcpy(result + *result_len, repl, repl_len);
*result_len += repl_len;
p = found + found_len;
if (replace_count) {
(*replace_count)++;
}
}
/* add remaining chunk, if any */
if (p < end) {
int chunk_len = end - p;
if (*result_len + chunk_len > alloc_len) {
alloc_len += *result_len + chunk_len - alloc_len;
result = eurealloc(result, alloc_len + 1);
}
u_memcpy(result + *result_len, p, chunk_len);
*result_len += chunk_len;
}
result[*result_len] = 0;
if (*result_len < alloc_len) {
result = eurealloc(result, *result_len + 1);
}
}
efree(str_fold);
return result;
}
/* }}} */
/* {{{ php_str_replace_in_subject
*/
static void php_str_replace_in_subject(zval *search, zval *replace, zval **subject, zval *result, int case_sensitivity, int *replace_count)
static void php_str_replace_in_subject(zval *search, zval *replace, zval **subject, zval *result, int case_sensitivity, int *replace_count TSRMLS_DC)
{
zval **search_entry,
**replace_entry = NULL,
temp_result;
char *replace_value = NULL;
zstr replace_value = NULL_ZSTR;
int replace_len = 0;
/* Make sure we're dealing with strings. */
convert_to_string_ex(subject);
Z_TYPE_P(result) = IS_STRING;
if (Z_STRLEN_PP(subject) == 0) {
ZVAL_STRINGL(result, "", 0, 1);
convert_to_text_ex(subject);
Z_TYPE_P(result) = ZEND_STR_TYPE;
if (Z_UNILEN_PP(subject) == 0) {
ZVAL_EMPTY_TEXT(result);
return;
}
@ -5108,16 +5361,16 @@ static void php_str_replace_in_subject(zval *search, zval *replace, zval **subje
zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
} else {
/* Set replacement value to the passed one */
replace_value = Z_STRVAL_P(replace);
replace_len = Z_STRLEN_P(replace);
replace_value = Z_UNIVAL_P(replace);
replace_len = Z_UNILEN_P(replace);
}
/* For each entry in the search array, get the entry */
while (zend_hash_get_current_data(Z_ARRVAL_P(search), (void **) &search_entry) == SUCCESS) {
/* Make sure we're dealing with strings. */
SEPARATE_ZVAL(search_entry);
convert_to_string(*search_entry);
if (Z_STRLEN_PP(search_entry) == 0) {
convert_to_text(*search_entry);
if (Z_UNILEN_PP(search_entry) == 0) {
zend_hash_move_forward(Z_ARRVAL_P(search));
if (Z_TYPE_P(replace) == IS_ARRAY) {
zend_hash_move_forward(Z_ARRVAL_P(replace));
@ -5130,59 +5383,110 @@ static void php_str_replace_in_subject(zval *search, zval *replace, zval **subje
/* Get current entry */
if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
/* Make sure we're dealing with strings. */
convert_to_string_ex(replace_entry);
SEPARATE_ZVAL(replace_entry);
convert_to_text(*replace_entry);
/* Set replacement value to the one we got from array */
replace_value = Z_STRVAL_PP(replace_entry);
replace_len = Z_STRLEN_PP(replace_entry);
replace_value = Z_UNIVAL_PP(replace_entry);
replace_len = Z_UNILEN_PP(replace_entry);
zend_hash_move_forward(Z_ARRVAL_P(replace));
} else {
/* We've run out of replacement strings, so use an empty one. */
replace_value = "";
replace_value = EMPTY_ZSTR;
replace_len = 0;
}
}
if (Z_STRLEN_PP(search_entry) == 1) {
php_char_to_str_ex(Z_STRVAL_P(result),
Z_STRLEN_P(result),
if (Z_UNILEN_PP(search_entry) == 1) {
if (UG(unicode)) {
if (case_sensitivity) {
php_u_char_to_str_ex(Z_USTRVAL_P(result), Z_USTRLEN_P(result),
Z_USTRVAL_PP(search_entry)[0],
replace_value.u, replace_len,
&temp_result, replace_count);
} else {
Z_USTRVAL(temp_result) = php_u_str_to_str_case_ex(Z_USTRVAL_P(result), Z_USTRLEN_P(result),
Z_USTRVAL_PP(search_entry), Z_USTRLEN_PP(search_entry),
replace_value.u, replace_len,
&Z_USTRLEN(temp_result), replace_count TSRMLS_CC);
}
} else {
php_char_to_str_ex(Z_STRVAL_P(result), Z_STRLEN_P(result),
Z_STRVAL_PP(search_entry)[0],
replace_value,
replace_len,
&temp_result,
case_sensitivity,
replace_count);
} else if (Z_STRLEN_PP(search_entry) > 1) {
replace_value.s, replace_len,
&temp_result, case_sensitivity, replace_count);
}
} else if (Z_UNILEN_PP(search_entry) > 1) {
if (UG(unicode)) {
if (case_sensitivity) {
Z_USTRVAL(temp_result) = php_u_str_to_str_ex(Z_USTRVAL_P(result), Z_USTRLEN_P(result),
Z_USTRVAL_PP(search_entry), Z_USTRLEN_PP(search_entry),
replace_value.u, replace_len,
&Z_USTRLEN(temp_result), replace_count);
} else {
Z_USTRVAL(temp_result) = php_u_str_to_str_case_ex(Z_USTRVAL_P(result), Z_USTRLEN_P(result),
Z_USTRVAL_PP(search_entry), Z_USTRLEN_PP(search_entry),
replace_value.u, replace_len,
&Z_USTRLEN(temp_result), replace_count TSRMLS_CC);
}
} else {
Z_STRVAL(temp_result) = php_str_to_str_ex(Z_STRVAL_P(result), Z_STRLEN_P(result),
Z_STRVAL_PP(search_entry), Z_STRLEN_PP(search_entry),
replace_value, replace_len, &Z_STRLEN(temp_result), case_sensitivity, replace_count);
replace_value.s, replace_len,
&Z_STRLEN(temp_result), case_sensitivity, replace_count);
}
}
efree(Z_STRVAL_P(result));
Z_STRVAL_P(result) = Z_STRVAL(temp_result);
Z_STRLEN_P(result) = Z_STRLEN(temp_result);
efree(Z_UNIVAL_P(result).v);
Z_UNIVAL_P(result) = Z_UNIVAL(temp_result);
Z_UNILEN_P(result) = Z_UNILEN(temp_result);
if (Z_STRLEN_P(result) == 0) {
if (Z_UNILEN_P(result) == 0) {
return;
}
zend_hash_move_forward(Z_ARRVAL_P(search));
}
} else {
if (Z_STRLEN_P(search) == 1) {
php_char_to_str_ex(Z_STRVAL_PP(subject),
Z_STRLEN_PP(subject),
if (Z_UNILEN_P(search) == 1) {
if (UG(unicode)) {
if (case_sensitivity) {
php_u_char_to_str_ex(Z_USTRVAL_PP(subject), Z_USTRLEN_PP(subject),
Z_USTRVAL_P(search)[0],
Z_USTRVAL_P(replace), Z_USTRLEN_P(replace),
result, replace_count);
} else {
Z_USTRVAL_P(result) = php_u_str_to_str_case_ex(Z_USTRVAL_PP(subject), Z_USTRLEN_PP(subject),
Z_USTRVAL_P(search), Z_USTRLEN_P(search),
Z_USTRVAL_P(replace), Z_USTRLEN_P(replace),
&Z_USTRLEN_P(result), replace_count TSRMLS_CC);
}
} else {
php_char_to_str_ex(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject),
Z_STRVAL_P(search)[0],
Z_STRVAL_P(replace),
Z_STRLEN_P(replace),
result,
case_sensitivity,
replace_count);
Z_STRVAL_P(replace), Z_STRLEN_P(replace),
result, case_sensitivity, replace_count);
}
} else if (Z_STRLEN_P(search) > 1) {
if (UG(unicode)) {
if (case_sensitivity) {
Z_USTRVAL_P(result) = php_u_str_to_str_ex(Z_USTRVAL_PP(subject), Z_USTRLEN_PP(subject),
Z_USTRVAL_P(search), Z_USTRLEN_P(search),
Z_USTRVAL_P(replace), Z_USTRLEN_P(replace),
&Z_USTRLEN_P(result), replace_count);
} else {
Z_USTRVAL_P(result) = php_u_str_to_str_case_ex(Z_USTRVAL_PP(subject), Z_USTRLEN_PP(subject),
Z_USTRVAL_P(search), Z_USTRLEN_P(search),
Z_USTRVAL_P(replace), Z_USTRLEN_P(replace),
&Z_USTRLEN_P(result), replace_count TSRMLS_CC);
}
} else {
Z_STRVAL_P(result) = php_str_to_str_ex(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject),
Z_STRVAL_P(search), Z_STRLEN_P(search),
Z_STRVAL_P(replace), Z_STRLEN_P(replace), &Z_STRLEN_P(result), case_sensitivity, replace_count);
Z_STRVAL_P(replace), Z_STRLEN_P(replace),
&Z_STRLEN_P(result), case_sensitivity, replace_count);
}
} else {
*result = **subject;
zval_copy_ctor(result);
@ -5196,7 +5500,7 @@ static void php_str_replace_in_subject(zval *search, zval *replace, zval **subje
*/
static void php_str_replace_common(INTERNAL_FUNCTION_PARAMETERS, int case_sensitivity)
{
zval **subject, **search, **replace, **subject_entry, **zcount;
zval *subject, *search, *replace, **subject_entry, *zcount;
zval *result;
zstr string_key;
uint string_key_len;
@ -5204,44 +5508,40 @@ static void php_str_replace_common(INTERNAL_FUNCTION_PARAMETERS, int case_sensit
int count = 0;
int argc = ZEND_NUM_ARGS();
if (argc < 3 || argc > 4 ||
zend_get_parameters_ex(argc, &search, &replace, &subject, &zcount) == FAILURE) {
WRONG_PARAM_COUNT;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z/z/z/|z", &search,
&replace, &subject, &zcount) == FAILURE) {
return;
}
SEPARATE_ZVAL(search);
SEPARATE_ZVAL(replace);
SEPARATE_ZVAL(subject);
/* Make sure we're dealing with strings and do the replacement. */
if (Z_TYPE_PP(search) != IS_ARRAY) {
convert_to_string_ex(search);
convert_to_string_ex(replace);
} else if (Z_TYPE_PP(replace) != IS_ARRAY) {
convert_to_string_ex(replace);
if (Z_TYPE_P(search) != IS_ARRAY) {
convert_to_text(search);
convert_to_text(replace);
} else if (Z_TYPE_P(replace) != IS_ARRAY) {
convert_to_text(replace);
}
/* if subject is an array */
if (Z_TYPE_PP(subject) == IS_ARRAY) {
if (Z_TYPE_P(subject) == IS_ARRAY) {
array_init(return_value);
zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
zend_hash_internal_pointer_reset(Z_ARRVAL_P(subject));
/* For each subject entry, convert it to string, then perform replacement
and add the result to the return_value array. */
while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
while (zend_hash_get_current_data(Z_ARRVAL_P(subject), (void **)&subject_entry) == SUCCESS) {
zend_uchar utype;
if (Z_TYPE_PP(subject_entry) != IS_ARRAY && Z_TYPE_PP(subject_entry) != IS_OBJECT) {
MAKE_STD_ZVAL(result);
SEPARATE_ZVAL(subject_entry);
php_str_replace_in_subject(*search, *replace, subject_entry, result, case_sensitivity, (argc > 3) ? &count : NULL);
php_str_replace_in_subject(search, replace, subject_entry, result, case_sensitivity, (argc > 3) ? &count : NULL TSRMLS_CC);
} else {
ALLOC_ZVAL(result);
ZVAL_ADDREF(*subject_entry);
COPY_PZVAL_TO_ZVAL(*result, *subject_entry);
}
/* Add to return array */
switch ((utype = zend_hash_get_current_key_ex(Z_ARRVAL_PP(subject), &string_key,
switch ((utype = zend_hash_get_current_key_ex(Z_ARRVAL_P(subject), &string_key,
&string_key_len, &num_key, 0, NULL))) {
case HASH_KEY_IS_STRING:
case HASH_KEY_IS_UNICODE:
@ -5253,19 +5553,19 @@ static void php_str_replace_common(INTERNAL_FUNCTION_PARAMETERS, int case_sensit
break;
}
zend_hash_move_forward(Z_ARRVAL_PP(subject));
zend_hash_move_forward(Z_ARRVAL_P(subject));
}
} else { /* if subject is not an array */
php_str_replace_in_subject(*search, *replace, subject, return_value, case_sensitivity, (argc > 3) ? &count : NULL);
php_str_replace_in_subject(search, replace, &subject, return_value, case_sensitivity, (argc > 3) ? &count : NULL TSRMLS_CC);
}
if (argc > 3) {
zval_dtor(*zcount);
ZVAL_LONG(*zcount, count);
zval_dtor(zcount);
ZVAL_LONG(zcount, count);
}
}
/* }}} */
/* {{{ proto mixed str_replace(mixed search, mixed replace, mixed subject [, int &replace_count])
/* {{{ proto mixed str_replace(mixed search, mixed replace, mixed subject [, int &replace_count]) U
Replaces all occurrences of search in haystack with replace */
PHP_FUNCTION(str_replace)
{
@ -5273,7 +5573,7 @@ PHP_FUNCTION(str_replace)
}
/* }}} */
/* {{{ proto mixed str_ireplace(mixed search, mixed replace, mixed subject [, int &replace_count])
/* {{{ proto mixed str_ireplace(mixed search, mixed replace, mixed subject [, int &replace_count]) U
Replaces all occurrences of search in haystack with replace / case-insensitive */
PHP_FUNCTION(str_ireplace)
{

View File

@ -26,29 +26,6 @@ ext/standard
sscanf()
Params API. Rest - no idea yet.
str_replace()
str_ireplace()
These are the problematic ones. There are a few approaches:
1. Case-fold both needle and haystack and then do a simple search.
2. Look at the implementation behind functions like
u_strcasecmp() and try to adapt it to a string search. The
implementation case-folds both strings incrementally. For
a search, one would want to case-fold the pattern beforehand,
but not the text in which you are searching.
3. Take the first character in the pattern and get the set of
all characters that have the same case folding (see the
UnicodeSet/USet API). Then search in the string for the
occurrence of any one of the set items (which include
strings!). Then do a case-insensitive comparison, allowing
a match that does not end with the end of the text.
The problematic cases are of course those ß->ss and similar.
All other approaches bite.
strnatcmp(), strnatcasecmp()
Params API. The rest depends on porting of strnatcmp.c
@ -145,6 +122,8 @@ ext/standard
similar_text()
str_pad()
str_repeat()
str_replace()
str_ireplace()
str_rot13()
str_shuffle()
str_split()