mirror of
https://github.com/php/php-src.git
synced 2024-12-04 23:34:25 +08:00
Merge branch 'pull-request/1974' into PHP-5.6
* pull-request/1974: Fix #68447: grapheme_extract take an extra trailing character
This commit is contained in:
commit
8856b3a63c
@ -702,8 +702,10 @@ PHP_FUNCTION(grapheme_stristr)
|
||||
static inline int32_t
|
||||
grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
|
||||
{
|
||||
int pos = 0, prev_pos = 0;
|
||||
int ret_pos = 0, prev_ret_pos = 0;
|
||||
int pos = 0;
|
||||
int ret_pos = 0;
|
||||
int break_pos, prev_break_pos;
|
||||
int count = 0;
|
||||
|
||||
while ( 1 ) {
|
||||
pos = ubrk_next(bi);
|
||||
@ -712,23 +714,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
|
||||
break;
|
||||
}
|
||||
|
||||
for ( break_pos = ret_pos; break_pos < pos; ) {
|
||||
count++;
|
||||
prev_break_pos = break_pos;
|
||||
U8_FWD_1(pstr, break_pos, str_len);
|
||||
|
||||
if ( prev_break_pos == break_pos ) {
|
||||
/* something wrong - malformed utf8? */
|
||||
csize = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* if we are beyond our limit, then the loop is done */
|
||||
if ( pos > csize ) {
|
||||
if ( count > csize ) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* update our pointer in the original UTF-8 buffer by as many characters
|
||||
as ubrk_next iterated over */
|
||||
|
||||
prev_ret_pos = ret_pos;
|
||||
U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
|
||||
|
||||
if ( prev_ret_pos == ret_pos ) {
|
||||
/* something wrong - malformed utf8? */
|
||||
break;
|
||||
}
|
||||
|
||||
prev_pos = pos;
|
||||
ret_pos = break_pos;
|
||||
}
|
||||
|
||||
return ret_pos;
|
||||
@ -739,8 +742,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
|
||||
static inline int32_t
|
||||
grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
|
||||
{
|
||||
int pos = 0, prev_pos = 0;
|
||||
int ret_pos = 0, prev_ret_pos = 0;
|
||||
int pos = 0;
|
||||
int ret_pos = 0;
|
||||
|
||||
while ( 1 ) {
|
||||
pos = ubrk_next(bi);
|
||||
@ -749,20 +752,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
|
||||
break;
|
||||
}
|
||||
|
||||
prev_ret_pos = ret_pos;
|
||||
U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
|
||||
|
||||
if ( ret_pos > bsize ) {
|
||||
ret_pos = prev_ret_pos;
|
||||
if ( pos > bsize ) {
|
||||
break;
|
||||
}
|
||||
|
||||
if ( prev_ret_pos == ret_pos ) {
|
||||
/* something wrong - malformed utf8? */
|
||||
break;
|
||||
}
|
||||
|
||||
prev_pos = pos;
|
||||
ret_pos = pos;
|
||||
}
|
||||
|
||||
return ret_pos;
|
||||
@ -773,7 +767,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
|
||||
static inline int32_t
|
||||
grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
|
||||
{
|
||||
int pos = 0, next_pos = 0;
|
||||
int next_pos = 0;
|
||||
int ret_pos = 0;
|
||||
|
||||
while ( size ) {
|
||||
@ -782,16 +776,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst
|
||||
if ( UBRK_DONE == next_pos ) {
|
||||
break;
|
||||
}
|
||||
pos = next_pos;
|
||||
ret_pos = next_pos;
|
||||
size--;
|
||||
}
|
||||
|
||||
/* pos is one past the last UChar - and represent the number of code units to
|
||||
advance in the utf-8 buffer
|
||||
*/
|
||||
|
||||
U8_FWD_N(pstr, ret_pos, str_len, pos);
|
||||
|
||||
return ret_pos;
|
||||
}
|
||||
/* }}} */
|
||||
@ -810,11 +798,11 @@ static grapheme_extract_iter grapheme_extract_iters[] = {
|
||||
Function to extract a sequence of default grapheme clusters */
|
||||
PHP_FUNCTION(grapheme_extract)
|
||||
{
|
||||
unsigned char *str, *pstr;
|
||||
UChar *ustr;
|
||||
int str_len, ustr_len;
|
||||
long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
|
||||
long lstart = 0; /* starting position in str in bytes */
|
||||
char *str, *pstr;
|
||||
UText ut = UTEXT_INITIALIZER;
|
||||
size_t str_len;
|
||||
zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
|
||||
zend_long lstart = 0; /* starting position in str in bytes */
|
||||
int32_t start = 0;
|
||||
long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
|
||||
UErrorCode status;
|
||||
@ -900,21 +888,15 @@ PHP_FUNCTION(grapheme_extract)
|
||||
RETURN_STRINGL(((char *)pstr), nsize, 1);
|
||||
}
|
||||
|
||||
/* convert the strings to UTF-16. */
|
||||
ustr = NULL;
|
||||
ustr_len = 0;
|
||||
status = U_ZERO_ERROR;
|
||||
intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
|
||||
utext_openUTF8(&ut, pstr, str_len, &status);
|
||||
|
||||
if ( U_FAILURE( status ) ) {
|
||||
/* Set global error code. */
|
||||
intl_error_set_code( NULL, status TSRMLS_CC );
|
||||
|
||||
/* Set error messages. */
|
||||
intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
|
||||
|
||||
if ( NULL != ustr )
|
||||
efree( ustr );
|
||||
intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 TSRMLS_CC );
|
||||
|
||||
RETURN_FALSE;
|
||||
}
|
||||
@ -923,8 +905,7 @@ PHP_FUNCTION(grapheme_extract)
|
||||
status = U_ZERO_ERROR;
|
||||
bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
|
||||
|
||||
ubrk_setText(bi, ustr, ustr_len, &status);
|
||||
|
||||
ubrk_setUText(bi, &ut, &status);
|
||||
/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
|
||||
can't back up. So, we will not do anything. */
|
||||
|
||||
@ -932,9 +913,7 @@ PHP_FUNCTION(grapheme_extract)
|
||||
|
||||
ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
|
||||
|
||||
if (ustr) {
|
||||
efree(ustr);
|
||||
}
|
||||
utext_close(&ut);
|
||||
ubrk_close(bi);
|
||||
|
||||
if ( NULL != next ) {
|
||||
|
28
ext/intl/tests/bug68447.phpt
Normal file
28
ext/intl/tests/bug68447.phpt
Normal file
@ -0,0 +1,28 @@
|
||||
--TEST--
|
||||
Bug #68447: grapheme_extract take an extra trailing character
|
||||
--SKIPIF--
|
||||
<?php if( !extension_loaded( 'intl' ) ) print 'skip'; ?>
|
||||
--FILE--
|
||||
<?php
|
||||
$katsushikaku = "葛󠄁飾区";
|
||||
echo grapheme_extract($katsushikaku, 1) . "\n";
|
||||
|
||||
$haiyore = "這󠄀いよれ";
|
||||
echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_COUNT) . "\n";
|
||||
echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_COUNT) . "\n";
|
||||
echo grapheme_extract($haiyore, 6, GRAPHEME_EXTR_MAXBYTES) . "\n";
|
||||
echo grapheme_extract($haiyore, 9, GRAPHEME_EXTR_MAXBYTES) . "\n";
|
||||
echo grapheme_extract($haiyore, 12, GRAPHEME_EXTR_MAXBYTES) . "\n";
|
||||
echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_MAXCHARS) . "\n";
|
||||
echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_MAXCHARS) . "\n";
|
||||
echo grapheme_extract($haiyore, 3, GRAPHEME_EXTR_MAXCHARS) . "\n";
|
||||
--EXPECT--
|
||||
葛󠄁
|
||||
這󠄀
|
||||
這󠄀い
|
||||
|
||||
這󠄀
|
||||
這󠄀い
|
||||
|
||||
這󠄀
|
||||
這󠄀い
|
Loading…
Reference in New Issue
Block a user