Merge branch 'pull-request/1974' into PHP-5.6

* pull-request/1974:
  Fix #68447: grapheme_extract take an extra trailing character
This commit is contained in:
Stanislav Malyshev 2016-11-27 15:11:17 -08:00
commit 8856b3a63c
2 changed files with 61 additions and 54 deletions

View File

@ -702,8 +702,10 @@ PHP_FUNCTION(grapheme_stristr)
static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
{
int pos = 0, prev_pos = 0;
int ret_pos = 0, prev_ret_pos = 0;
int pos = 0;
int ret_pos = 0;
int break_pos, prev_break_pos;
int count = 0;
while ( 1 ) {
pos = ubrk_next(bi);
@ -712,23 +714,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
break;
}
for ( break_pos = ret_pos; break_pos < pos; ) {
count++;
prev_break_pos = break_pos;
U8_FWD_1(pstr, break_pos, str_len);
if ( prev_break_pos == break_pos ) {
/* something wrong - malformed utf8? */
csize = 0;
break;
}
}
/* if we are beyond our limit, then the loop is done */
if ( pos > csize ) {
if ( count > csize ) {
break;
}
/* update our pointer in the original UTF-8 buffer by as many characters
as ubrk_next iterated over */
prev_ret_pos = ret_pos;
U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
if ( prev_ret_pos == ret_pos ) {
/* something wrong - malformed utf8? */
break;
}
prev_pos = pos;
ret_pos = break_pos;
}
return ret_pos;
@ -739,8 +742,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
{
int pos = 0, prev_pos = 0;
int ret_pos = 0, prev_ret_pos = 0;
int pos = 0;
int ret_pos = 0;
while ( 1 ) {
pos = ubrk_next(bi);
@ -749,20 +752,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
break;
}
prev_ret_pos = ret_pos;
U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
if ( ret_pos > bsize ) {
ret_pos = prev_ret_pos;
if ( pos > bsize ) {
break;
}
if ( prev_ret_pos == ret_pos ) {
/* something wrong - malformed utf8? */
break;
}
prev_pos = pos;
ret_pos = pos;
}
return ret_pos;
@ -773,7 +767,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
static inline int32_t
grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
{
int pos = 0, next_pos = 0;
int next_pos = 0;
int ret_pos = 0;
while ( size ) {
@ -782,16 +776,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst
if ( UBRK_DONE == next_pos ) {
break;
}
pos = next_pos;
ret_pos = next_pos;
size--;
}
/* pos is one past the last UChar - and represent the number of code units to
advance in the utf-8 buffer
*/
U8_FWD_N(pstr, ret_pos, str_len, pos);
return ret_pos;
}
/* }}} */
@ -810,11 +798,11 @@ static grapheme_extract_iter grapheme_extract_iters[] = {
Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)
{
unsigned char *str, *pstr;
UChar *ustr;
int str_len, ustr_len;
long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
long lstart = 0; /* starting position in str in bytes */
char *str, *pstr;
UText ut = UTEXT_INITIALIZER;
size_t str_len;
zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
zend_long lstart = 0; /* starting position in str in bytes */
int32_t start = 0;
long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
UErrorCode status;
@ -900,21 +888,15 @@ PHP_FUNCTION(grapheme_extract)
RETURN_STRINGL(((char *)pstr), nsize, 1);
}
/* convert the strings to UTF-16. */
ustr = NULL;
ustr_len = 0;
status = U_ZERO_ERROR;
intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
utext_openUTF8(&ut, pstr, str_len, &status);
if ( U_FAILURE( status ) ) {
/* Set global error code. */
intl_error_set_code( NULL, status TSRMLS_CC );
/* Set error messages. */
intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
if ( NULL != ustr )
efree( ustr );
intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 TSRMLS_CC );
RETURN_FALSE;
}
@ -923,8 +905,7 @@ PHP_FUNCTION(grapheme_extract)
status = U_ZERO_ERROR;
bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
ubrk_setText(bi, ustr, ustr_len, &status);
ubrk_setUText(bi, &ut, &status);
/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
can't back up. So, we will not do anything. */
@ -932,9 +913,7 @@ PHP_FUNCTION(grapheme_extract)
ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
if (ustr) {
efree(ustr);
}
utext_close(&ut);
ubrk_close(bi);
if ( NULL != next ) {

View File

@ -0,0 +1,28 @@
--TEST--
Bug #68447: grapheme_extract take an extra trailing character
--SKIPIF--
<?php if( !extension_loaded( 'intl' ) ) print 'skip'; ?>
--FILE--
<?php
$katsushikaku = "葛󠄁飾区";
echo grapheme_extract($katsushikaku, 1) . "\n";
$haiyore = "這󠄀いよれ";
echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_COUNT) . "\n";
echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_COUNT) . "\n";
echo grapheme_extract($haiyore, 6, GRAPHEME_EXTR_MAXBYTES) . "\n";
echo grapheme_extract($haiyore, 9, GRAPHEME_EXTR_MAXBYTES) . "\n";
echo grapheme_extract($haiyore, 12, GRAPHEME_EXTR_MAXBYTES) . "\n";
echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_MAXCHARS) . "\n";
echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_MAXCHARS) . "\n";
echo grapheme_extract($haiyore, 3, GRAPHEME_EXTR_MAXCHARS) . "\n";
--EXPECT--
葛󠄁
這󠄀
這󠄀い
這󠄀
這󠄀い
這󠄀
這󠄀い