mirror of
https://github.com/python/cpython.git
synced 2024-11-24 18:34:43 +08:00
Issue #6561: '\d' in a regular expression should match only Unicode
character category [Nd], not [No].
This commit is contained in:
parent
6bd13fbbc8
commit
1f268285ff
@ -338,11 +338,12 @@ the second character. For example, ``\$`` matches the character ``'$'``.
|
||||
|
||||
``\d``
|
||||
For Unicode (str) patterns:
|
||||
Matches any Unicode digit (which includes ``[0-9]``, and also many
|
||||
other digit characters). If the :const:`ASCII` flag is used, only
|
||||
``[0-9]`` is matched (but the flag affects the entire regular
|
||||
expression, so in such cases using an explicit ``[0-9]`` may be a
|
||||
better choice).
|
||||
Matches any Unicode decimal digit (that is, any character in
|
||||
Unicode character category [Nd]). This includes ``[0-9]``, and
|
||||
also many other digit characters. If the :const:`ASCII` flag is
|
||||
used, only ``[0-9]`` is matched (but the flag affects the entire
|
||||
regular expression, so in such cases using an explicit ``[0-9]``
|
||||
may be a better choice).
|
||||
For 8-bit (bytes) patterns:
|
||||
Matches any decimal digit; this is equivalent to ``[0-9]``.
|
||||
|
||||
|
@ -605,6 +605,27 @@ class ReTests(unittest.TestCase):
|
||||
self.assertEqual(next(iter).span(), (4, 4))
|
||||
self.assertRaises(StopIteration, next, iter)
|
||||
|
||||
def test_bug_6561(self):
|
||||
# '\d' should match characters in Unicode category 'Nd'
|
||||
# (Number, Decimal Digit), but not those in 'Nl' (Number,
|
||||
# Letter) or 'No' (Number, Other).
|
||||
decimal_digits = [
|
||||
'\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
|
||||
'\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
|
||||
'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
|
||||
]
|
||||
for x in decimal_digits:
|
||||
self.assertEqual(re.match('^\d$', x).group(0), x)
|
||||
|
||||
not_decimal_digits = [
|
||||
'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
|
||||
'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
|
||||
'\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
|
||||
'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
|
||||
]
|
||||
for x in not_decimal_digits:
|
||||
self.assertIsNone(re.match('^\d$', x))
|
||||
|
||||
def test_empty_array(self):
|
||||
# SF buf 1647541
|
||||
import array
|
||||
|
@ -108,6 +108,10 @@ Library
|
||||
Extension Modules
|
||||
-----------------
|
||||
|
||||
- Issue #6561: '\d' in a regex now matches only characters with
|
||||
Unicode category 'Nd' (Number, Decimal Digit). Previously it also
|
||||
matched characters with category 'No'.
|
||||
|
||||
- Issue #4509: Array objects are no longer modified after an operation
|
||||
failing due to the resize restriction in-place when the object has exported
|
||||
buffers.
|
||||
|
@ -168,7 +168,7 @@ static unsigned int sre_lower_locale(unsigned int ch)
|
||||
|
||||
#if defined(HAVE_UNICODE)
|
||||
|
||||
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
|
||||
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
|
||||
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
|
||||
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
|
||||
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
|
||||
|
Loading…
Reference in New Issue
Block a user