diff --git a/Doc/faq/programming.rst b/Doc/faq/programming.rst index ab5618db84f..6e1812504a1 100644 --- a/Doc/faq/programming.rst +++ b/Doc/faq/programming.rst @@ -924,12 +924,12 @@ module:: 'Hello, there!' >>> import array - >>> a = array.array('u', s) + >>> a = array.array('w', s) >>> print(a) - array('u', 'Hello, world') + array('w', 'Hello, world') >>> a[0] = 'y' >>> print(a) - array('u', 'yello, world') + array('w', 'yello, world') >>> a.tounicode() 'yello, world' diff --git a/Doc/library/array.rst b/Doc/library/array.rst index 1f8fec6ea55..1f5810b35d2 100644 --- a/Doc/library/array.rst +++ b/Doc/library/array.rst @@ -24,6 +24,8 @@ defined: +-----------+--------------------+-------------------+-----------------------+-------+ | ``'u'`` | wchar_t | Unicode character | 2 | \(1) | +-----------+--------------------+-------------------+-----------------------+-------+ +| ``'w'`` | Py_UCS4 | Unicode character | 4 | | ++-----------+--------------------+-------------------+-----------------------+-------+ | ``'h'`` | signed short | int | 2 | | +-----------+--------------------+-------------------+-----------------------+-------+ | ``'H'`` | unsigned short | int | 2 | | @@ -56,6 +58,7 @@ Notes: ``Py_UNICODE`` is alias of ``wchar_t`` since Python 3.3. .. deprecated-removed:: 3.3 4.0 + Please migrate to ``'w'`` typecode. The actual representation of values is determined by the machine architecture @@ -174,9 +177,9 @@ The module defines the following type: .. method:: fromunicode(s) - Extends this array with data from the given unicode string. The array must - be a type ``'u'`` array; otherwise a :exc:`ValueError` is raised. Use - ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an + Extends this array with data from the given unicode string. + The array must have type code ``'u'`` or ``'w'``; otherwise a :exc:`ValueError` is raised. + Use ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an array of some other type. 
@@ -236,21 +239,22 @@ The module defines the following type: .. method:: tounicode() - Convert the array to a unicode string. The array must be a type ``'u'`` array; + Convert the array to a unicode string. The array must have type code ``'u'`` or ``'w'``; otherwise a :exc:`ValueError` is raised. Use ``array.tobytes().decode(enc)`` to obtain a unicode string from an array of some other type. When an array object is printed or converted to a string, it is represented as ``array(typecode, initializer)``. The *initializer* is omitted if the array is -empty, otherwise it is a string if the *typecode* is ``'u'``, otherwise it is a -list of numbers. The string is guaranteed to be able to be converted back to an +empty, otherwise it is a string if the *typecode* is ``'u'`` or ``'w'``, +otherwise it is a list of numbers. +The string is guaranteed to be able to be converted back to an array with the same type and value using :func:`eval`, so long as the :class:`~array.array` class has been imported using ``from array import array``. Examples:: array('l') - array('u', 'hello \u2641') + array('w', 'hello \u2641') array('l', [1, 2, 3, 4, 5]) array('d', [1.0, 2.0, 3.14]) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 04ac34e0926..ff7772ef1ff 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -87,6 +87,13 @@ New Modules Improved Modules ================ +array +----- + +* Add ``'w'`` type code that can be used for Unicode strings. + It can be used instead of the ``'u'`` type code, which is deprecated. + (Contributed by Inada Naoki in :gh:`80480`.)
+ io -- diff --git a/Lib/test/test_array.py b/Lib/test/test_array.py index 5b2c107a604..a94d04f6515 100755 --- a/Lib/test/test_array.py +++ b/Lib/test/test_array.py @@ -27,7 +27,7 @@ class ArraySubclassWithKwargs(array.array): def __init__(self, typecode, newarg=None): array.array.__init__(self) -typecodes = 'ubBhHiIlLfdqQ' +typecodes = 'uwbBhHiIlLfdqQ' class MiscTest(unittest.TestCase): @@ -186,11 +186,12 @@ class ArrayReconstructorTest(unittest.TestCase): ) for testcase in testcases: mformat_code, encoding = testcase - a = array.array('u', teststr) - b = array_reconstructor( - array.array, 'u', mformat_code, teststr.encode(encoding)) - self.assertEqual(a, b, - msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase)) + for c in 'uw': + a = array.array(c, teststr) + b = array_reconstructor( + array.array, c, mformat_code, teststr.encode(encoding)) + self.assertEqual(a, b, + msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase)) class BaseTest: @@ -234,7 +235,7 @@ class BaseTest: self.assertEqual(bi[1], len(a)) def test_byteswap(self): - if self.typecode == 'u': + if self.typecode in ('u', 'w'): example = '\U00100100' else: example = self.example @@ -1079,7 +1080,7 @@ class BaseTest: self.assertEqual(m.tobytes(), expected) self.assertRaises(BufferError, a.frombytes, a.tobytes()) self.assertEqual(m.tobytes(), expected) - if self.typecode == 'u': + if self.typecode in ('u', 'w'): self.assertRaises(BufferError, a.fromunicode, a.tounicode()) self.assertEqual(m.tobytes(), expected) self.assertRaises(BufferError, operator.imul, a, 2) @@ -1135,16 +1136,17 @@ class BaseTest: support.check_sizeof(self, a, basesize) def test_initialize_with_unicode(self): - if self.typecode != 'u': + if self.typecode not in ('u', 'w'): with self.assertRaises(TypeError) as cm: a = array.array(self.typecode, 'foo') self.assertIn("cannot use a str", str(cm.exception)) with self.assertRaises(TypeError) as cm: - a = array.array(self.typecode, array.array('u', 'foo')) + a = 
array.array(self.typecode, array.array('w', 'foo')) self.assertIn("cannot use a unicode array", str(cm.exception)) else: a = array.array(self.typecode, "foo") a = array.array(self.typecode, array.array('u', 'foo')) + a = array.array(self.typecode, array.array('w', 'foo')) @support.cpython_only def test_obsolete_write_lock(self): @@ -1171,40 +1173,45 @@ class UnicodeTest(StringTest, unittest.TestCase): smallerexample = '\x01\u263a\x00\ufefe' biggerexample = '\x01\u263a\x01\ufeff' outside = str('\x33') - minitemsize = 2 + minitemsize = sizeof_wchar def test_unicode(self): self.assertRaises(TypeError, array.array, 'b', 'foo') - a = array.array('u', '\xa0\xc2\u1234') + a = array.array(self.typecode, '\xa0\xc2\u1234') a.fromunicode(' ') a.fromunicode('') a.fromunicode('') a.fromunicode('\x11abc\xff\u1234') s = a.tounicode() self.assertEqual(s, '\xa0\xc2\u1234 \x11abc\xff\u1234') - self.assertEqual(a.itemsize, sizeof_wchar) + self.assertEqual(a.itemsize, self.minitemsize) s = '\x00="\'a\\b\x80\xff\u0000\u0001\u1234' - a = array.array('u', s) + a = array.array(self.typecode, s) self.assertEqual( repr(a), - "array('u', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')") + f"array('{self.typecode}', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')") self.assertRaises(TypeError, a.fromunicode) def test_issue17223(self): - # this used to crash - if sizeof_wchar == 4: - # U+FFFFFFFF is an invalid code point in Unicode 6.0 - invalid_str = b'\xff\xff\xff\xff' - else: + if self.typecode == 'u' and sizeof_wchar == 2: # PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t self.skipTest("specific to 32-bit wchar_t") - a = array.array('u', invalid_str) + + # this used to crash + # U+FFFFFFFF is an invalid code point in Unicode 6.0 + invalid_str = b'\xff\xff\xff\xff' + + a = array.array(self.typecode, invalid_str) self.assertRaises(ValueError, a.tounicode) self.assertRaises(ValueError, str, a) +class UCS4Test(UnicodeTest): + typecode = 'w' + minitemsize = 4 + class NumberTest(BaseTest): 
def test_extslice(self): diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index de7ac97d72c..6a4180e6d1b 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -955,7 +955,7 @@ class TestArrayWrites(unittest.TestCase): def test_char_write(self): import array, string - a = array.array('u', string.ascii_letters) + a = array.array('w', string.ascii_letters) with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj: writer = csv.writer(fileobj, dialect="excel") diff --git a/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst b/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst new file mode 100644 index 00000000000..fd87efe9bde --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst @@ -0,0 +1 @@ +:mod:`array`: Add ``'w'`` typecode that represents ``Py_UCS4``. diff --git a/Modules/arraymodule.c b/Modules/arraymodule.c index 999b848f9ad..16e3739eb26 100644 --- a/Modules/arraymodule.c +++ b/Modules/arraymodule.c @@ -13,6 +13,7 @@ #include "pycore_bytesobject.h" // _PyBytes_Repeat #include "structmember.h" // PyMemberDef #include <stddef.h> // offsetof() +#include <stdbool.h> /*[clinic input] module array @@ -279,6 +280,31 @@ u_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v) return 0; } +static PyObject * +w_getitem(arrayobject *ap, Py_ssize_t i) +{ + return PyUnicode_FromOrdinal(((Py_UCS4 *) ap->ob_item)[i]); +} + +static int +w_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v) +{ + PyObject *u; + if (!PyArg_Parse(v, "U;array item must be unicode character", &u)) { + return -1; + } + + if (PyUnicode_GetLength(u) != 1) { + PyErr_SetString(PyExc_TypeError, + "array item must be unicode character"); + return -1; + } + + if (i >= 0) { + ((Py_UCS4 *)ap->ob_item)[i] = PyUnicode_READ_CHAR(u, 0); + } + return 0; +} static PyObject * h_getitem(arrayobject *ap, Py_ssize_t i) @@ -543,6 +569,7 @@ d_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v) DEFINE_COMPAREITEMS(b, signed char)
DEFINE_COMPAREITEMS(BB, unsigned char) DEFINE_COMPAREITEMS(u, wchar_t) +DEFINE_COMPAREITEMS(w, Py_UCS4) DEFINE_COMPAREITEMS(h, short) DEFINE_COMPAREITEMS(HH, unsigned short) DEFINE_COMPAREITEMS(i, int) @@ -561,6 +588,7 @@ static const struct arraydescr descriptors[] = { {'b', 1, b_getitem, b_setitem, b_compareitems, "b", 1, 1}, {'B', 1, BB_getitem, BB_setitem, BB_compareitems, "B", 1, 0}, {'u', sizeof(wchar_t), u_getitem, u_setitem, u_compareitems, "u", 0, 0}, + {'w', sizeof(Py_UCS4), w_getitem, w_setitem, w_compareitems, "w", 0, 0,}, {'h', sizeof(short), h_getitem, h_setitem, h_compareitems, "h", 1, 1}, {'H', sizeof(short), HH_getitem, HH_setitem, HH_compareitems, "H", 1, 0}, {'i', sizeof(int), i_getitem, i_setitem, i_compareitems, "i", 1, 1}, @@ -1716,25 +1744,46 @@ static PyObject * array_array_fromunicode_impl(arrayobject *self, PyObject *ustr) /*[clinic end generated code: output=24359f5e001a7f2b input=025db1fdade7a4ce]*/ { - if (self->ob_descr->typecode != 'u') { + int typecode = self->ob_descr->typecode; + if (typecode != 'u' && typecode != 'w') { PyErr_SetString(PyExc_ValueError, "fromunicode() may only be called on " - "unicode type arrays"); + "unicode type arrays ('u' or 'w')"); return NULL; } - Py_ssize_t ustr_length = PyUnicode_AsWideChar(ustr, NULL, 0); - assert(ustr_length > 0); - if (ustr_length > 1) { - ustr_length--; /* trim trailing NUL character */ + if (typecode == 'u') { + Py_ssize_t ustr_length = PyUnicode_AsWideChar(ustr, NULL, 0); + assert(ustr_length > 0); + if (ustr_length > 1) { + ustr_length--; /* trim trailing NUL character */ + Py_ssize_t old_size = Py_SIZE(self); + if (array_resize(self, old_size + ustr_length) == -1) { + return NULL; + } + + // must not fail + PyUnicode_AsWideChar( + ustr, ((wchar_t *)self->ob_item) + old_size, ustr_length); + } + } + else { // typecode == 'w' + Py_ssize_t ustr_length = PyUnicode_GetLength(ustr); Py_ssize_t old_size = Py_SIZE(self); - if (array_resize(self, old_size + ustr_length) == -1) { + 
Py_ssize_t new_size = old_size + ustr_length; + + if (new_size < 0 || (size_t)new_size > PY_SSIZE_T_MAX / sizeof(Py_UCS4)) { + return PyErr_NoMemory(); + } + if (array_resize(self, new_size) == -1) { return NULL; } // must not fail - PyUnicode_AsWideChar( - ustr, ((wchar_t *)self->ob_item) + old_size, ustr_length); + Py_UCS4 *u = PyUnicode_AsUCS4(ustr, ((Py_UCS4*)self->ob_item) + old_size, + ustr_length, 0); + assert(u != NULL); + (void)u; // Suppress unused_variable warning. } Py_RETURN_NONE; @@ -1754,12 +1803,20 @@ static PyObject * array_array_tounicode_impl(arrayobject *self) /*[clinic end generated code: output=08e442378336e1ef input=127242eebe70b66d]*/ { - if (self->ob_descr->typecode != 'u') { + int typecode = self->ob_descr->typecode; + if (typecode != 'u' && typecode != 'w') { PyErr_SetString(PyExc_ValueError, - "tounicode() may only be called on unicode type arrays"); + "tounicode() may only be called on unicode type arrays ('u' or 'w')"); return NULL; } - return PyUnicode_FromWideChar((wchar_t *) self->ob_item, Py_SIZE(self)); + if (typecode == 'u') { + return PyUnicode_FromWideChar((wchar_t *) self->ob_item, Py_SIZE(self)); + } + else { // typecode == 'w' + int byteorder = 0; // native byteorder + return PyUnicode_DecodeUTF32((const char *) self->ob_item, Py_SIZE(self) * 4, + NULL, &byteorder); + } } /*[clinic input] @@ -1838,6 +1895,9 @@ typecode_to_mformat_code(char typecode) } return UNKNOWN_FORMAT; + case 'w': + return UTF32_LE + is_big_endian; + case 'f': if (sizeof(float) == 4) { const float y = 16711938.0; @@ -2314,7 +2374,7 @@ array_repr(arrayobject *a) return PyUnicode_FromFormat("%s('%c')", _PyType_Name(Py_TYPE(a)), (int)typecode); } - if (typecode == 'u') { + if (typecode == 'u' || typecode == 'w') { v = array_array_tounicode_impl(a); } else { v = array_array_tolist_impl(a); @@ -2619,17 +2679,21 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return NULL; } - if (initial && c != 'u') { + bool is_unicode = c == 'u' || c == 
'w'; + + if (initial && !is_unicode) { if (PyUnicode_Check(initial)) { PyErr_Format(PyExc_TypeError, "cannot use a str to initialize " "an array with typecode '%c'", c); return NULL; } - else if (array_Check(initial, state) && - ((arrayobject*)initial)->ob_descr->typecode == 'u') { - PyErr_Format(PyExc_TypeError, "cannot use a unicode array to " - "initialize an array with typecode '%c'", c); - return NULL; + else if (array_Check(initial, state)) { + int ic = ((arrayobject*)initial)->ob_descr->typecode; + if (ic == 'u' || ic == 'w') { + PyErr_Format(PyExc_TypeError, "cannot use a unicode array to " + "initialize an array with typecode '%c'", c); + return NULL; + } } } @@ -2637,7 +2701,7 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds) || PyByteArray_Check(initial) || PyBytes_Check(initial) || PyTuple_Check(initial) - || ((c=='u') && PyUnicode_Check(initial)) + || (is_unicode && PyUnicode_Check(initial)) || (array_Check(initial, state) && c == ((arrayobject*)initial)->ob_descr->typecode))) { it = PyObject_GetIter(initial); @@ -2697,14 +2761,31 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds) Py_DECREF(v); } else if (initial != NULL && PyUnicode_Check(initial)) { - Py_ssize_t n; - wchar_t *ustr = PyUnicode_AsWideCharString(initial, &n); - if (ustr == NULL) { - Py_DECREF(a); - return NULL; - } + if (c == 'u') { + Py_ssize_t n; + wchar_t *ustr = PyUnicode_AsWideCharString(initial, &n); + if (ustr == NULL) { + Py_DECREF(a); + return NULL; + } + + if (n > 0) { + arrayobject *self = (arrayobject *)a; + // self->ob_item may be NULL but it is safe. + PyMem_Free(self->ob_item); + self->ob_item = (char *)ustr; + Py_SET_SIZE(self, n); + self->allocated = n; + } + } + else { // c == 'w' + Py_ssize_t n = PyUnicode_GET_LENGTH(initial); + Py_UCS4 *ustr = PyUnicode_AsUCS4Copy(initial); + if (ustr == NULL) { + Py_DECREF(a); + return NULL; + } - if (n > 0) { arrayobject *self = (arrayobject *)a; // self->ob_item may be NULL but it is safe. 
PyMem_Free(self->ob_item);