gh-80480: array: Add 'w' typecode. (#105242)

2025-01-20 15:34:52 +08:00 · 2023-06-05 01:45:00 +09:00 · 2023-06-05 01:45:00 +09:00 · 1237fb6a4b
commit 1237fb6a4b
parent 5a5ed7a3e6
7 changed files with 159 additions and 59 deletions
--- a/Doc/faq/programming.rst
+++ b/Doc/faq/programming.rst
@ -924,12 +924,12 @@ module::
   'Hello, there!'

   >>> import array
-   >>> a = array.array('u', s)
+   >>> a = array.array('w', s)
   >>> print(a)
-   array('u', 'Hello, world')
+   array('w', 'Hello, world')
   >>> a[0] = 'y'
   >>> print(a)
-   array('u', 'yello, world')
+   array('w', 'yello, world')
   >>> a.tounicode()
   'yello, world'

--- a/Doc/library/array.rst
+++ b/Doc/library/array.rst
@ -24,6 +24,8 @@ defined:
 +-----------+--------------------+-------------------+-----------------------+-------+
 | ``'u'``   | wchar_t            | Unicode character | 2                     | \(1)  |
 +-----------+--------------------+-------------------+-----------------------+-------+
+| ``'w'``   | Py_UCS4            | Unicode character | 4                     |       |
+-----------+--------------------+-------------------+-----------------------+-------+
 | ``'h'``   | signed short       | int               | 2                     |       |
 +-----------+--------------------+-------------------+-----------------------+-------+
 | ``'H'``   | unsigned short     | int               | 2                     |       |
@ -56,6 +58,7 @@ Notes:
      ``Py_UNICODE`` is an alias of ``wchar_t`` since Python 3.3.

   .. deprecated-removed:: 3.3 4.0
+      Please migrate to the ``'w'`` typecode.


 The actual representation of values is determined by the machine architecture
@ -174,9 +177,9 @@ The module defines the following type:

   .. method:: fromunicode(s)

-      Extends this array with data from the given unicode string.  The array must
-      be a type ``'u'`` array; otherwise a :exc:`ValueError` is raised.  Use
-      ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
+      Extends this array with data from the given unicode string.
+      The array must have type code ``'u'`` or ``'w'``; otherwise a :exc:`ValueError` is raised.
+      Use ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
      array of some other type.


@ -236,21 +239,22 @@ The module defines the following type:

   .. method:: tounicode()

-      Convert the array to a unicode string.  The array must be a type ``'u'`` array;
+      Convert the array to a unicode string.  The array must have type code ``'u'`` or ``'w'``;
      otherwise a :exc:`ValueError` is raised. Use ``array.tobytes().decode(enc)`` to
      obtain a unicode string from an array of some other type.


 When an array object is printed or converted to a string, it is represented as
 ``array(typecode, initializer)``.  The *initializer* is omitted if the array is
-empty, otherwise it is a string if the *typecode* is ``'u'``, otherwise it is a
-list of numbers.  The string is guaranteed to be able to be converted back to an
+empty, otherwise it is a string if the *typecode* is ``'u'`` or ``'w'``,
+otherwise it is a list of numbers.
+The string is guaranteed to be able to be converted back to an
 array with the same type and value using :func:`eval`, so long as the
 :class:`~array.array` class has been imported using ``from array import array``.
 Examples::

   array('l')
-   array('u', 'hello \u2641')
+   array('w', 'hello \u2641')
   array('l', [1, 2, 3, 4, 5])
   array('d', [1.0, 2.0, 3.14])

--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@ -87,6 +87,13 @@ New Modules
 Improved Modules
 ================

+array
+-----
+
+* Add the ``'w'`` type code, which can be used for Unicode strings.
+  It can be used instead of the ``'u'`` type code, which is deprecated.
+  (Contributed by Inada Naoki in :gh:`80480`.)
+
 io
 --

--- a/Lib/test/test_array.py
+++ b/Lib/test/test_array.py
@ -27,7 +27,7 @@ class ArraySubclassWithKwargs(array.array):
    def __init__(self, typecode, newarg=None):
        array.array.__init__(self)

-typecodes = 'ubBhHiIlLfdqQ'
+typecodes = 'uwbBhHiIlLfdqQ'

 class MiscTest(unittest.TestCase):

@ -186,9 +186,10 @@ class ArrayReconstructorTest(unittest.TestCase):
        )
        for testcase in testcases:
            mformat_code, encoding = testcase
-            a = array.array('u', teststr)
+            for c in 'uw':
+                a = array.array(c, teststr)
                b = array_reconstructor(
-                array.array, 'u', mformat_code, teststr.encode(encoding))
+                    array.array, c, mformat_code, teststr.encode(encoding))
                self.assertEqual(a, b,
                    msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))

@ -234,7 +235,7 @@ class BaseTest:
        self.assertEqual(bi[1], len(a))

    def test_byteswap(self):
-        if self.typecode == 'u':
+        if self.typecode in ('u', 'w'):
            example = '\U00100100'
        else:
            example = self.example
@ -1079,7 +1080,7 @@ class BaseTest:
        self.assertEqual(m.tobytes(), expected)
        self.assertRaises(BufferError, a.frombytes, a.tobytes())
        self.assertEqual(m.tobytes(), expected)
-        if self.typecode == 'u':
+        if self.typecode in ('u', 'w'):
            self.assertRaises(BufferError, a.fromunicode, a.tounicode())
            self.assertEqual(m.tobytes(), expected)
        self.assertRaises(BufferError, operator.imul, a, 2)
@ -1135,16 +1136,17 @@ class BaseTest:
        support.check_sizeof(self, a, basesize)

    def test_initialize_with_unicode(self):
-        if self.typecode != 'u':
+        if self.typecode not in ('u', 'w'):
            with self.assertRaises(TypeError) as cm:
                a = array.array(self.typecode, 'foo')
            self.assertIn("cannot use a str", str(cm.exception))
            with self.assertRaises(TypeError) as cm:
-                a = array.array(self.typecode, array.array('u', 'foo'))
+                a = array.array(self.typecode, array.array('w', 'foo'))
            self.assertIn("cannot use a unicode array", str(cm.exception))
        else:
            a = array.array(self.typecode, "foo")
            a = array.array(self.typecode, array.array('u', 'foo'))
+            a = array.array(self.typecode, array.array('w', 'foo'))

    @support.cpython_only
    def test_obsolete_write_lock(self):
@ -1171,40 +1173,45 @@ class UnicodeTest(StringTest, unittest.TestCase):
    smallerexample = '\x01\u263a\x00\ufefe'
    biggerexample = '\x01\u263a\x01\ufeff'
    outside = str('\x33')
-    minitemsize = 2
+    minitemsize = sizeof_wchar

    def test_unicode(self):
        self.assertRaises(TypeError, array.array, 'b', 'foo')

-        a = array.array('u', '\xa0\xc2\u1234')
+        a = array.array(self.typecode, '\xa0\xc2\u1234')
        a.fromunicode(' ')
        a.fromunicode('')
        a.fromunicode('')
        a.fromunicode('\x11abc\xff\u1234')
        s = a.tounicode()
        self.assertEqual(s, '\xa0\xc2\u1234 \x11abc\xff\u1234')
-        self.assertEqual(a.itemsize, sizeof_wchar)
+        self.assertEqual(a.itemsize, self.minitemsize)

        s = '\x00="\'a\\b\x80\xff\u0000\u0001\u1234'
-        a = array.array('u', s)
+        a = array.array(self.typecode, s)
        self.assertEqual(
            repr(a),
-            "array('u', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
+            f"array('{self.typecode}', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")

        self.assertRaises(TypeError, a.fromunicode)

    def test_issue17223(self):
-        # this used to crash
-        if sizeof_wchar == 4:
-            # U+FFFFFFFF is an invalid code point in Unicode 6.0
-            invalid_str = b'\xff\xff\xff\xff'
-        else:
+        if self.typecode == 'u' and sizeof_wchar == 2:
            # PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t
            self.skipTest("specific to 32-bit wchar_t")
-        a = array.array('u', invalid_str)
+
+        # this used to crash
+        # U+FFFFFFFF is an invalid code point in Unicode 6.0
+        invalid_str = b'\xff\xff\xff\xff'
+
+        a = array.array(self.typecode, invalid_str)
        self.assertRaises(ValueError, a.tounicode)
        self.assertRaises(ValueError, str, a)

+class UCS4Test(UnicodeTest):
+    typecode = 'w'
+    minitemsize = 4
+
 class NumberTest(BaseTest):

    def test_extslice(self):
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@ -955,7 +955,7 @@ class TestArrayWrites(unittest.TestCase):

    def test_char_write(self):
        import array, string
-        a = array.array('u', string.ascii_letters)
+        a = array.array('w', string.ascii_letters)

        with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj:
            writer = csv.writer(fileobj, dialect="excel")
--- a/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst
+++ b/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst
@ -0,0 +1 @@
+:mod:`array`: Add ``'w'`` typecode that represents ``Py_UCS4``.
--- a/Modules/arraymodule.c
+++ b/Modules/arraymodule.c
@ -13,6 +13,7 @@
 #include "pycore_bytesobject.h"   // _PyBytes_Repeat
 #include "structmember.h"         // PyMemberDef
 #include <stddef.h>               // offsetof()
+#include <stdbool.h>

 /*[clinic input]
 module array
@ -279,6 +280,31 @@ u_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
    return 0;
 }

+static PyObject *
+w_getitem(arrayobject *ap, Py_ssize_t i)
+{
+    return PyUnicode_FromOrdinal(((Py_UCS4 *) ap->ob_item)[i]);
+}
+
+static int
+w_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
+{
+    PyObject *u;
+    if (!PyArg_Parse(v, "U;array item must be unicode character", &u)) {
+        return -1;
+    }
+
+    if (PyUnicode_GetLength(u) != 1) {
+        PyErr_SetString(PyExc_TypeError,
+                        "array item must be unicode character");
+        return -1;
+    }
+
+    if (i >= 0) {
+        ((Py_UCS4 *)ap->ob_item)[i] = PyUnicode_READ_CHAR(u, 0);
+    }
+    return 0;
+}

 static PyObject *
 h_getitem(arrayobject *ap, Py_ssize_t i)
@ -543,6 +569,7 @@ d_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
 DEFINE_COMPAREITEMS(b, signed char)
 DEFINE_COMPAREITEMS(BB, unsigned char)
 DEFINE_COMPAREITEMS(u, wchar_t)
+DEFINE_COMPAREITEMS(w, Py_UCS4)
 DEFINE_COMPAREITEMS(h, short)
 DEFINE_COMPAREITEMS(HH, unsigned short)
 DEFINE_COMPAREITEMS(i, int)
@ -561,6 +588,7 @@ static const struct arraydescr descriptors[] = {
    {'b', 1, b_getitem, b_setitem, b_compareitems, "b", 1, 1},
    {'B', 1, BB_getitem, BB_setitem, BB_compareitems, "B", 1, 0},
    {'u', sizeof(wchar_t), u_getitem, u_setitem, u_compareitems, "u", 0, 0},
+    {'w', sizeof(Py_UCS4), w_getitem, w_setitem, w_compareitems, "w", 0, 0,},
    {'h', sizeof(short), h_getitem, h_setitem, h_compareitems, "h", 1, 1},
    {'H', sizeof(short), HH_getitem, HH_setitem, HH_compareitems, "H", 1, 0},
    {'i', sizeof(int), i_getitem, i_setitem, i_compareitems, "i", 1, 1},
@ -1716,13 +1744,15 @@ static PyObject *
 array_array_fromunicode_impl(arrayobject *self, PyObject *ustr)
 /*[clinic end generated code: output=24359f5e001a7f2b input=025db1fdade7a4ce]*/
 {
-    if (self->ob_descr->typecode != 'u') {
+    int typecode = self->ob_descr->typecode;
+    if (typecode != 'u' && typecode != 'w') {
        PyErr_SetString(PyExc_ValueError,
            "fromunicode() may only be called on "
-            "unicode type arrays");
+            "unicode type arrays ('u' or 'w')");
        return NULL;
    }

+    if (typecode == 'u') {
        Py_ssize_t ustr_length = PyUnicode_AsWideChar(ustr, NULL, 0);
        assert(ustr_length > 0);
        if (ustr_length > 1) {
@ -1736,6 +1766,25 @@ array_array_fromunicode_impl(arrayobject *self, PyObject *ustr)
            PyUnicode_AsWideChar(
                ustr, ((wchar_t *)self->ob_item) + old_size, ustr_length);
        }
+    }
+    else { // typecode == 'w'
+        Py_ssize_t ustr_length = PyUnicode_GetLength(ustr);
+        Py_ssize_t old_size = Py_SIZE(self);
+        Py_ssize_t new_size = old_size + ustr_length;
+
+        if (new_size < 0 || (size_t)new_size > PY_SSIZE_T_MAX / sizeof(Py_UCS4)) {
+            return PyErr_NoMemory();
+        }
+        if (array_resize(self, new_size) == -1) {
+            return NULL;
+        }
+
+        // must not fail
+        Py_UCS4 *u = PyUnicode_AsUCS4(ustr, ((Py_UCS4*)self->ob_item) + old_size,
+                                      ustr_length, 0);
+        assert(u != NULL);
+        (void)u; // Suppress unused_variable warning.
+    }

    Py_RETURN_NONE;
 }
@ -1754,12 +1803,20 @@ static PyObject *
 array_array_tounicode_impl(arrayobject *self)
 /*[clinic end generated code: output=08e442378336e1ef input=127242eebe70b66d]*/
 {
-    if (self->ob_descr->typecode != 'u') {
+    int typecode = self->ob_descr->typecode;
+    if (typecode != 'u' && typecode != 'w') {
        PyErr_SetString(PyExc_ValueError,
-             "tounicode() may only be called on unicode type arrays");
+             "tounicode() may only be called on unicode type arrays ('u' or 'w')");
        return NULL;
    }
+    if (typecode == 'u') {
        return PyUnicode_FromWideChar((wchar_t *) self->ob_item, Py_SIZE(self));
+    }
+    else { // typecode == 'w'
+        int byteorder = 0; // native byteorder
+        return PyUnicode_DecodeUTF32((const char *) self->ob_item, Py_SIZE(self) * 4,
+                                     NULL, &byteorder);
+    }
 }

 /*[clinic input]
@ -1838,6 +1895,9 @@ typecode_to_mformat_code(char typecode)
        }
        return UNKNOWN_FORMAT;

+    case 'w':
+        return UTF32_LE + is_big_endian;
+
    case 'f':
        if (sizeof(float) == 4) {
            const float y = 16711938.0;
@ -2314,7 +2374,7 @@ array_repr(arrayobject *a)
        return PyUnicode_FromFormat("%s('%c')",
                                    _PyType_Name(Py_TYPE(a)), (int)typecode);
    }
-    if (typecode == 'u') {
+    if (typecode == 'u' || typecode == 'w') {
        v = array_array_tounicode_impl(a);
    } else {
        v = array_array_tolist_impl(a);
@ -2619,25 +2679,29 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
        return NULL;
    }

-    if (initial && c != 'u') {
+    bool is_unicode = c == 'u' || c == 'w';
+
+    if (initial && !is_unicode) {
        if (PyUnicode_Check(initial)) {
            PyErr_Format(PyExc_TypeError, "cannot use a str to initialize "
                         "an array with typecode '%c'", c);
            return NULL;
        }
-        else if (array_Check(initial, state) &&
-                 ((arrayobject*)initial)->ob_descr->typecode == 'u') {
+        else if (array_Check(initial, state)) {
+            int ic = ((arrayobject*)initial)->ob_descr->typecode;
+            if (ic == 'u' || ic == 'w') {
                PyErr_Format(PyExc_TypeError, "cannot use a unicode array to "
                            "initialize an array with typecode '%c'", c);
                return NULL;
            }
        }
+    }

    if (!(initial == NULL || PyList_Check(initial)
          || PyByteArray_Check(initial)
          || PyBytes_Check(initial)
          || PyTuple_Check(initial)
-          || ((c=='u') && PyUnicode_Check(initial))
+          || (is_unicode && PyUnicode_Check(initial))
          || (array_Check(initial, state)
              && c == ((arrayobject*)initial)->ob_descr->typecode))) {
        it = PyObject_GetIter(initial);
@ -2697,6 +2761,7 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
                Py_DECREF(v);
            }
            else if (initial != NULL && PyUnicode_Check(initial))  {
+                if (c == 'u') {
                    Py_ssize_t n;
                    wchar_t *ustr = PyUnicode_AsWideCharString(initial, &n);
                    if (ustr == NULL) {
@ -2713,6 +2778,22 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
                        self->allocated = n;
                    }
                }
+                else { // c == 'w'
+                    Py_ssize_t n = PyUnicode_GET_LENGTH(initial);
+                    Py_UCS4 *ustr = PyUnicode_AsUCS4Copy(initial);
+                    if (ustr == NULL) {
+                        Py_DECREF(a);
+                        return NULL;
+                    }
+
+                    arrayobject *self = (arrayobject *)a;
+                    // self->ob_item may be NULL but it is safe.
+                    PyMem_Free(self->ob_item);
+                    self->ob_item = (char *)ustr;
+                    Py_SET_SIZE(self, n);
+                    self->allocated = n;
+                }
+            }
            else if (initial != NULL && array_Check(initial, state) && len > 0) {
                arrayobject *self = (arrayobject *)a;
                arrayobject *other = (arrayobject *)initial;
				`@ -0,0 +1 @@`
				:mod:`array`: Add ``'w'`` typecode that represents ``Py_UCS4``.