Issue #10557: Fixed error messages from float() and other numeric

types. Added a new API function, PyUnicode_TransformDecimalToASCII(), which transforms non-ASCII decimal digits in a Unicode string to their ASCII equivalents.
2024-11-23 18:04:37 +08:00 · 2010-12-04 03:38:46 +00:00 · 2010-12-04 03:38:46 +00:00 · 942af5a9a4
commit 942af5a9a4
parent 36526bf3d9
11 changed files with 169 additions and 52 deletions
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@ -328,6 +328,13 @@ APIs:
   Identical to :c:func:`PyUnicode_FromFormat` except that it takes exactly two
   arguments.

+.. c:function:: PyObject* PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, Py_ssize_t size)
+
+   Create a Unicode object by replacing all decimal digits in
+   :c:type:`Py_UNICODE` buffer of the given size by ASCII digits 0--9
+   according to their decimal value.  Return *NULL* if an exception
+   occurs.
+

 .. c:function:: Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode)

--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -1225,6 +1225,17 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
    );
 #endif

+/* Transforms code points that have decimal digit property to the
+   corresponding ASCII digit code points.
+
+   Returns a new Unicode string on success, NULL on failure.
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
+    Py_UNICODE *s,              /* Unicode buffer */
+    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
+    );
+
 /* --- File system encoding ---------------------------------------------- */

 /* ParseTuple converter: encode str objects to bytes using
--- a/Lib/test/test_complex.py
+++ b/Lib/test/test_complex.py
@ -220,6 +220,7 @@ class ComplexTest(unittest.TestCase):
        self.assertEqual(complex(NS(1+10j)), 1+10j)
        self.assertRaises(TypeError, complex, OS(None))
        self.assertRaises(TypeError, complex, NS(None))
+        self.assertRaises(TypeError, complex, {})

        self.assertAlmostEqual(complex("1+10j"), 1+10j)
        self.assertAlmostEqual(complex(10), 10+0j)
@ -325,6 +326,8 @@ class ComplexTest(unittest.TestCase):

        # check that complex accepts long unicode strings
        self.assertEqual(type(complex("1"*500)), complex)
+        # check whitespace processing
+        self.assertEqual(complex('\N{EM SPACE}(\N{EN SPACE}1+1j ) '), 1+1j)

        class EvilExc(Exception):
            pass
--- a/Lib/test/test_float.py
+++ b/Lib/test/test_float.py
@ -43,14 +43,30 @@ class GeneralFloatCases(unittest.TestCase):
        self.assertRaises(ValueError, float, "+.inf")
        self.assertRaises(ValueError, float, ".")
        self.assertRaises(ValueError, float, "-.")
+        self.assertRaises(ValueError, float, b"-")
+        self.assertRaises(TypeError, float, {})
+        # Lone surrogate
+        self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
        # check that we don't accept alternate exponent markers
        self.assertRaises(ValueError, float, "-1.7d29")
        self.assertRaises(ValueError, float, "3D-14")
-        self.assertEqual(float(b"  \u0663.\u0661\u0664  ".decode('raw-unicode-escape')), 3.14)
+        self.assertEqual(float("  \u0663.\u0661\u0664  "), 3.14)
+        self.assertEqual(float("\N{EM SPACE}3.14\N{EN SPACE}"), 3.14)
        # extra long strings should not be a problem
        float(b'.' + b'1'*1000)
        float('.' + '1'*1000)

+    def test_error_message(self):
+        testlist = ('\xbd', '123\xbd', '  123 456  ')
+        for s in testlist:
+            try:
+                float(s)
+            except ValueError as e:
+                self.assertIn(s.strip(), e.args[0])
+            else:
+                self.fail("Expected int(%r) to raise a ValueError", s)
+
+
    @support.run_with_locale('LC_NUMERIC', 'fr_FR', 'de_DE')
    def test_float_with_comma(self):
        # set locale to something that doesn't use '.' for the decimal point
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@ -20,7 +20,8 @@ L = [
        ('  1\02  ', ValueError),
        ('', ValueError),
        (' ', ValueError),
-        ('  \t\t  ', ValueError)
+        ('  \t\t  ', ValueError),
+        ("\u0200", ValueError)
 ]

 class IntTestCases(unittest.TestCase):
@ -35,6 +36,8 @@ class IntTestCases(unittest.TestCase):
        self.assertEqual(int(3.5), 3)
        self.assertEqual(int(-3.5), -3)
        self.assertEqual(int("-3"), -3)
+        self.assertEqual(int(" -3 "), -3)
+        self.assertEqual(int("\N{EM SPACE}-3\N{EN SPACE}"), -3)
        # Different base:
        self.assertEqual(int("10",16), 16)
        # Test conversion from strings and various anomalies
@ -302,6 +305,16 @@ class IntTestCases(unittest.TestCase):
                    self.fail("Failed to raise TypeError with %s" %
                              ((base, trunc_result_base),))

+    def test_error_message(self):
+        testlist = ('\xbd', '123\xbd', '  123 456  ')
+        for s in testlist:
+            try:
+                int(s)
+            except ValueError as e:
+                self.assertIn(s.strip(), e.args[0])
+            else:
+                self.fail("Expected int(%r) to raise a ValueError", s)
+
 def test_main():
    run_unittest(IntTestCases)

--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -1168,8 +1168,13 @@ class UnicodeTest(string_tests.CommonTest,
        # Error handling (wrong arguments)
        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

-        # Error handling (PyUnicode_EncodeDecimal())
-        self.assertRaises(UnicodeError, int, "\u0200")
+        # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
+        self.assertRaises(UnicodeError, int, "\ud800")
+        self.assertRaises(UnicodeError, int, "\udf00")
+        self.assertRaises(UnicodeError, float, "\ud800")
+        self.assertRaises(UnicodeError, float, "\udf00")
+        self.assertRaises(UnicodeError, complex, "\ud800")
+        self.assertRaises(UnicodeError, complex, "\udf00")

    def test_codecs(self):
        # Encoding
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -222,6 +222,10 @@ Library
 C-API
 -----

+- Issue #10557: Added a new API function, PyUnicode_TransformDecimalToASCII(),
+  which transforms non-ASCII decimal digits in a Unicode string to their
+  ASCII equivalents. 
+
 - Issue #9518: Extend the PyModuleDef_HEAD_INIT macro to explicitly
  zero-initialize all fields, fixing compiler warnings seen when building
  extension modules with gcc with "-Wmissing-field-initializers" (implied by
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@ -766,20 +766,26 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
    char *end;
    double x=0.0, y=0.0, z;
    int got_bracket=0;
-    char *s_buffer = NULL;
+    PyObject *s_buffer = NULL;
    Py_ssize_t len;

    if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v) + 1);
+        Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
+        Py_UNICODE *bufptr;
+        s_buffer = PyUnicode_TransformDecimalToASCII(
+            PyUnicode_AS_UNICODE(v), buflen);
        if (s_buffer == NULL)
-            return PyErr_NoMemory();
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
-                                    PyUnicode_GET_SIZE(v),
-                                    s_buffer,
-                                    NULL))
+            return NULL;
+        /* Replace non-ASCII whitespace with ' ' */
+        bufptr = PyUnicode_AS_UNICODE(s_buffer);
+        for (i = 0; i < buflen; i++) {
+            Py_UNICODE ch = bufptr[i];
+            if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+                bufptr[i] = ' ';
+        }
+        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+        if (s == NULL)
            goto error;
-        s = s_buffer;
-        len = strlen(s);
    }
    else if (PyObject_AsCharBuffer(v, &s, &len)) {
        PyErr_SetString(PyExc_TypeError,
@ -894,16 +900,14 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
    if (s-start != len)
        goto parse_error;

-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
    return complex_subtype_from_doubles(type, x, y);

  parse_error:
    PyErr_SetString(PyExc_ValueError,
                    "complex() arg is a malformed string");
  error:
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
    return NULL;
 }

--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@ -174,22 +174,30 @@ PyFloat_FromString(PyObject *v)
 {
    const char *s, *last, *end;
    double x;
-    char buffer[256]; /* for errors */
-    char *s_buffer = NULL;
+    PyObject *s_buffer = NULL;
    Py_ssize_t len;
    PyObject *result = NULL;

    if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1);
+        Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
+        Py_UNICODE *bufptr;
+        s_buffer = PyUnicode_TransformDecimalToASCII(
+            PyUnicode_AS_UNICODE(v), buflen);
        if (s_buffer == NULL)
-            return PyErr_NoMemory();
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
-                                    PyUnicode_GET_SIZE(v),
-                                    s_buffer,
-                                    NULL))
-            goto error;
-        s = s_buffer;
-        len = strlen(s);
+            return NULL;
+        /* Replace non-ASCII whitespace with ' ' */
+        bufptr = PyUnicode_AS_UNICODE(s_buffer);
+        for (i = 0; i < buflen; i++) {
+            Py_UNICODE ch = bufptr[i];
+            if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+                bufptr[i] = ' ';
+        }
+        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
+        if (s == NULL) {
+            Py_DECREF(s_buffer);
+            return NULL;
+        }
+        last = s + len;
    }
    else if (PyObject_AsCharBuffer(v, &s, &len)) {
        PyErr_SetString(PyExc_TypeError,
@ -197,29 +205,27 @@ PyFloat_FromString(PyObject *v)
        return NULL;
    }
    last = s + len;
-
-    while (Py_ISSPACE(*s))
+    /* strip space */
+    while (s < last && Py_ISSPACE(*s))
        s++;
+    while (s < last - 1 && Py_ISSPACE(last[-1]))
+        last--;
    /* We don't care about overflow or underflow.  If the platform
     * supports them, infinities and signed zeroes (on underflow) are
     * fine. */
    x = PyOS_string_to_double(s, (char **)&end, NULL);
-    if (x == -1.0 && PyErr_Occurred())
-        goto error;
-    while (Py_ISSPACE(*end))
-        end++;
-    if (end == last)
-        result = PyFloat_FromDouble(x);
-    else {
-        PyOS_snprintf(buffer, sizeof(buffer),
-                      "invalid literal for float(): %.200s", s);
-        PyErr_SetString(PyExc_ValueError, buffer);
+    if (end != last) {
+        PyErr_Format(PyExc_ValueError,
+                     "could not convert string to float: "
+                     "%R", v);
        result = NULL;
    }
+    else if (x == -1.0 && PyErr_Occurred())
+        result = NULL;
+    else
+        result = PyFloat_FromDouble(x);

-  error:
-    if (s_buffer)
-        PyMem_FREE(s_buffer);
+    Py_XDECREF(s_buffer);
    return result;
 }

--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@ -2133,17 +2133,34 @@ PyObject *
 PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base)
 {
    PyObject *result;
-    char *buffer = (char *)PyMem_MALLOC(length+1);
+    PyObject *asciidig;
+    char *buffer, *end;
+    Py_ssize_t i, buflen;
+    Py_UNICODE *ptr;

-    if (buffer == NULL)
+    asciidig = PyUnicode_TransformDecimalToASCII(u, length);
+    if (asciidig == NULL)
        return NULL;
-
-    if (PyUnicode_EncodeDecimal(u, length, buffer, NULL)) {
-        PyMem_FREE(buffer);
+    /* Replace non-ASCII whitespace with ' ' */
+    ptr = PyUnicode_AS_UNICODE(asciidig);
+    for (i = 0; i < length; i++) {
+      Py_UNICODE ch = ptr[i];
+      if (ch > 127 && Py_UNICODE_ISSPACE(ch))
+        ptr[i] = ' ';
+    }
+    buffer = _PyUnicode_AsStringAndSize(asciidig, &buflen);
+    if (buffer == NULL) {
+        Py_DECREF(asciidig);
        return NULL;
    }
-    result = PyLong_FromString(buffer, NULL, base);
-    PyMem_FREE(buffer);
+    result = PyLong_FromString(buffer, &end, base);
+    if (result != NULL && end != buffer + buflen) {
+        PyErr_SetString(PyExc_ValueError,
+                        "null byte in argument for int()");
+        Py_DECREF(result);
+        result = NULL;
+    }
+    Py_DECREF(asciidig);
    return result;
 }

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6206,6 +6206,30 @@ PyObject *PyUnicode_Translate(PyObject *str,
    return NULL;
 }

+PyObject *
+PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
+                                  Py_ssize_t length)
+{
+    PyObject *result;
+    Py_UNICODE *p; /* write pointer into result */
+    Py_ssize_t i;
+    /* Copy to a new string */
+    result = (PyObject *)_PyUnicode_New(length);
+    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
+    if (result == NULL)
+        return result;
+    p = PyUnicode_AS_UNICODE(result);
+    /* Iterate over code points */
+    for (i = 0; i < length; i++) {
+        Py_UNICODE ch =s[i];
+        if (ch > 127) {
+            int decimal = Py_UNICODE_TODECIMAL(ch);
+            if (decimal >= 0)
+                p[i] = '0' + decimal;
+        }
+    }
+    return result;
+}
 /* --- Decimal Encoder ---------------------------------------------------- */

 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
@ -8967,6 +8991,13 @@ unicode_freelistsize(PyUnicodeObject *self)
 {
    return PyLong_FromLong(numfree);
 }
+
+static PyObject *
+unicode__decimal2ascii(PyObject *self)
+{
+    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
+                                             PyUnicode_GET_SIZE(self));
+}
 #endif

 PyDoc_STRVAR(startswith__doc__,
@ -9108,7 +9139,6 @@ unicode_getnewargs(PyUnicodeObject *v)
    return Py_BuildValue("(u#)", v->str, v->length);
 }

-
 static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
@ -9170,8 +9200,9 @@ static PyMethodDef unicode_methods[] = {
 #endif

 #if 0
-    /* This one is just used for debugging the implementation. */
+    /* These methods are just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
+    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
 #endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},