Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates (previously it raised an exception, which caused round-trips to fail). Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for marshalling Unicode objects, so we had better make sure it works for all Unicode code points, including isolated surrogates). Bumped the PYC magic in a non-standard way -- please review. This was needed because the old PYC format used illegal UTF-8 sequences for isolated high surrogates, which now raise an exception.
2024-11-24 02:15:30 +08:00 · 2002-02-07 11:33:49 +00:00 · 2002-02-07 11:33:49 +00:00 · bd3be8f0ca
commit bd3be8f0ca
parent 9273ec726c
4 changed files with 71 additions and 31 deletions
--- a/Lib/test/output/test_unicodedata
+++ b/Lib/test/output/test_unicodedata
@ -1,5 +1,5 @@
 test_unicodedata
 Testing Unicode Database...
-Methods: 6c7a7c02657b69d0fdd7a7d174f573194bba2e18
+Methods: 84b72943b1d4320bc1e64a4888f7cdf62eea219a
 Functions: 41e1d4792185d6474a43c83ce4f593b1bdb01f8a
 API: ok
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -23,21 +23,23 @@ if not sys.platform.startswith('java'):
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'") == '''u"'"''')
    verify(repr(u'"') == """u'"'""")
-    verify(repr(u''.join(map(unichr, range(256)))) ==
-       "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
-       "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
-       "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
-       "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
-       "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
-       "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
-       "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
-       "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
-       "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
-       "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
-       "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
-       "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
-       "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
-       "\\xfe\\xff'")
+    latin1repr = (
+        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
+        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
+        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
+        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
+        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
+        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
+        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
+        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
+        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
+        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
+        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
+        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
+        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
+        "\\xfe\\xff'")
+    testrepr = repr(u''.join(map(unichr, range(256))))
+    verify(testrepr == latin1repr)

 def test(method, input, output, *args):
    if verbose:
@ -495,6 +497,7 @@ else:
 verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')

 # UTF-8 specific encoding tests:
+verify(u''.encode('utf-8') == '')
 verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
 verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
 verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
@ -552,14 +555,7 @@ for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)

-# Roundtrip safety for non-BMP (just a few chars)
-u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
-for encoding in ('utf-8',
-                 'utf-16', 'utf-16-le', 'utf-16-be',
-                 #'raw_unicode_escape',
-                 'unicode_escape', 'unicode_internal'):
-    verify(unicode(u.encode(encoding),encoding) == u)
-
+# Roundtrip safety for BMP (just the first 256 chars)
 u = u''.join(map(unichr, range(256)))
 for encoding in (
    'latin-1',
@ -571,6 +567,7 @@ for encoding in (
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

+# Roundtrip safety for BMP (just the first 128 chars)
 u = u''.join(map(unichr, range(128)))
 for encoding in (
    'ascii',
@ -582,6 +579,19 @@ for encoding in (
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

+# Roundtrip safety for non-BMP (just a few chars)
+u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
+for encoding in ('utf-8',
+                 'utf-16', 'utf-16-le', 'utf-16-be',
+                 #'raw_unicode_escape',
+                 'unicode_escape', 'unicode_internal'):
+    verify(unicode(u.encode(encoding),encoding) == u)
+
+# UTF-8 must be roundtrip safe for all UCS-2 code points
+u = u''.join(map(unichr, range(0x10000)))
+for encoding in ('utf-8',):
+    verify(unicode(u.encode(encoding),encoding) == u)
+
 print 'done.'

 print 'Testing standard mapping codecs...',
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1065,12 +1065,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
 		goto utf8Error;
 	    }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
-            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
+            if (ch < 0x0800) {
+		/* Note: UTF-8 encodings of surrogates are considered
+		   legal UTF-8 sequences; 
+
+		   XXX For wide builds (UCS-4) we should probably try
+		       to recombine the surrogates into a single code
+		       unit.
+		*/
                errmsg = "illegal encoding";
 		goto utf8Error;
 	    }
 	    else
-				*p++ = (Py_UNICODE)ch;
+		*p++ = (Py_UNICODE)ch;
            break;

        case 4:
@ -1084,9 +1091,9 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
-                                       byte encoding */
+					 byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
-                                       UTF-16 */
+					 UTF-16 */
 	    {
                errmsg = "illegal encoding";
 		goto utf8Error;
@ -1175,11 +1182,15 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
    unsigned int cbWritten = 0;
    int i = 0;

+    /* Short-cut for empty strings */
+    if (size == 0)
+	return PyString_FromStringAndSize(NULL, 0);
+
+    /* We allocate 4 more bytes to have room for at least one full
+       UTF-8 sequence; saves a few cycles in the loop below */
    v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
    if (v == NULL)
        return NULL;
-    if (size == 0)
-        return v;

    p = PyString_AS_STRING(v);
    while (i < size) {
--- a/Python/import.c
+++ b/Python/import.c
@ -41,8 +41,27 @@ extern time_t PyOS_GetLastModificationTime(char *, FILE *);
       the Unicode -U option is in use.  IMO (Tim's), that's a Bad Idea
       (quite apart from that the -U option doesn't work so isn't used
       anyway).
+
+   XXX MAL, 2002-02-07: I had to modify the MAGIC due to a fix of the
+       UTF-8 encoder (it previously produced invalid UTF-8 for unpaired
+       high surrogates), so I simply bumped the month value to 20 (invalid
+       month) and set the day to 1.  This should be recognizable by any
+       algorithm relying on the above scheme. Perhaps we should simply
+       start counting in increments of 10 from now on ?!
+
+   Known values:
+       Python 1.5:   20121
+       Python 1.5.1: 20121
+       Python 1.5.2: 20121
+       Python 2.0:   50823
+       Python 2.0.1: 50823
+       Python 2.1:   60202
+       Python 2.1.1: 60202
+       Python 2.1.2: 60202
+       Python 2.2:   60717
+       Python 2.3a0: 62001
 */
-#define MAGIC (60717 | ((long)'\r'<<16) | ((long)'\n'<<24))
+#define MAGIC (62001 | ((long)'\r'<<16) | ((long)'\n'<<24))

 /* Magic word as global; note that _PyImport_Init() can change the
   value of this global to accommodate for alterations of how the