Issue #19619: Blacklist non-text codecs in method API

str.encode, bytes.decode and bytearray.decode now use an internal API to throw LookupError for known non-text encodings, rather than attempting the encoding or decoding operation and then throwing a TypeError for an unexpected output type. The latter mechanism remains in place for third party non-text encodings.
2024-11-23 18:04:37 +08:00 · 2013-11-22 22:39:36 +10:00 · 2013-11-22 22:39:36 +10:00 · c72e4e6dcc
commit c72e4e6dcc
parent 322f5ba0d8
13 changed files with 291 additions and 93 deletions
--- a/Include/codecs.h
+++ b/Include/codecs.h
@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
       const char *errors
       );
 #ifndef PY_LIMITED_API
 /* Text codec specific encoding and decoding API.
   Checks the encoding against a list of codecs which do not
   implement a str<->bytes encoding before attempting the
   operation.
   Please note that these APIs are internal and should not
   be used in Python C extensions.
 */
 PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
       PyObject *object,
       const char *encoding,
       const char *errors
       );
 PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
       PyObject *object,
       const char *encoding,
       const char *errors
       );
 #endif
 /* --- Codec Lookup APIs -------------------------------------------------- 
   All APIs return a codec object with incremented refcount and are
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE
 ### Codec base classes (defining the API)
 class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""
    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default
    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
-        incrementalencoder=None, incrementaldecoder=None, name=None):
+        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
@ -84,6 +94,8 @@ class CodecInfo(tuple):
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self
    def __repr__(self):
--- a/Lib/encodings/base64_codec.py
+++ b/Lib/encodings/base64_codec.py
@ -52,4 +52,5 @@ def getregentry():
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        _is_text_encoding=False,
    )
--- a/Lib/encodings/bz2_codec.py
+++ b/Lib/encodings/bz2_codec.py
@ -74,4 +74,5 @@ def getregentry():
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        _is_text_encoding=False,
    )
--- a/Lib/encodings/hex_codec.py
+++ b/Lib/encodings/hex_codec.py
@ -52,4 +52,5 @@ def getregentry():
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        _is_text_encoding=False,
    )
--- a/Lib/encodings/quopri_codec.py
+++ b/Lib/encodings/quopri_codec.py
@ -53,4 +53,5 @@ def getregentry():
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        _is_text_encoding=False,
    )
--- a/Lib/encodings/rot_13.py
+++ b/Lib/encodings/rot_13.py
@ -43,6 +43,7 @@ def getregentry():
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        _is_text_encoding=False,
    )
 ### Map
--- a/Lib/encodings/uu_codec.py
+++ b/Lib/encodings/uu_codec.py
@ -96,4 +96,5 @@ def getregentry():
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
        _is_text_encoding=False,
    )
--- a/Lib/encodings/zlib_codec.py
+++ b/Lib/encodings/zlib_codec.py
@ -74,4 +74,5 @@ def getregentry():
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
        _is_text_encoding=False,
    )
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -6,6 +6,7 @@ import locale
 import sys
 import unittest
 import warnings
 import encodings
 from test import support
@ -2381,67 +2382,68 @@ class TransformCodecTest(unittest.TestCase):
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)
-    def test_type_error_for_text_input(self):
+    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
-                msg = "^encoding with '{}' codec failed".format(encoding)
+                fmt = ( "{!r} is not a text encoding; "
-                with self.assertRaisesRegex(TypeError, msg) as failure:
+                        "use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
-                self.assertTrue(isinstance(failure.exception.__cause__,
+                self.assertIsNone(failure.exception.__cause__)
                                           TypeError))
-    def test_type_error_for_binary_input(self):
+    def test_text_to_binary_blacklists_text_transforms(self):
-        # Check str -> str codec gives a good error for binary input
+        # Check str.encode gives a good error message for str -> str codecs
-        for bad_input in (b"immutable", bytearray(b"mutable")):
+        msg = (r"^'rot_13' is not a text encoding; "
-            with self.subTest(bad_input=bad_input):
+                "use codecs.encode\(\) to handle arbitrary codecs")
-                msg = "^decoding with 'rot_13' codec failed"
+        with self.assertRaisesRegex(LookupError, msg):
-                with self.assertRaisesRegex(AttributeError, msg) as failure:
+            "just an example message".encode("rot_13")
                    bad_input.decode("rot_13")
                self.assertTrue(isinstance(failure.exception.__cause__,
                                           AttributeError))
-    def test_custom_zlib_error_is_wrapped(self):
+    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            b"hello".decode("zlib_codec")
        self.assertTrue(isinstance(failure.exception.__cause__,
                                   type(failure.exception)))
    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            b"hello".decode("hex_codec")
        self.assertTrue(isinstance(failure.exception.__cause__,
                                   type(failure.exception)))
    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(
    def test_bad_decoding_output_type(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
-                fmt = ("'{}' decoder returned 'bytes' instead of 'str'; "
+                fmt = (r"{!r} is not a text encoding; "
-                       "use codecs.decode\(\) to decode to arbitrary types")
+                        "use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
-                with self.assertRaisesRegex(TypeError, msg):
+                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
-                with self.assertRaisesRegex(TypeError, msg):
+                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)
-    def test_bad_encoding_output_type(self):
+    def test_binary_to_text_blacklists_text_transforms(self):
-        # Check str.encode gives a good error message for str -> str codecs
+        # Check str -> str codec gives a good error for binary input
-        msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; "
+        for bad_input in (b"immutable", bytearray(b"mutable")):
-               "use codecs.encode\(\) to encode to arbitrary types")
+            with self.subTest(bad_input=bad_input):
-        with self.assertRaisesRegex(TypeError, msg):
+                msg = (r"^'rot_13' is not a text encoding; "
-            "just an example message".encode("rot_13")
+                        "use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        self.assertIsInstance(failure.exception.__cause__,
                                                type(failure.exception))
    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                                                type(failure.exception))
    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(
 # The codec system tries to wrap exceptions in order to ensure the error
@ -2466,27 +2468,44 @@ class ExceptionChainingTest(unittest.TestCase):
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
-        self.codec_name = repr(self).lower().replace(" ", "-")
+        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()
        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError
    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
-    def set_codec(self, obj_to_raise):
+    def set_codec(self, encode, decode):
-        def raise_obj(*args, **kwds):
+        codec_info = codecs.CodecInfo(encode, decode,
            raise obj_to_raise
        codec_info = codecs.CodecInfo(raise_obj, raise_obj,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info
    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
-        full_msg = "{} with '{}' codec failed \({}: {}\)".format(
+        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
                  operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise
    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
-        self.set_codec(obj_to_raise)
+        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
@ -2515,23 +2534,17 @@ class ExceptionChainingTest(unittest.TestCase):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
-    @contextlib.contextmanager
+    def check_not_wrapped(self, obj_to_raise, msg):
-    def assertNotWrapped(self, operation, exc_type, msg_re, msg=None):
+        def raise_obj(*args, **kwds):
-        if msg is None:
+            raise obj_to_raise
-            msg = msg_re
+        self.set_codec(raise_obj, raise_obj)
-        with self.assertRaisesRegex(exc_type, msg) as caught:
+        with self.assertRaisesRegex(RuntimeError, msg):
            yield caught
        self.assertEqual(str(caught.exception), msg)
    def check_not_wrapped(self, obj_to_raise, msg_re, msg=None):
        self.set_codec(obj_to_raise)
        with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg):
            "str input".encode(self.codec_name)
-        with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg):
+        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
-        with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg):
+        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
-        with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg):
+        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)
    def test_init_override_is_not_wrapped(self):
@ -2550,29 +2563,56 @@ class ExceptionChainingTest(unittest.TestCase):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
-        self.check_not_wrapped(exc, msg)
+        self.check_not_wrapped(exc, "^{}$".format(msg))
    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")
    def test_multiple_args_is_not_wrapped(self):
-        msg_re = "\('a', 'b', 'c'\)"
+        msg_re = r"^\('a', 'b', 'c'\)$"
-        msg = "('a', 'b', 'c')"
+        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re, msg)
    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
-        msg = "unknown encoding: %s" % self.codec_name
+        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
-        with self.assertNotWrapped("encoding", LookupError, msg):
+        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
-        with self.assertNotWrapped("encoding", LookupError, msg):
+        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
-        with self.assertNotWrapped("decoding", LookupError, msg):
+        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
-        with self.assertNotWrapped("decoding", LookupError, msg):
+        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)
    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                "use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                "use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
@unittest.skipUnless(sys.platform == 'win32',
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,12 @@ Projected release date: 2013-11-24
 Core and Builtins
 -----------------
 - Issue #19619: str.encode, bytes.decode and bytearray.decode now use an
  internal API to throw LookupError for known non-text encodings, rather
  than attempting the encoding or decoding operation and then throwing a
  TypeError for an unexpected output type. (The latter mechanism remains
  in place for third party non-text encodings)
 - Issue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'.
  Python now uses SipHash24 on all major platforms.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -3044,7 +3044,7 @@ PyUnicode_Decode(const char *s,
    buffer = PyMemoryView_FromBuffer(&info);
    if (buffer == NULL)
        goto onError;
-    unicode = PyCodec_Decode(buffer, encoding, errors);
+    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
    if (unicode == NULL)
        goto onError;
    if (!PyUnicode_Check(unicode)) {
@ -3410,7 +3410,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
    }
    /* Encode via the codec registry */
-    v = PyCodec_Encode(unicode, encoding, errors);
+    v = _PyCodec_EncodeText(unicode, encoding, errors);
    if (v == NULL)
        return NULL;
--- a/Python/codecs.c
+++ b/Python/codecs.c
@ -353,18 +353,15 @@ wrap_codec_error(const char *operation,
   errors is passed to the encoder factory as argument if non-NULL. */
-PyObject *PyCodec_Encode(PyObject *object,
+static PyObject *
-                         const char *encoding,
+_PyCodec_EncodeInternal(PyObject *object,
-                         const char *errors)
+                        PyObject *encoder,
                        const char *encoding,
                        const char *errors)
 {
    PyObject *encoder = NULL;
    PyObject *args = NULL, *result = NULL;
    PyObject *v = NULL;
    encoder = PyCodec_Encoder(encoding);
    if (encoder == NULL)
        goto onError;
    args = args_tuple(object, errors);
    if (args == NULL)
        goto onError;
@ -402,18 +399,15 @@ PyObject *PyCodec_Encode(PyObject *object,
   errors is passed to the decoder factory as argument if non-NULL. */
-PyObject *PyCodec_Decode(PyObject *object,
+static PyObject *
-                         const char *encoding,
+_PyCodec_DecodeInternal(PyObject *object,
-                         const char *errors)
+                        PyObject *decoder,
                        const char *encoding,
                        const char *errors)
 {
    PyObject *decoder = NULL;
    PyObject *args = NULL, *result = NULL;
    PyObject *v;
    decoder = PyCodec_Decoder(encoding);
    if (decoder == NULL)
        goto onError;
    args = args_tuple(object, errors);
    if (args == NULL)
        goto onError;
@ -445,6 +439,118 @@ PyObject *PyCodec_Decode(PyObject *object,
    return NULL;
 }
 /* Generic encoding/decoding API */
 PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
 {
    PyObject *encoder;
    encoder = PyCodec_Encoder(encoding);
    if (encoder == NULL)
        return NULL;
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
 }
 PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
 {
    PyObject *decoder;
    decoder = PyCodec_Decoder(encoding);
    if (decoder == NULL)
        return NULL;
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
 }
 /* Text encoding/decoding API */
 static
 PyObject *codec_getitem_checked(const char *encoding,
                                const char *operation_name,
                                int index)
 {
    _Py_IDENTIFIER(_is_text_encoding);
    PyObject *codec;
    PyObject *attr;
    PyObject *v;
    int is_text_codec;
    codec = _PyCodec_Lookup(encoding);
    if (codec == NULL)
        return NULL;
    /* Backwards compatibility: assume any raw tuple describes a text
     * encoding, and the same for anything lacking the private
     * attribute.
     */
    if (!PyTuple_CheckExact(codec)) {
        attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
        if (attr == NULL) {
            if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
                PyErr_Clear();
            } else {
                Py_DECREF(codec);
                return NULL;
            }
        } else {
            is_text_codec = PyObject_IsTrue(attr);
            Py_DECREF(attr);
            if (!is_text_codec) {
                Py_DECREF(codec);
                PyErr_Format(PyExc_LookupError,
                             "'%.400s' is not a text encoding; "
                             "use codecs.%s() to handle arbitrary codecs",
                             encoding, operation_name);
                return NULL;
            }
        }
    }
    v = PyTuple_GET_ITEM(codec, index);
    Py_DECREF(codec);
    Py_INCREF(v);
    return v;
 }
 static PyObject * _PyCodec_TextEncoder(const char *encoding)
 {
    return codec_getitem_checked(encoding, "encode", 0);
 }
 static PyObject * _PyCodec_TextDecoder(const char *encoding)
 {
    return codec_getitem_checked(encoding, "decode", 1);
 }
 PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
 {
    PyObject *encoder;
    encoder = _PyCodec_TextEncoder(encoding);
    if (encoder == NULL)
        return NULL;
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
 }
 PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
 {
    PyObject *decoder;
    decoder = _PyCodec_TextDecoder(encoding);
    if (decoder == NULL)
        return NULL;
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
 }
 /* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   an unencodable characters/undecodable bytes and doesn't know the