From c72e4e6dccce99bcdcb45959767436d7e5cfda8c Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Fri, 22 Nov 2013 22:39:36 +1000 Subject: [PATCH] Issue #19619: Blacklist non-text codecs in method API str.encode, bytes.decode and bytearray.decode now use an internal API to throw LookupError for known non-text encodings, rather than attempting the encoding or decoding operation and then throwing a TypeError for an unexpected output type. The latter mechanism remains in place for third party non-text encodings. --- Include/codecs.h | 27 +++++ Lib/codecs.py | 14 ++- Lib/encodings/base64_codec.py | 1 + Lib/encodings/bz2_codec.py | 1 + Lib/encodings/hex_codec.py | 1 + Lib/encodings/quopri_codec.py | 1 + Lib/encodings/rot_13.py | 1 + Lib/encodings/uu_codec.py | 1 + Lib/encodings/zlib_codec.py | 1 + Lib/test/test_codecs.py | 188 +++++++++++++++++++++------------- Misc/NEWS | 6 ++ Objects/unicodeobject.c | 4 +- Python/codecs.c | 138 ++++++++++++++++++++++--- 13 files changed, 291 insertions(+), 93 deletions(-) diff --git a/Include/codecs.h b/Include/codecs.h index 0d9e9b4585d..5ca505fbd5e 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode( const char *errors ); +#ifndef PY_LIMITED_API +/* Text codec specific encoding and decoding API. + + Checks the encoding against a list of codecs which do not + implement a str<->bytes encoding before attempting the + operation. + + Please note that these APIs are internal and should not + be used in Python C extensions. + + */ + +PyAPI_FUNC(PyObject *) _PyCodec_EncodeText( + PyObject *object, + const char *encoding, + const char *errors + ); + +PyAPI_FUNC(PyObject *) _PyCodec_DecodeText( + PyObject *object, + const char *encoding, + const char *errors + ); +#endif + + + /* --- Codec Lookup APIs -------------------------------------------------- All APIs return a codec object with incremented refcount and are diff --git a/Lib/codecs.py b/Lib/codecs.py index 6a6eb900c72..2e2e7555a48 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE ### Codec base classes (defining the API) class CodecInfo(tuple): + """Codec details when looking up the codec registry""" + + # Private API to allow Python 3.4 to blacklist the known non-Unicode + # codecs in the standard library. A more general mechanism to + # reliably distinguish test encodings from other codecs will hopefully + # be defined for Python 3.5 + # + # See http://bugs.python.org/issue19619 + _is_text_encoding = True # Assume codecs are text encodings by default def __new__(cls, encode, decode, streamreader=None, streamwriter=None, - incrementalencoder=None, incrementaldecoder=None, name=None): + incrementalencoder=None, incrementaldecoder=None, name=None, + *, _is_text_encoding=None): self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) self.name = name self.encode = encode @@ -84,6 +94,8 @@ class CodecInfo(tuple): self.incrementaldecoder = incrementaldecoder self.streamwriter = streamwriter self.streamreader = streamreader + if _is_text_encoding is not None: + self._is_text_encoding = _is_text_encoding return self def __repr__(self): diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py index 321a961782a..881d1ba0bee 100644 --- a/Lib/encodings/base64_codec.py +++ b/Lib/encodings/base64_codec.py @@ -52,4 +52,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py index e65d226bfdf..fd9495e341b 100644 --- a/Lib/encodings/bz2_codec.py +++ b/Lib/encodings/bz2_codec.py @@ -74,4 +74,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py index e003fc3ff97..f2ed0a7658e 100644 --- a/Lib/encodings/hex_codec.py +++ b/Lib/encodings/hex_codec.py @@ -52,4 +52,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py index 9243fc443b0..70f70837938 100644 --- a/Lib/encodings/quopri_codec.py +++ b/Lib/encodings/quopri_codec.py @@ -53,4 +53,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py index 3140c1432dc..fff9153b4c6 100755 --- a/Lib/encodings/rot_13.py +++ b/Lib/encodings/rot_13.py @@ -43,6 +43,7 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) ### Map diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py index 69c6f17c7f3..e3269e40cd3 100644 --- a/Lib/encodings/uu_codec.py +++ b/Lib/encodings/uu_codec.py @@ -96,4 +96,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_text_encoding=False, ) diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py index e0b9cdadbcd..4c81ca115a7 100644 --- a/Lib/encodings/zlib_codec.py +++ b/Lib/encodings/zlib_codec.py @@ -74,4 +74,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_text_encoding=False, ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 728f7d006b7..506ba7dfbfa 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -6,6 +6,7 @@ import locale import sys import unittest import warnings +import encodings from test import support @@ -2381,67 +2382,68 @@ class TransformCodecTest(unittest.TestCase): view_decoded = codecs.decode(view, encoding) self.assertEqual(view_decoded, data) - def test_type_error_for_text_input(self): + def test_text_to_binary_blacklists_binary_transforms(self): # Check binary -> binary codecs give a good error for str input bad_input = "bad input type" for encoding in bytes_transform_encodings: with self.subTest(encoding=encoding): - msg = "^encoding with '{}' codec failed".format(encoding) - with self.assertRaisesRegex(TypeError, msg) as failure: + fmt = ( "{!r} is not a text encoding; " + "use codecs.encode\(\) to handle arbitrary codecs") + msg = fmt.format(encoding) + with self.assertRaisesRegex(LookupError, msg) as failure: bad_input.encode(encoding) - self.assertTrue(isinstance(failure.exception.__cause__, - TypeError)) + self.assertIsNone(failure.exception.__cause__) - def test_type_error_for_binary_input(self): - # Check str -> str codec gives a good error for binary input - for bad_input in (b"immutable", bytearray(b"mutable")): - with self.subTest(bad_input=bad_input): - msg = "^decoding with 'rot_13' codec failed" - with self.assertRaisesRegex(AttributeError, msg) as failure: - bad_input.decode("rot_13") - self.assertTrue(isinstance(failure.exception.__cause__, - AttributeError)) + def test_text_to_binary_blacklists_text_transforms(self): + # Check str.encode gives a good error message for str -> str codecs + msg = (r"^'rot_13' is not a text encoding; " + "use codecs.encode\(\) to handle arbitrary codecs") + with self.assertRaisesRegex(LookupError, msg): + "just an example message".encode("rot_13") - def test_custom_zlib_error_is_wrapped(self): - # Check zlib codec gives a good error for malformed input - msg = "^decoding with 'zlib_codec' codec failed" - with self.assertRaisesRegex(Exception, msg) as failure: - b"hello".decode("zlib_codec") - self.assertTrue(isinstance(failure.exception.__cause__, - type(failure.exception))) - - def test_custom_hex_error_is_wrapped(self): - # Check hex codec gives a good error for malformed input - msg = "^decoding with 'hex_codec' codec failed" - with self.assertRaisesRegex(Exception, msg) as failure: - b"hello".decode("hex_codec") - self.assertTrue(isinstance(failure.exception.__cause__, - type(failure.exception))) - - # Unfortunately, the bz2 module throws OSError, which the codec - # machinery currently can't wrap :( - - def test_bad_decoding_output_type(self): + def test_binary_to_text_blacklists_binary_transforms(self): # Check bytes.decode and bytearray.decode give a good error # message for binary -> binary codecs data = b"encode first to ensure we meet any format restrictions" for encoding in bytes_transform_encodings: with self.subTest(encoding=encoding): encoded_data = codecs.encode(data, encoding) - fmt = ("'{}' decoder returned 'bytes' instead of 'str'; " - "use codecs.decode\(\) to decode to arbitrary types") + fmt = (r"{!r} is not a text encoding; " + "use codecs.decode\(\) to handle arbitrary codecs") msg = fmt.format(encoding) - with self.assertRaisesRegex(TypeError, msg): + with self.assertRaisesRegex(LookupError, msg): encoded_data.decode(encoding) - with self.assertRaisesRegex(TypeError, msg): + with self.assertRaisesRegex(LookupError, msg): bytearray(encoded_data).decode(encoding) - def test_bad_encoding_output_type(self): - # Check str.encode gives a good error message for str -> str codecs - msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; " - "use codecs.encode\(\) to encode to arbitrary types") - with self.assertRaisesRegex(TypeError, msg): - "just an example message".encode("rot_13") + def test_binary_to_text_blacklists_text_transforms(self): + # Check str -> str codec gives a good error for binary input + for bad_input in (b"immutable", bytearray(b"mutable")): + with self.subTest(bad_input=bad_input): + msg = (r"^'rot_13' is not a text encoding; " + "use codecs.decode\(\) to handle arbitrary codecs") + with self.assertRaisesRegex(LookupError, msg) as failure: + bad_input.decode("rot_13") + self.assertIsNone(failure.exception.__cause__) + + def test_custom_zlib_error_is_wrapped(self): + # Check zlib codec gives a good error for malformed input + msg = "^decoding with 'zlib_codec' codec failed" + with self.assertRaisesRegex(Exception, msg) as failure: + codecs.decode(b"hello", "zlib_codec") + self.assertIsInstance(failure.exception.__cause__, + type(failure.exception)) + + def test_custom_hex_error_is_wrapped(self): + # Check hex codec gives a good error for malformed input + msg = "^decoding with 'hex_codec' codec failed" + with self.assertRaisesRegex(Exception, msg) as failure: + codecs.decode(b"hello", "hex_codec") + self.assertIsInstance(failure.exception.__cause__, + type(failure.exception)) + + # Unfortunately, the bz2 module throws OSError, which the codec + # machinery currently can't wrap :( # The codec system tries to wrap exceptions in order to ensure the error @@ -2466,27 +2468,44 @@ class ExceptionChainingTest(unittest.TestCase): # case finishes by using the test case repr as the codec name # The codecs module normalizes codec names, although this doesn't # appear to be formally documented... - self.codec_name = repr(self).lower().replace(" ", "-") + # We also make sure we use a truly unique id for the custom codec + # to avoid issues with the codec cache when running these tests + # multiple times (e.g. when hunting for refleaks) + unique_id = repr(self) + str(id(self)) + self.codec_name = encodings.normalize_encoding(unique_id).lower() + + # We store the object to raise on the instance because of a bad + # interaction between the codec caching (which means we can't + # recreate the codec entry) and regrtest refleak hunting (which + # runs the same test instance multiple times). This means we + # need to ensure the codecs call back in to the instance to find + # out which exception to raise rather than binding them in a + # closure to an object that may change on the next run + self.obj_to_raise = RuntimeError def tearDown(self): _TEST_CODECS.pop(self.codec_name, None) - def set_codec(self, obj_to_raise): - def raise_obj(*args, **kwds): - raise obj_to_raise - codec_info = codecs.CodecInfo(raise_obj, raise_obj, + def set_codec(self, encode, decode): + codec_info = codecs.CodecInfo(encode, decode, name=self.codec_name) _TEST_CODECS[self.codec_name] = codec_info @contextlib.contextmanager def assertWrapped(self, operation, exc_type, msg): - full_msg = "{} with '{}' codec failed \({}: {}\)".format( + full_msg = r"{} with {!r} codec failed \({}: {}\)".format( operation, self.codec_name, exc_type.__name__, msg) with self.assertRaisesRegex(exc_type, full_msg) as caught: yield caught + self.assertIsInstance(caught.exception.__cause__, exc_type) + + def raise_obj(self, *args, **kwds): + # Helper to dynamically change the object raised by a test codec + raise self.obj_to_raise def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError): - self.set_codec(obj_to_raise) + self.obj_to_raise = obj_to_raise + self.set_codec(self.raise_obj, self.raise_obj) with self.assertWrapped("encoding", exc_type, msg): "str_input".encode(self.codec_name) with self.assertWrapped("encoding", exc_type, msg): @@ -2515,23 +2534,17 @@ class ExceptionChainingTest(unittest.TestCase): pass self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) - @contextlib.contextmanager - def assertNotWrapped(self, operation, exc_type, msg_re, msg=None): - if msg is None: - msg = msg_re - with self.assertRaisesRegex(exc_type, msg) as caught: - yield caught - self.assertEqual(str(caught.exception), msg) - - def check_not_wrapped(self, obj_to_raise, msg_re, msg=None): - self.set_codec(obj_to_raise) - with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg): + def check_not_wrapped(self, obj_to_raise, msg): + def raise_obj(*args, **kwds): + raise obj_to_raise + self.set_codec(raise_obj, raise_obj) + with self.assertRaisesRegex(RuntimeError, msg): "str input".encode(self.codec_name) - with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg): + with self.assertRaisesRegex(RuntimeError, msg): codecs.encode("str input", self.codec_name) - with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg): + with self.assertRaisesRegex(RuntimeError, msg): b"bytes input".decode(self.codec_name) - with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg): + with self.assertRaisesRegex(RuntimeError, msg): codecs.decode(b"bytes input", self.codec_name) def test_init_override_is_not_wrapped(self): @@ -2550,29 +2563,56 @@ class ExceptionChainingTest(unittest.TestCase): msg = "This should NOT be wrapped" exc = RuntimeError(msg) exc.attr = 1 - self.check_not_wrapped(exc, msg) + self.check_not_wrapped(exc, "^{}$".format(msg)) def test_non_str_arg_is_not_wrapped(self): self.check_not_wrapped(RuntimeError(1), "1") def test_multiple_args_is_not_wrapped(self): - msg_re = "\('a', 'b', 'c'\)" - msg = "('a', 'b', 'c')" - self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re, msg) + msg_re = r"^\('a', 'b', 'c'\)$" + self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re) # http://bugs.python.org/issue19609 def test_codec_lookup_failure_not_wrapped(self): - msg = "unknown encoding: %s" % self.codec_name + msg = "^unknown encoding: {}$".format(self.codec_name) # The initial codec lookup should not be wrapped - with self.assertNotWrapped("encoding", LookupError, msg): + with self.assertRaisesRegex(LookupError, msg): "str input".encode(self.codec_name) - with self.assertNotWrapped("encoding", LookupError, msg): + with self.assertRaisesRegex(LookupError, msg): codecs.encode("str input", self.codec_name) - with self.assertNotWrapped("decoding", LookupError, msg): + with self.assertRaisesRegex(LookupError, msg): b"bytes input".decode(self.codec_name) - with self.assertNotWrapped("decoding", LookupError, msg): + with self.assertRaisesRegex(LookupError, msg): codecs.decode(b"bytes input", self.codec_name) + def test_unflagged_non_text_codec_handling(self): + # The stdlib non-text codecs are now marked so they're + # pre-emptively skipped by the text model related methods + # However, third party codecs won't be flagged, so we still make + # sure the case where an inappropriate output type is produced is + # handled appropriately + def encode_to_str(*args, **kwds): + return "not bytes!", 0 + def decode_to_bytes(*args, **kwds): + return b"not str!", 0 + self.set_codec(encode_to_str, decode_to_bytes) + # No input or output type checks on the codecs module functions + encoded = codecs.encode(None, self.codec_name) + self.assertEqual(encoded, "not bytes!") + decoded = codecs.decode(None, self.codec_name) + self.assertEqual(decoded, b"not str!") + # Text model methods should complain + fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; " + "use codecs.encode\(\) to encode to arbitrary types$") + msg = fmt.format(self.codec_name) + with self.assertRaisesRegex(TypeError, msg): + "str_input".encode(self.codec_name) + fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; " + "use codecs.decode\(\) to decode to arbitrary types$") + msg = fmt.format(self.codec_name) + with self.assertRaisesRegex(TypeError, msg): + b"bytes input".decode(self.codec_name) + @unittest.skipUnless(sys.platform == 'win32', diff --git a/Misc/NEWS b/Misc/NEWS index 92d902af9cf..c33c92d1836 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,12 @@ Projected release date: 2013-11-24 Core and Builtins ----------------- +- Issue #19619: str.encode, bytes.decode and bytearray.decode now use an + internal API to throw LookupError for known non-text encodings, rather + than attempting the encoding or decoding operation and then throwing a + TypeError for an unexpected output type. (The latter mechanism remains + in place for third party non-text encodings) + - Issue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'. Python now uses SipHash24 on all major platforms. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3644db3b137..7de5f1f40c3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3044,7 +3044,7 @@ PyUnicode_Decode(const char *s, buffer = PyMemoryView_FromBuffer(&info); if (buffer == NULL) goto onError; - unicode = PyCodec_Decode(buffer, encoding, errors); + unicode = _PyCodec_DecodeText(buffer, encoding, errors); if (unicode == NULL) goto onError; if (!PyUnicode_Check(unicode)) { @@ -3410,7 +3410,7 @@ PyUnicode_AsEncodedString(PyObject *unicode, } /* Encode via the codec registry */ - v = PyCodec_Encode(unicode, encoding, errors); + v = _PyCodec_EncodeText(unicode, encoding, errors); if (v == NULL) return NULL; diff --git a/Python/codecs.c b/Python/codecs.c index 8fe0af7bf09..5ff41b57df2 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -353,18 +353,15 @@ wrap_codec_error(const char *operation, errors is passed to the encoder factory as argument if non-NULL. */ -PyObject *PyCodec_Encode(PyObject *object, - const char *encoding, - const char *errors) +static PyObject * +_PyCodec_EncodeInternal(PyObject *object, + PyObject *encoder, + const char *encoding, + const char *errors) { - PyObject *encoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v = NULL; - encoder = PyCodec_Encoder(encoding); - if (encoder == NULL) - goto onError; - args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -402,18 +399,15 @@ PyObject *PyCodec_Encode(PyObject *object, errors is passed to the decoder factory as argument if non-NULL. */ -PyObject *PyCodec_Decode(PyObject *object, - const char *encoding, - const char *errors) +static PyObject * +_PyCodec_DecodeInternal(PyObject *object, + PyObject *decoder, + const char *encoding, + const char *errors) { - PyObject *decoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v; - decoder = PyCodec_Decoder(encoding); - if (decoder == NULL) - goto onError; - args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -445,6 +439,118 @@ PyObject *PyCodec_Decode(PyObject *object, return NULL; } +/* Generic encoding/decoding API */ +PyObject *PyCodec_Encode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *encoder; + + encoder = PyCodec_Encoder(encoding); + if (encoder == NULL) + return NULL; + + return _PyCodec_EncodeInternal(object, encoder, encoding, errors); +} + +PyObject *PyCodec_Decode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *decoder; + + decoder = PyCodec_Decoder(encoding); + if (decoder == NULL) + return NULL; + + return _PyCodec_DecodeInternal(object, decoder, encoding, errors); +} + +/* Text encoding/decoding API */ +static +PyObject *codec_getitem_checked(const char *encoding, + const char *operation_name, + int index) +{ + _Py_IDENTIFIER(_is_text_encoding); + PyObject *codec; + PyObject *attr; + PyObject *v; + int is_text_codec; + + codec = _PyCodec_Lookup(encoding); + if (codec == NULL) + return NULL; + + /* Backwards compatibility: assume any raw tuple describes a text + * encoding, and the same for anything lacking the private + * attribute. + */ + if (!PyTuple_CheckExact(codec)) { + attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding); + if (attr == NULL) { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + Py_DECREF(codec); + return NULL; + } + } else { + is_text_codec = PyObject_IsTrue(attr); + Py_DECREF(attr); + if (!is_text_codec) { + Py_DECREF(codec); + PyErr_Format(PyExc_LookupError, + "'%.400s' is not a text encoding; " + "use codecs.%s() to handle arbitrary codecs", + encoding, operation_name); + return NULL; + } + } + } + + v = PyTuple_GET_ITEM(codec, index); + Py_DECREF(codec); + Py_INCREF(v); + return v; +} + +static PyObject * _PyCodec_TextEncoder(const char *encoding) +{ + return codec_getitem_checked(encoding, "encode", 0); +} + +static PyObject * _PyCodec_TextDecoder(const char *encoding) +{ + return codec_getitem_checked(encoding, "decode", 1); +} + +PyObject *_PyCodec_EncodeText(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *encoder; + + encoder = _PyCodec_TextEncoder(encoding); + if (encoder == NULL) + return NULL; + + return _PyCodec_EncodeInternal(object, encoder, encoding, errors); +} + +PyObject *_PyCodec_DecodeText(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *decoder; + + decoder = _PyCodec_TextDecoder(encoding); + if (decoder == NULL) + return NULL; + + return _PyCodec_DecodeInternal(object, decoder, encoding, errors); +} + /* Register the error handling callback function error under the name name. This function will be called by the codec when it encounters an unencodable characters/undecodable bytes and doesn't know the