From 5646648678295a44aa82636c6e92826651baf33a Mon Sep 17 00:00:00 2001 From: "Eric V. Smith" Date: Mon, 31 Oct 2016 14:46:26 -0400 Subject: [PATCH] Issue 28128: Print out better error/warning messages for invalid string escapes. Backport to 3.6. --- Include/bytesobject.h | 5 +++ Include/unicodeobject.h | 11 ++++++ Lib/test/test_string_literals.py | 27 +++++++++++++ Lib/test/test_unicode.py | 7 ---- Misc/NEWS | 4 ++ Objects/bytesobject.c | 37 ++++++++++++++++-- Objects/unicodeobject.c | 38 +++++++++++++++--- Python/ast.c | 66 +++++++++++++++++++++++++++++--- 8 files changed, 173 insertions(+), 22 deletions(-) diff --git a/Include/bytesobject.h b/Include/bytesobject.h index 11d8218402d..98e29b68791 100644 --- a/Include/bytesobject.h +++ b/Include/bytesobject.h @@ -74,6 +74,11 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex( PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, const char *, Py_ssize_t, const char *); +/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */ +PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t, + const char *, Py_ssize_t, + const char *, + const char **); /* Macro, trading safety for speed */ #ifndef Py_LIMITED_API diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 5711de0164e..b5ef3e41303 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1486,6 +1486,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( const char *errors /* error handling */ ); +/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape + chars. */ +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape( + const char *string, /* Unicode-Escape encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + const char **first_invalid_escape /* on return, points to first + invalid escaped char in + string. */ +); + PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( PyObject *unicode /* Unicode object */ ); diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py index 37ace230f5e..54f2be35983 100644 --- a/Lib/test/test_string_literals.py +++ b/Lib/test/test_string_literals.py @@ -31,6 +31,7 @@ import os import sys import shutil import tempfile +import warnings import unittest @@ -104,6 +105,19 @@ class TestLiterals(unittest.TestCase): self.assertRaises(SyntaxError, eval, r""" '\U000000' """) self.assertRaises(SyntaxError, eval, r""" '\U0000000' """) + def test_eval_str_invalid_escape(self): + for b in range(1, 128): + if b in b"""\n\r"'01234567NU\\abfnrtuvx""": + continue + with self.assertWarns(DeprecationWarning): + self.assertEqual(eval(r"'\%c'" % b), '\\' + chr(b)) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always', category=DeprecationWarning) + eval("'''\n\\z'''") + self.assertEqual(len(w), 1) + self.assertEqual(w[0].filename, '') + self.assertEqual(w[0].lineno, 2) + def test_eval_str_raw(self): self.assertEqual(eval(""" r'x' """), 'x') self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01') @@ -130,6 +144,19 @@ class TestLiterals(unittest.TestCase): self.assertRaises(SyntaxError, eval, r""" b'\x' """) self.assertRaises(SyntaxError, eval, r""" b'\x0' """) + def test_eval_bytes_invalid_escape(self): + for b in range(1, 128): + if b in b"""\n\r"'01234567\\abfnrtvx""": + continue + with self.assertWarns(DeprecationWarning): + self.assertEqual(eval(r"b'\%c'" % b), b'\\' + bytes([b])) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always', category=DeprecationWarning) + eval("b'''\n\\z'''") + self.assertEqual(len(w), 1) + self.assertEqual(w[0].filename, '') + self.assertEqual(w[0].lineno, 2) + def test_eval_bytes_raw(self): self.assertEqual(eval(""" br'x' """), b'x') self.assertEqual(eval(""" rb'x' """), b'x') diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index fe6cd28c63f..0737140ccfa 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -2413,13 +2413,6 @@ class UnicodeTest(string_tests.CommonTest, support.check_free_after_iterating(self, iter, str) support.check_free_after_iterating(self, reversed, str) - def test_invalid_sequences(self): - for letter in string.ascii_letters + "89": # 0-7 are octal escapes - if letter in "abfnrtuvxNU": - continue - with self.assertWarns(DeprecationWarning): - eval(r"'\%s'" % letter) - class CAPITest(unittest.TestCase): diff --git a/Misc/NEWS b/Misc/NEWS index a621be316f8..c9bfe017bd9 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,10 @@ What's New in Python 3.6.0 beta 3 Core and Builtins ----------------- +- Issue #28128: Deprecation warning for invalid str and byte escape + sequences now prints better information about where the error + occurs. Patch by Serhiy Storchaka and Eric Smith. + - Issue #28509: dict.update() no longer allocate unnecessary large memory. - Issue #28426: Fixed potential crash in PyUnicode_AsDecodedObject() in debug diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 598f6a13cfc..779fe295db0 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1105,11 +1105,12 @@ _PyBytes_DecodeEscapeRecode(const char **s, const char *end, return p; } -PyObject *PyBytes_DecodeEscape(const char *s, +PyObject *_PyBytes_DecodeEscape(const char *s, Py_ssize_t len, const char *errors, Py_ssize_t unicode, - const char *recode_encoding) + const char *recode_encoding, + const char **first_invalid_escape) { int c; char *p; @@ -1123,6 +1124,8 @@ PyObject *PyBytes_DecodeEscape(const char *s, return NULL; writer.overallocate = 1; + *first_invalid_escape = NULL; + end = s + len; while (s < end) { if (*s != '\\') { @@ -1207,9 +1210,12 @@ PyObject *PyBytes_DecodeEscape(const char *s, break; default: - if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0) - goto failed; + if (*first_invalid_escape == NULL) { + *first_invalid_escape = s-1; /* Back up one char, since we've + already incremented s. */ + } *p++ = '\\'; + s--; goto non_esc; /* an arbitrary number of unescaped UTF-8 bytes may follow. */ } @@ -1222,6 +1228,29 @@ PyObject *PyBytes_DecodeEscape(const char *s, return NULL; } +PyObject *PyBytes_DecodeEscape(const char *s, + Py_ssize_t len, + const char *errors, + Py_ssize_t unicode, + const char *recode_encoding) +{ + const char* first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode, + recode_encoding, + &first_invalid_escape); + if (result == NULL) + return NULL; + if (first_invalid_escape != NULL) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "invalid escape sequence '\\%c'", + *first_invalid_escape) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; + +} /* -------------------------------------------------------------------- */ /* object api */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e45f3d7c27b..50b21cf9e65 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5896,9 +5896,10 @@ PyUnicode_AsUTF16String(PyObject *unicode) static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject * -PyUnicode_DecodeUnicodeEscape(const char *s, - Py_ssize_t size, - const char *errors) +_PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors, + const char **first_invalid_escape) { const char *starts = s; _PyUnicodeWriter writer; @@ -5906,6 +5907,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s, PyObject *errorHandler = NULL; PyObject *exc = NULL; + // so we can remember if we've seen an invalid escape char or not + *first_invalid_escape = NULL; + if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); } @@ -6080,9 +6084,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s, goto error; default: - if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "invalid escape sequence '\\%c'", c) < 0) - goto onError; + if (*first_invalid_escape == NULL) { + *first_invalid_escape = s-1; /* Back up one char, since we've + already incremented s. */ + } WRITE_ASCII_CHAR('\\'); WRITE_CHAR(c); continue; @@ -6117,6 +6122,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s, return NULL; } +PyObject * +PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + const char *first_invalid_escape; + PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, + &first_invalid_escape); + if (result == NULL) + return NULL; + if (first_invalid_escape != NULL) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "invalid escape sequence '\\%c'", + *first_invalid_escape) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; +} + /* Return a Unicode-Escape string version of the Unicode object. If quotes is true, the string is enclosed in u"" or u'' quotes as diff --git a/Python/ast.c b/Python/ast.c index 76daf6f4462..91e7d0129f4 100644 --- a/Python/ast.c +++ b/Python/ast.c @@ -4113,8 +4113,34 @@ decode_utf8(struct compiling *c, const char **sPtr, const char *end) return PyUnicode_DecodeUTF8(t, s - t, NULL); } +static int +warn_invalid_escape_sequence(struct compiling *c, const node *n, + char first_invalid_escape_char) +{ + PyObject *msg = PyUnicode_FromFormat("invalid escape sequence \\%c", + first_invalid_escape_char); + if (msg == NULL) { + return -1; + } + if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, + c->c_filename, LINENO(n), + NULL, NULL) < 0 && + PyErr_ExceptionMatches(PyExc_DeprecationWarning)) + { + const char *s = PyUnicode_AsUTF8(msg); + if (s != NULL) { + ast_error(c, n, s); + } + Py_DECREF(msg); + return -1; + } + Py_DECREF(msg); + return 0; +} + static PyObject * -decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len) +decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s, + size_t len) { PyObject *v, *u; char *buf; @@ -4167,11 +4193,41 @@ decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len) len = p - buf; s = buf; - v = PyUnicode_DecodeUnicodeEscape(s, len, NULL); + const char *first_invalid_escape; + v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape); + + if (v != NULL && first_invalid_escape != NULL) { + if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) { + /* We have not decref u before because first_invalid_escape points + inside u. */ + Py_XDECREF(u); + Py_DECREF(v); + return NULL; + } + } Py_XDECREF(u); return v; } +static PyObject * +decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s, + size_t len) +{ + const char *first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL, + &first_invalid_escape); + if (result == NULL) + return NULL; + + if (first_invalid_escape != NULL) { + if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; +} + /* Compile this expression in to an expr_ty. Add parens around the expression, in order to allow leading spaces in the expression. */ static expr_ty @@ -4310,7 +4366,7 @@ done: literal_end-literal_start, NULL, NULL); else - *literal = decode_unicode_with_escapes(c, literal_start, + *literal = decode_unicode_with_escapes(c, n, literal_start, literal_end-literal_start); if (!*literal) return -1; @@ -5048,12 +5104,12 @@ parsestr(struct compiling *c, const node *n, int *bytesmode, int *rawmode, if (*rawmode) *result = PyBytes_FromStringAndSize(s, len); else - *result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL); + *result = decode_bytes_with_escapes(c, n, s, len); } else { if (*rawmode) *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); else - *result = decode_unicode_with_escapes(c, s, len); + *result = decode_unicode_with_escapes(c, n, s, len); } return *result == NULL ? -1 : 0; }