bpo-40334: Improve column offsets for thrown syntax errors by Pegen (GH-19782)

2024-11-23 18:04:37 +08:00 · 2020-05-01 16:13:43 +03:00 · 2020-05-01 16:13:43 +03:00 · 76c1b4d5c5
commit 76c1b4d5c5
parent 719e14d283
6 changed files with 80 additions and 112 deletions
--- a/Grammar/python.gram
+++ b/Grammar/python.gram
@ -609,7 +609,7 @@ invalid_assignment:
    | expression ':' expression ['=' annotated_rhs] {
        RAISE_SYNTAX_ERROR("illegal target for annotation") }
    | a=expression ('=' | augassign) (yield_expr | star_expressions) {
-        RAISE_SYNTAX_ERROR("cannot assign to %s", _PyPegen_get_expr_name(a)) }
+        RAISE_SYNTAX_ERROR_NO_COL_OFFSET("cannot assign to %s", _PyPegen_get_expr_name(a)) }
 invalid_block:
    | NEWLINE !INDENT { RAISE_INDENTATION_ERROR("expected an indented block") }
 invalid_comprehension:
--- a/Lib/test/test_cmd_line_script.py
+++ b/Lib/test/test_cmd_line_script.py
@ -599,7 +599,7 @@ class CmdLineTest(unittest.TestCase):
            exitcode, stdout, stderr = assert_python_failure(script_name)
            text = io.TextIOWrapper(io.BytesIO(stderr), 'ascii').read()
            # Confirm that the caret is located under the first 1 character
-            self.assertIn("\n    1 + 1 = 2\n            ^", text)
+            self.assertIn("\n    1 + 1 = 2\n    ^", text)

    def test_syntaxerror_indented_caret_position(self):
        script = textwrap.dedent("""\
@ -611,7 +611,7 @@ class CmdLineTest(unittest.TestCase):
            exitcode, stdout, stderr = assert_python_failure(script_name)
            text = io.TextIOWrapper(io.BytesIO(stderr), 'ascii').read()
            # Confirm that the caret is located under the first 1 character
-            self.assertIn("\n    1 + 1 = 2\n            ^", text)
+            self.assertIn("\n    1 + 1 = 2\n    ^", text)

            # Try the same with a form feed at the start of the indented line
            script = (
@ -622,7 +622,7 @@ class CmdLineTest(unittest.TestCase):
            exitcode, stdout, stderr = assert_python_failure(script_name)
            text = io.TextIOWrapper(io.BytesIO(stderr), "ascii").read()
            self.assertNotIn("\f", text)
-            self.assertIn("\n    1 + 1 = 2\n            ^", text)
+            self.assertIn("\n    1 + 1 = 2\n    ^", text)

    def test_syntaxerror_multi_line_fstring(self):
        script = 'foo = f"""{}\nfoo"""\n'
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@ -178,19 +178,19 @@ class ExceptionTests(unittest.TestCase):
        s = '''if True:\n        print()\n\texec "mixed tabs and spaces"'''
        ckmsg(s, "inconsistent use of tabs and spaces in indentation", TabError)

-    @support.skip_if_new_parser("Pegen column offsets might be different")
-    def testSyntaxErrorOffset(self):
-        def check(src, lineno, offset, encoding='utf-8'):
-            with self.assertRaises(SyntaxError) as cm:
-                compile(src, '<fragment>', 'exec')
-            self.assertEqual(cm.exception.lineno, lineno)
-            self.assertEqual(cm.exception.offset, offset)
-            if cm.exception.text is not None:
-                if not isinstance(src, str):
-                    src = src.decode(encoding, 'replace')
-                line = src.split('\n')[lineno-1]
-                self.assertIn(line, cm.exception.text)
+    def check(self, src, lineno, offset, encoding='utf-8'):
+        with self.assertRaises(SyntaxError) as cm:
+            compile(src, '<fragment>', 'exec')
+        self.assertEqual(cm.exception.lineno, lineno)
+        self.assertEqual(cm.exception.offset, offset)
+        if cm.exception.text is not None:
+            if not isinstance(src, str):
+                src = src.decode(encoding, 'replace')
+            line = src.split('\n')[lineno-1]
+            self.assertIn(line, cm.exception.text)

+    def testSyntaxErrorOffset(self):
+        check = self.check
        check('def fact(x):\n\treturn x!\n', 2, 10)
        check('1 +\n', 1, 4)
        check('def spam():\n  print(1)\n print(2)', 3, 10)
@ -238,20 +238,20 @@ class ExceptionTests(unittest.TestCase):
        check('nonlocal x', 1, 1)
        check('def f():\n  global x\n  nonlocal x', 2, 3)

-        # Errors thrown by ast.c
-        check('for 1 in []: pass', 1, 5)
-        check('def f(*):\n  pass', 1, 7)
-        check('[*x for x in xs]', 1, 2)
-        check('def f():\n  x, y: int', 2, 3)
-        check('(yield i) = 2', 1, 1)
-        check('foo(x for x in range(10), 100)', 1, 5)
-        check('foo(1=2)', 1, 5)
-
        # Errors thrown by future.c
        check('from __future__ import doesnt_exist', 1, 1)
        check('from __future__ import braces', 1, 1)
        check('x=1\nfrom __future__ import division', 2, 1)

+    @support.skip_if_new_parser("Pegen column offsets might be different")
+    def testSyntaxErrorOffsetCustom(self):
+        self.check('for 1 in []: pass', 1, 5)
+        self.check('def f(*):\n  pass', 1, 7)
+        self.check('[*x for x in xs]', 1, 2)
+        self.check('def f():\n  x, y: int', 2, 3)
+        self.check('(yield i) = 2', 1, 1)
+        self.check('foo(x for x in range(10), 100)', 1, 5)
+        self.check('foo(1=2)', 1, 5)

    @cpython_only
    def testSettingException(self):
--- a/Parser/pegen/parse.c
+++ b/Parser/pegen/parse.c
@ -10515,7 +10515,7 @@ invalid_assignment_rule(Parser *p)
            (_tmp_132_var = _tmp_132_rule(p))
        )
        {
-            res = RAISE_SYNTAX_ERROR ( "cannot assign to %s" , _PyPegen_get_expr_name ( a ) );
+            res = RAISE_SYNTAX_ERROR_NO_COL_OFFSET ( "cannot assign to %s" , _PyPegen_get_expr_name ( a ) );
            if (res == NULL && PyErr_Occurred()) {
                p->error_indicator = 1;
                return NULL;
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@ -145,11 +145,15 @@ byte_offset_to_character_offset(PyObject *line, int col_offset)
    if (!str) {
        return 0;
    }
-    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL);
+    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
    if (!text) {
        return 0;
    }
    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
+    str = PyUnicode_AsUTF8(text);
+    if (str != NULL && (int)strlen(str) == col_offset) {
+        size = strlen(str);
+    }
    Py_DECREF(text);
    return size;
 }
@ -297,68 +301,23 @@ error:
 }

 static inline PyObject *
-get_error_line(char *buffer)
+get_error_line(char *buffer, int is_file)
 {
-    char *newline = strchr(buffer, '\n');
+    const char *newline;
+    if (is_file) {
+        newline = strrchr(buffer, '\n');
+    } else {
+        newline = strchr(buffer, '\n');
+    }
+
    if (newline) {
-        return PyUnicode_FromStringAndSize(buffer, newline - buffer);
+        return PyUnicode_DecodeUTF8(buffer, newline - buffer, "replace");
    }
    else {
-        return PyUnicode_FromString(buffer);
+        return PyUnicode_DecodeUTF8(buffer, strlen(buffer), "replace");
    }
 }

-static int
-tokenizer_error_with_col_offset(Parser *p, PyObject *errtype, const char *errmsg)
-{
-    PyObject *errstr = NULL;
-    PyObject *value = NULL;
-    size_t col_number = -1;
-
-    errstr = PyUnicode_FromString(errmsg);
-    if (!errstr) {
-        return -1;
-    }
-
-    PyObject *loc = NULL;
-    if (p->start_rule == Py_file_input) {
-        loc = PyErr_ProgramTextObject(p->tok->filename, p->tok->lineno);
-    }
-    if (!loc) {
-        loc = get_error_line(p->tok->buf);
-    }
-
-    if (loc) {
-        col_number = p->tok->cur - p->tok->buf;
-    }
-    else {
-        Py_INCREF(Py_None);
-        loc = Py_None;
-    }
-
-    PyObject *tmp = Py_BuildValue("(OiiN)", p->tok->filename, p->tok->lineno,
-                                  col_number, loc);
-    if (!tmp) {
-        goto error;
-    }
-
-    value = PyTuple_Pack(2, errstr, tmp);
-    Py_DECREF(tmp);
-    if (!value) {
-        goto error;
-    }
-    PyErr_SetObject(errtype, value);
-
-    Py_XDECREF(value);
-    Py_XDECREF(errstr);
-    return -1;
-
-error:
-    Py_XDECREF(errstr);
-    Py_XDECREF(loc);
-    return -1;
-}
-
 static int
 tokenizer_error(Parser *p)
 {
@ -376,20 +335,20 @@ tokenizer_error(Parser *p)
            msg = "invalid character in identifier";
            break;
        case E_BADPREFIX:
-            return tokenizer_error_with_col_offset(p,
-                errtype, "invalid string prefix");
+            RAISE_SYNTAX_ERROR("invalid string prefix");
+            return -1;
        case E_EOFS:
-            return tokenizer_error_with_col_offset(p,
-                errtype, "EOF while scanning triple-quoted string literal");
+            RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
+            return -1;
        case E_EOLS:
-            return tokenizer_error_with_col_offset(p,
-                errtype, "EOL while scanning string literal");
+            RAISE_SYNTAX_ERROR("EOL while scanning string literal");
+            return -1;
        case E_EOF:
-            return tokenizer_error_with_col_offset(p,
-                errtype, "unexpected EOF while parsing");
+            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
+            return -1;
        case E_DEDENT:
-            return tokenizer_error_with_col_offset(p,
-                PyExc_IndentationError, "unindent does not match any outer indentation level");
+            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
+            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
@ -421,14 +380,14 @@ tokenizer_error(Parser *p)
 }

 void *
-_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
+_PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const char *errmsg, ...)
 {
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *loc = NULL;
    PyObject *tmp = NULL;
    Token *t = p->tokens[p->fill - 1];
-    Py_ssize_t col_number = 0;
+    Py_ssize_t col_number = !with_col_number;
    va_list va;

    va_start(va, errmsg);
@ -443,14 +402,20 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
    }

    if (!loc) {
-        loc = get_error_line(p->tok->buf);
+        loc = get_error_line(p->tok->buf, p->start_rule == Py_file_input);
    }

-    if (loc) {
-        int col_offset = t->col_offset == -1 ? 0 : t->col_offset;
-        col_number = byte_offset_to_character_offset(loc, col_offset) + 1;
+    if (loc && with_col_number) {
+        int col_offset;
+        if (t->col_offset == -1) {
+            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - p->tok->buf,
+                                          intptr_t, int);
+        } else {
+            col_offset = t->col_offset + 1;
+        }
+        col_number = byte_offset_to_character_offset(loc, col_offset);
    }
-    else {
+    else if (!loc) {
        Py_INCREF(Py_None);
        loc = Py_None;
    }
@ -632,14 +597,6 @@ _PyPegen_fill_token(Parser *p)
        type = PyTokenizer_Get(p->tok, &start, &end);
    }

-    if (type == ERRORTOKEN) {
-        if (p->tok->done == E_DECODE) {
-            return raise_decode_error(p);
-        }
-        else {
-            return tokenizer_error(p);
-        }
-    }
    if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) {
        type = NEWLINE; /* Add an extra newline */
        p->parsing_started = 0;
@ -700,6 +657,16 @@ _PyPegen_fill_token(Parser *p)
    t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset;

    p->fill += 1;
+
+    if (type == ERRORTOKEN) {
+        if (p->tok->done == E_DECODE) {
+            return raise_decode_error(p);
+        }
+        else {
+            return tokenizer_error(p);
+        }
+    }
+
    return 0;
 }

--- a/Parser/pegen/pegen.h
+++ b/Parser/pegen/pegen.h
@ -126,14 +126,15 @@ expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
 const char *_PyPegen_get_expr_name(expr_ty);
-void *_PyPegen_raise_error(Parser *p, PyObject *, const char *errmsg, ...);
+void *_PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const char *errmsg, ...);
 void *_PyPegen_dummy_name(Parser *p, ...);

 #define UNUSED(expr) do { (void)(expr); } while (0)
 #define EXTRA_EXPR(head, tail) head->lineno, head->col_offset, tail->end_lineno, tail->end_col_offset, p->arena
 #define EXTRA start_lineno, start_col_offset, end_lineno, end_col_offset, p->arena
-#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, msg, ##__VA_ARGS__)
-#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, msg, ##__VA_ARGS__)
+#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 1, msg, ##__VA_ARGS__)
+#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, 1, msg, ##__VA_ARGS__)
+#define RAISE_SYNTAX_ERROR_NO_COL_OFFSET(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 0, msg, ##__VA_ARGS__)

 Py_LOCAL_INLINE(void *)
 CHECK_CALL(Parser *p, void *result)
@ -190,8 +191,8 @@ INVALID_VERSION_CHECK(Parser *p, int version, char *msg, void *node)
    }
    if (p->feature_version < version) {
        p->error_indicator = 1;
-        return _PyPegen_raise_error(p, PyExc_SyntaxError, "%s only supported in Python 3.%i and greater",
-                                    msg, version);
+        return RAISE_SYNTAX_ERROR("%s only supported in Python 3.%i and greater",
+                                  msg, version);
    }
    return node;
 }