Latin-1 source code was not being properly decoded when passed through
compile(). This was due to left-over special-casing before UTF-8 became the
default source encoding.

Closes issue #3574. Thanks to Victor Stinner for help with the patch.
2024-11-28 20:33:54 +08:00 · 2008-10-17 03:38:50 +00:00 · 2008-10-17 03:38:50 +00:00 · da78043237
commit da78043237
parent 9e9dcd6d42
5 changed files with 24 additions and 10 deletions
--- a/Lib/test/test_pep3120.py
+++ b/Lib/test/test_pep3120.py
@@ -23,8 +23,24 @@ class PEP3120Test(unittest.TestCase):
        else:
            self.fail("expected exception didn't occur")

+
+class BuiltinCompileTests(unittest.TestCase):
+
+    # Issue 3574.
+    def test_latin1(self):
+        # Allow compile() to read Latin-1 source.
+        source_code = '# coding: Latin-1\nu = "Ç"\n'.encode("Latin-1")
+        try:
+            code = compile(source_code, '<dummy>', 'exec')
+        except SyntaxError:
+            self.fail("compile() cannot handle Latin-1 source")
+        ns = {}
+        exec(code, ns)
+        self.assertEqual('Ç', ns['u'])
+
+
 def test_main():
-    support.run_unittest(PEP3120Test)
+    support.run_unittest(PEP3120Test, BuiltinCompileTests)

 if __name__=="__main__":
    test_main()
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -15,6 +15,8 @@ What's New in Python 3.0 beta 5
 Core and Builtins
 -----------------

+- Issue #3574: compile() incorrectly handled source code encoded as Latin-1.
+
 - Issues #2384 and #3975: Tracebacks were not correctly printed when the
  source file contains a ``coding:`` header: the wrong line was displayed, and
  the encoding was not respected.
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -135,6 +135,7 @@ tok_new(void)
 	tok->decoding_state = STATE_INIT;
 	tok->decoding_erred = 0;
 	tok->read_coding_spec = 0;
+	tok->enc = NULL;
 	tok->encoding = NULL;
        tok->cont_line = 0;
 #ifndef PGEN
@@ -274,8 +275,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 		tok->read_coding_spec = 1;
 		if (tok->encoding == NULL) {
 			assert(tok->decoding_state == STATE_RAW);
-			if (strcmp(cs, "utf-8") == 0 ||
-			    strcmp(cs, "iso-8859-1") == 0) {
+			if (strcmp(cs, "utf-8") == 0) {
 				tok->encoding = cs;
 			} else {
 				r = set_readline(tok, cs);
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -49,14 +49,14 @@ struct tok_state {
 	enum decoding_state decoding_state;
 	int decoding_erred;	/* whether erred in decoding  */
 	int read_coding_spec;	/* whether 'coding:...' has been read  */
-	char *encoding;
+	char *encoding;         /* Source encoding. */
 	int cont_line;          /* whether we are in a continuation line. */
 	const char* line_start;	/* pointer to start of current line */
 #ifndef PGEN
 	PyObject *decoding_readline; /* codecs.open(...).readline */
 	PyObject *decoding_buffer;
 #endif
-	const char* enc;
+	const char* enc;        /* Encoding for the current str. */
 	const char* str;
 };

--- a/Python/ast.c
+++ b/Python/ast.c
@@ -3160,9 +3160,6 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
    if (encoding == NULL) {
        buf = (char *)s;
        u = NULL;
-    } else if (strcmp(encoding, "iso-8859-1") == 0) {
-        buf = (char *)s;
-        u = NULL;
    } else {
        /* check for integer overflow */
        if (len > PY_SIZE_MAX / 4)
@@ -3275,8 +3272,7 @@ parsestr(struct compiling *c, const node *n, int *bytesmode)
        }
    }
    need_encoding = (!*bytesmode && c->c_encoding != NULL &&
-                     strcmp(c->c_encoding, "utf-8") != 0 &&
-                     strcmp(c->c_encoding, "iso-8859-1") != 0);
+                     strcmp(c->c_encoding, "utf-8") != 0);
    if (rawmode || strchr(s, '\\') == NULL) {
        if (need_encoding) {
            PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL);