bpo-46054: Fix parsing error when parsing non-utf8 characters in source files (GH-30068)

2025-10-10 00:43:41 +00:00 · 2021-12-12 07:06:50 +00:00 · 2021-12-12 07:06:50 +00:00 · 4325a766f5
commit 4325a766f5
parent 59435eea08
3 changed files with 19 additions and 8 deletions
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -2387,6 +2387,18 @@ class SyntaxErrorTests(unittest.TestCase):
        finally:
            unlink(TESTFN)
    def test_non_utf8(self):
        # Write a script whose bytes start with 0x7F followed by "ELF" and a
        # few control/NUL bytes, run it through script_helper with UTF-8 mode
        # switched on, and check that the last line of stderr is the
        # SyntaxError naming U+007F.
        payload = b'\x7fELF\x02\x01\x01\x00\x00\x00'
        try:
            with open(TESTFN, 'bw') as script:
                script.write(payload)
            result = script_helper.assert_python_failure(
                '-Wd', '-X', 'utf8', TESTFN)
            _, _, stderr = result
            stderr_lines = stderr.decode('utf-8').splitlines()
            self.assertEqual(
                stderr_lines[-1],
                "SyntaxError: invalid non-printable character U+007F")
        finally:
            # Always remove the temporary script, even if the checks fail.
            unlink(TESTFN)
    def test_attributes_new_constructor(self):
        args = ("bad.py", 1, 2, "abcdefg", 1, 100)
        the_exception = SyntaxError("bad bad", args)
--- a/Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst
+++ b/Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst
@@ -0,0 +1,2 @@
+Fix parser error when parsing non-utf8 characters in source files. Patch by
+Pablo Galindo.
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -819,10 +819,10 @@ tok_readline_raw(struct tok_state *tok)
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
-        if (*tok->inp == '\0') {
+        tok->inp = strchr(tok->inp, '\0');
+        if (tok->inp == tok->buf) {
            return 0;
        }
-        tok->inp = strchr(tok->inp, '\0');
    } while (tok->inp[-1] != '\n');
    return 1;
 }
@@ -984,13 +984,10 @@ tok_underflow_file(struct tok_state *tok) {
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
-    if (!tok->encoding
-        && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
-        if (!ensure_utf8(tok->cur, tok)) {
-            error_ret(tok);
-            return 0;
-        }
+    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
+        error_ret(tok);
+        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
 }
		`@ -0,0 +1,2 @@`
							`Fix parser error when parsing non-utf8 characters in source files. Patch by`
							`Pablo Galindo.`