gh-96268: Fix loading invalid UTF-8 (GH-96270)

This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:utf8_decode. It also fixes an off-by-one error, introduced in 3.10, in the line number reported when the tokenizer encounters invalid UTF-8. (cherry picked from commit 8bc356a7dd) Co-authored-by: Michael Droettboom <mdboom@gmail.com>
2025-10-09 16:34:44 +00:00 · 2022-09-07 14:49:17 -07:00 · 2022-09-07 14:49:17 -07:00 · ffafa9b91d
commit ffafa9b91d
parent 9fa21d050a
3 changed files with 57 additions and 16 deletions
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@ -248,8 +248,10 @@ class UTF8ValidatorTest(unittest.TestCase):
        # test it is to write actual files to disk.
        # Each example is put inside a string at the top of the file so
-        # it's an otherwise valid Python source file.
+        # it's an otherwise valid Python source file. Put some newlines
-        template = b'"%s"\n'
+        # beforehand so we can assert that the error is reported on the
+        # correct line.
+        template = b'\n\n\n"%s"\n'
        fn = TESTFN
        self.addCleanup(unlink, fn)
@ -257,7 +259,12 @@ class UTF8ValidatorTest(unittest.TestCase):
        def check(content):
            with open(fn, 'wb') as fp:
                fp.write(template % content)
-            script_helper.assert_python_failure(fn)
+            rc, stdout, stderr = script_helper.assert_python_failure(fn)
+            # We want to assert that the python subprocess failed gracefully,
+            # not via a signal.
+            self.assertGreaterEqual(rc, 1)
+            self.assertIn(b"Non-UTF-8 code starting with", stderr)
+            self.assertIn(b"on line 4", stderr)
        # continuation bytes in a sequence of 2, 3, or 4 bytes
        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
--- a/Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst
+++ b/Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst
@ -0,0 +1,2 @@
 Loading a file with invalid UTF-8 will now report the broken character at
 the correct location.
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -486,25 +486,59 @@ static void fp_ungetc(int c, struct tok_state *tok) {
 /* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
-   the sequence if yes, 0 if not.  */
+   the sequence if yes, 0 if not.  The special cases match
-static int valid_utf8(const unsigned char* s)
+   those in stringlib/codecs.h:utf8_decode.
 */
 static int
 valid_utf8(const unsigned char* s)
 {
    int expected = 0;
    int length;
-    if (*s < 0x80)
+    if (*s < 0x80) {
        /* single-byte code */
        return 1;
-    if (*s < 0xc0)
+    }
-        /* following byte */
+    else if (*s < 0xE0) {
-        return 0;
+        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
-    if (*s < 0xE0)
+        if (*s < 0xC2) {
            /* invalid sequence
               \x80-\xBF -- continuation byte
               \xC0-\xC1 -- fake 0000-007F */
            return 0;
        }
        expected = 1;
-    else if (*s < 0xF0)
+    }
    else if (*s < 0xF0) {
        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
        if (*s == 0xE0 && *(s + 1) < 0xA0) {
            /* invalid sequence
               \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
            return 0;
        }
        else if (*s == 0xED && *(s + 1) >= 0xA0) {
            /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
               will result in surrogates in range D800-DFFF. Surrogates are
               not valid UTF-8 so they are rejected.
               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            return 0;
        }
        expected = 2;
-    else if (*s < 0xF8)
+    }
    else if (*s < 0xF5) {
        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
        if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
            /* invalid sequence -- one of:
               \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
               \xF4\x90\x80\x80- -- 110000- overflow */
            return 0;
        }
        expected = 3;
-    else
+    }
    else {
        /* invalid start byte */
        return 0;
    }
    length = expected + 1;
    for (; expected; expected--)
        if (s[expected] < 0x80 || s[expected] >= 0xC0)
@ -525,14 +559,12 @@ ensure_utf8(char *line, struct tok_state *tok)
        }
    }
    if (badchar) {
-        /* Need to add 1 to the line number, since this line
-       has not been counted, yet.  */
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see https://peps.python.org/pep-0263/ for details",
-                     badchar, tok->filename, tok->lineno + 1);
+                     badchar, tok->filename, tok->lineno);
        return 0;
    }
    return 1;
		`@ -0,0 +1,2 @@`
							`Loading a file with invalid UTF-8 will now report the broken character at`
							`the correct location.`