gh-96268: Fix loading invalid UTF-8 (GH-96270)

This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:decode_utf8. It also fixes an off-by-one error, introduced in 3.10, in the line number reported when the tokenizer encounters invalid UTF-8. (cherry picked from commit 8bc356a7dd) Co-authored-by: Michael Droettboom <mdboom@gmail.com>
2025-10-06 23:21:06 +00:00 · 2022-09-07 14:49:17 -07:00 · 2022-09-07 14:49:17 -07:00 · ffafa9b91d
commit ffafa9b91d
parent 9fa21d050a
3 changed files with 57 additions and 16 deletions
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -248,8 +248,10 @@ class UTF8ValidatorTest(unittest.TestCase):
        # test it is to write actual files to disk.

        # Each example is put inside a string at the top of the file so
-        # it's an otherwise valid Python source file.
-        template = b'"%s"\n'
+        # it's an otherwise valid Python source file. Put some newlines
+        # beforehand so we can assert that the error is reported on the
+        # correct line.
+        template = b'\n\n\n"%s"\n'

        fn = TESTFN
        self.addCleanup(unlink, fn)
@@ -257,7 +259,12 @@ class UTF8ValidatorTest(unittest.TestCase):
        def check(content):
            with open(fn, 'wb') as fp:
                fp.write(template % content)
-            script_helper.assert_python_failure(fn)
+            rc, stdout, stderr = script_helper.assert_python_failure(fn)
+            # We want to assert that the python subprocess failed gracefully,
+            # not via a signal.
+            self.assertGreaterEqual(rc, 1)
+            self.assertIn(b"Non-UTF-8 code starting with", stderr)
+            self.assertIn(b"on line 4", stderr)

        # continuation bytes in a sequence of 2, 3, or 4 bytes
        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]