mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
gh-96268: Fix loading invalid UTF-8 (GH-96270)
This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:decode_utf8.
It also fixes an off-by-one error, introduced in 3.10, in the line number that the tokenizer reports when it encounters invalid UTF-8.
(cherry picked from commit 8bc356a7dd)
Co-authored-by: Michael Droettboom <mdboom@gmail.com>
This commit is contained in:
parent
9fa21d050a
commit
ffafa9b91d
3 changed files with 57 additions and 16 deletions
|
@ -248,8 +248,10 @@ class UTF8ValidatorTest(unittest.TestCase):
|
|||
# test it is to write actual files to disk.
|
||||
|
||||
# Each example is put inside a string at the top of the file so
|
||||
# it's an otherwise valid Python source file.
|
||||
template = b'"%s"\n'
|
||||
# it's an otherwise valid Python source file. Put some newlines
|
||||
# beforehand so we can assert that the error is reported on the
|
||||
# correct line.
|
||||
template = b'\n\n\n"%s"\n'
|
||||
|
||||
fn = TESTFN
|
||||
self.addCleanup(unlink, fn)
|
||||
|
@ -257,7 +259,12 @@ class UTF8ValidatorTest(unittest.TestCase):
|
|||
def check(content):
|
||||
with open(fn, 'wb') as fp:
|
||||
fp.write(template % content)
|
||||
script_helper.assert_python_failure(fn)
|
||||
rc, stdout, stderr = script_helper.assert_python_failure(fn)
|
||||
# We want to assert that the python subprocess failed gracefully,
|
||||
# not via a signal.
|
||||
self.assertGreaterEqual(rc, 1)
|
||||
self.assertIn(b"Non-UTF-8 code starting with", stderr)
|
||||
self.assertIn(b"on line 4", stderr)
|
||||
|
||||
# continuation bytes in a sequence of 2, 3, or 4 bytes
|
||||
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue