mirror of
https://github.com/python/cpython.git
synced 2025-10-09 16:34:44 +00:00
gh-96268: Fix loading invalid UTF-8 (GH-96270)
This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:utf8_decode.
It also fixes an off-by-one error, introduced in 3.10, in the line number reported when the tokenizer encounters invalid UTF-8.
(cherry picked from commit 8bc356a7dd)
Co-authored-by: Michael Droettboom <mdboom@gmail.com>
This commit is contained in:
parent
9fa21d050a
commit
ffafa9b91d
3 changed files with 57 additions and 16 deletions
|
@ -248,8 +248,10 @@ class UTF8ValidatorTest(unittest.TestCase):
|
||||||
# test it is to write actual files to disk.
|
# test it is to write actual files to disk.
|
||||||
|
|
||||||
# Each example is put inside a string at the top of the file so
|
# Each example is put inside a string at the top of the file so
|
||||||
# it's an otherwise valid Python source file.
|
# it's an otherwise valid Python source file. Put some newlines
|
||||||
template = b'"%s"\n'
|
# beforehand so we can assert that the error is reported on the
|
||||||
|
# correct line.
|
||||||
|
template = b'\n\n\n"%s"\n'
|
||||||
|
|
||||||
fn = TESTFN
|
fn = TESTFN
|
||||||
self.addCleanup(unlink, fn)
|
self.addCleanup(unlink, fn)
|
||||||
|
@ -257,7 +259,12 @@ class UTF8ValidatorTest(unittest.TestCase):
|
||||||
def check(content):
|
def check(content):
|
||||||
with open(fn, 'wb') as fp:
|
with open(fn, 'wb') as fp:
|
||||||
fp.write(template % content)
|
fp.write(template % content)
|
||||||
script_helper.assert_python_failure(fn)
|
rc, stdout, stderr = script_helper.assert_python_failure(fn)
|
||||||
|
# We want to assert that the python subprocess failed gracefully,
|
||||||
|
# not via a signal.
|
||||||
|
self.assertGreaterEqual(rc, 1)
|
||||||
|
self.assertIn(b"Non-UTF-8 code starting with", stderr)
|
||||||
|
self.assertIn(b"on line 4", stderr)
|
||||||
|
|
||||||
# continuation bytes in a sequence of 2, 3, or 4 bytes
|
# continuation bytes in a sequence of 2, 3, or 4 bytes
|
||||||
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
|
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Loading a file with invalid UTF-8 will now report the broken character at
|
||||||
|
the correct location.
|
|
@ -486,25 +486,59 @@ static void fp_ungetc(int c, struct tok_state *tok) {
|
||||||
|
|
||||||
/* Check whether the characters at s start a valid
|
/* Check whether the characters at s start a valid
|
||||||
UTF-8 sequence. Return the number of characters forming
|
UTF-8 sequence. Return the number of characters forming
|
||||||
the sequence if yes, 0 if not. */
|
the sequence if yes, 0 if not. The special cases match
|
||||||
static int valid_utf8(const unsigned char* s)
|
those in stringlib/codecs.h:utf8_decode.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
valid_utf8(const unsigned char* s)
|
||||||
{
|
{
|
||||||
int expected = 0;
|
int expected = 0;
|
||||||
int length;
|
int length;
|
||||||
if (*s < 0x80)
|
if (*s < 0x80) {
|
||||||
/* single-byte code */
|
/* single-byte code */
|
||||||
return 1;
|
return 1;
|
||||||
if (*s < 0xc0)
|
}
|
||||||
/* following byte */
|
else if (*s < 0xE0) {
|
||||||
return 0;
|
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
|
||||||
if (*s < 0xE0)
|
if (*s < 0xC2) {
|
||||||
|
/* invalid sequence
|
||||||
|
\x80-\xBF -- continuation byte
|
||||||
|
\xC0-\xC1 -- fake 0000-007F */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
expected = 1;
|
expected = 1;
|
||||||
else if (*s < 0xF0)
|
}
|
||||||
|
else if (*s < 0xF0) {
|
||||||
|
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
|
||||||
|
if (*s == 0xE0 && *(s + 1) < 0xA0) {
|
||||||
|
/* invalid sequence
|
||||||
|
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else if (*s == 0xED && *(s + 1) >= 0xA0) {
|
||||||
|
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
|
||||||
|
will result in surrogates in range D800-DFFF. Surrogates are
|
||||||
|
not valid UTF-8 so they are rejected.
|
||||||
|
See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
|
||||||
|
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
expected = 2;
|
expected = 2;
|
||||||
else if (*s < 0xF8)
|
}
|
||||||
|
else if (*s < 0xF5) {
|
||||||
|
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
|
||||||
|
if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
|
||||||
|
/* invalid sequence -- one of:
|
||||||
|
\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
|
||||||
|
\xF4\x90\x80\x80- -- 110000- overflow */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
expected = 3;
|
expected = 3;
|
||||||
else
|
}
|
||||||
|
else {
|
||||||
|
/* invalid start byte */
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
length = expected + 1;
|
length = expected + 1;
|
||||||
for (; expected; expected--)
|
for (; expected; expected--)
|
||||||
if (s[expected] < 0x80 || s[expected] >= 0xC0)
|
if (s[expected] < 0x80 || s[expected] >= 0xC0)
|
||||||
|
@ -525,14 +559,12 @@ ensure_utf8(char *line, struct tok_state *tok)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (badchar) {
|
if (badchar) {
|
||||||
/* Need to add 1 to the line number, since this line
|
|
||||||
has not been counted, yet. */
|
|
||||||
PyErr_Format(PyExc_SyntaxError,
|
PyErr_Format(PyExc_SyntaxError,
|
||||||
"Non-UTF-8 code starting with '\\x%.2x' "
|
"Non-UTF-8 code starting with '\\x%.2x' "
|
||||||
"in file %U on line %i, "
|
"in file %U on line %i, "
|
||||||
"but no encoding declared; "
|
"but no encoding declared; "
|
||||||
"see https://peps.python.org/pep-0263/ for details",
|
"see https://peps.python.org/pep-0263/ for details",
|
||||||
badchar, tok->filename, tok->lineno + 1);
|
badchar, tok->filename, tok->lineno);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue