Mirror of https://github.com/python/cpython.git (synced 2025-11-24 12:20:42 +00:00)
[3.14] gh-63161: Fix tokenize.detect_encoding() (GH-139446) (GH-140378)

* Support non-UTF-8 shebangs and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors for non-UTF-8 encodings.
* Detect null bytes in source code.

(cherry picked from commit 38d4b436ca)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>

parent abb3b3142b · commit 8e93f6e203
3 changed files with 94 additions and 8 deletions
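For context, a minimal sketch of the behavior this change enables, assuming Python 3.14 with the fix applied; the source bytes are illustrative:

    import io
    import tokenize

    # A coding cookie declares iso-8859-15 and the same line carries a byte
    # (0xa4) that is not valid UTF-8. Before this fix, detect_encoding()
    # always decoded the line as UTF-8 and rejected it; with the fix, the
    # declared encoding is used for the check.
    source = b"#coding:iso-8859-15 \xa4\nprint('ok')\n"
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
    print(encoding)   # iso-8859-15
    print(consumed)   # [b"#coding:iso-8859-15 \xa4\n"]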
Lib/test/test_tokenize.py
@@ -1495,6 +1495,61 @@ class TestDetectEncoding(TestCase):
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
 
+    def test_first_non_utf8_coding_line(self):
+        lines = (
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_first_utf8_coding_line_error(self):
+        lines = (
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_second_non_utf8_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_second_utf8_coding_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_non_utf8_shebang(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n',
+            b'#coding:iso-8859-15\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_utf8_shebang_error(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'#coding:ascii\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_cookie_second_line_empty_first_line(self):
         lines = (
             b'\n',
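These tests also pin down the error path; a hedged sketch of the failing case, assuming the fix is applied (io.BytesIO stands in for a real file):

    import io
    import tokenize

    # The cookie claims ascii, but the same line holds the UTF-8 bytes
    # \xc3\xa4, which cannot decode as ascii, so detect_encoding() raises.
    bad = b'#coding:ascii \xc3\xa4\nprint(something)\n'
    try:
        tokenize.detect_encoding(io.BytesIO(bad).readline)
    except SyntaxError as exc:
        print(exc)  # invalid or missing encoding declaration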
@@ -1548,6 +1603,28 @@ class TestDetectEncoding(TestCase):
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:1]))
 
+    def test_nul_in_first_coding_line(self):
+        lines = (
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_nul_in_second_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_latin1_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
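A quick sketch of the null-byte rejection these tests cover (illustrative bytes, same assumptions as above):

    import io
    import tokenize

    # A NUL byte right after the coding cookie is now rejected explicitly
    # instead of being passed through to later stages.
    nul = b'#coding:iso8859-15\x00\nprint(something)\n'
    try:
        tokenize.detect_encoding(io.BytesIO(nul).readline)
    except SyntaxError as exc:
        print(exc)  # source code cannot contain null bytes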
Lib/tokenize.py
@@ -36,7 +36,7 @@ from token import *
 from token import EXACT_TOKEN_TYPES
 import _tokenize
 
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
@@ -385,22 +385,23 @@ def detect_encoding(readline):
         except StopIteration:
             return b''
 
-    def find_cookie(line):
+    def check(line, encoding):
+        # Check if the line matches the encoding.
+        if 0 in line:
+            raise SyntaxError("source code cannot contain null bytes")
         try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
+            line.decode(encoding)
         except UnicodeDecodeError:
             msg = "invalid or missing encoding declaration"
             if filename is not None:
                 msg = '{} for {!r}'.format(msg, filename)
             raise SyntaxError(msg)
 
-        match = cookie_re.match(line_string)
+    def find_cookie(line):
+        match = cookie_re.match(line)
         if not match:
             return None
-        encoding = _get_normal_name(match.group(1))
+        encoding = _get_normal_name(match.group(1).decode())
         try:
             codec = lookup(encoding)
         except LookupError:
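In other words, decoding validation is split out of find_cookie() into a new check() helper that also rejects NUL bytes. A simplified standalone sketch of that logic (the real helper also adds the filename to the error message):

    def check(line, encoding):
        # Reject NUL bytes anywhere in the consumed source bytes.
        if 0 in line:
            raise SyntaxError("source code cannot contain null bytes")
        try:
            # Validate against the chosen encoding, not unconditionally UTF-8.
            line.decode(encoding)
        except UnicodeDecodeError:
            raise SyntaxError("invalid or missing encoding declaration")

    check(b'#!/home/\xa4/bin/python\n', 'iso-8859-15')  # passes: 0xa4 is valid here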
@@ -433,18 +434,23 @@ def detect_encoding(readline):
 
     encoding = find_cookie(first)
     if encoding:
+        check(first, encoding)
         return encoding, [first]
     if not blank_re.match(first):
+        check(first, default)
         return default, [first]
 
     second = read_or_stop()
     if not second:
+        check(first, default)
         return default, [first]
 
     encoding = find_cookie(second)
     if encoding:
+        check(first + second, encoding)
         return encoding, [first, second]
 
+    check(first + second, default)
     return default, [first, second]
 
 
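With check() called at every exit point, all consumed lines are validated against whichever encoding wins. A usage sketch mirroring test_non_utf8_shebang above:

    import io
    import tokenize

    src = (b'#!/home/\xa4/bin/python\n'   # non-UTF-8 shebang
           b'#coding:iso-8859-15\n'       # cookie on the second line
           b'print(something)\n')
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(src).readline)
    print(encoding)       # iso-8859-15
    print(len(consumed))  # 2: the shebang and the cookie line were consumed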
Misc/NEWS.d entry (new file)
@@ -0,0 +1,3 @@
+Fix :func:`tokenize.detect_encoding`. Support non-UTF-8 shebang and comments
+if non-UTF-8 encoding is specified. Detect decoding error for non-UTF-8
+encoding. Detect null bytes in source code.