Mirror of https://github.com/python/cpython.git (synced 2025-11-24 12:20:42 +00:00)
[3.14] gh-63161: Fix tokenize.detect_encoding() (GH-139446) (GH-140378)

* Support non-UTF-8 shebangs and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors for non-UTF-8 encodings.
* Detect null bytes in source code.

(cherry picked from commit 38d4b436ca)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>

parent abb3b3142b · commit 8e93f6e203
3 changed files with 94 additions and 8 deletions
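For context, a minimal sketch of the behavior this change enables, assuming Python 3.14 with the fix applied; the source bytes are illustrative:

    import io
    import tokenize

    # A coding cookie declares iso-8859-15 and the same line carries a byte
    # (0xa4) that is not valid UTF-8. Before this fix, detect_encoding()
    # always decoded the line as UTF-8 and rejected it; with the fix, the
    # declared encoding is used for the check.
    source = b"#coding:iso-8859-15 \xa4\nprint('ok')\n"
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
    print(encoding)   # iso-8859-15
    print(consumed)   # [b"#coding:iso-8859-15 \xa4\n"]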
Lib/test/test_tokenize.py
@@ -1495,6 +1495,61 @@ class TestDetectEncoding(TestCase):
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
 
+    def test_first_non_utf8_coding_line(self):
+        lines = (
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_first_utf8_coding_line_error(self):
+        lines = (
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_second_non_utf8_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_second_utf8_coding_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_non_utf8_shebang(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n',
+            b'#coding:iso-8859-15\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_utf8_shebang_error(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'#coding:ascii\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_cookie_second_line_empty_first_line(self):
         lines = (
             b'\n',
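These tests also pin down the error path; a hedged sketch of the failing case, assuming the fix is applied (io.BytesIO stands in for a real file):

    import io
    import tokenize

    # The cookie claims ascii, but the same line holds the UTF-8 bytes
    # \xc3\xa4, which cannot decode as ascii, so detect_encoding() raises.
    bad = b'#coding:ascii \xc3\xa4\nprint(something)\n'
    try:
        tokenize.detect_encoding(io.BytesIO(bad).readline)
    except SyntaxError as exc:
        print(exc)  # invalid or missing encoding declaration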
@@ -1548,6 +1603,28 @@ class TestDetectEncoding(TestCase):
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:1]))
 
+    def test_nul_in_first_coding_line(self):
+        lines = (
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_nul_in_second_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_latin1_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
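A quick sketch of the null-byte rejection these tests cover (illustrative bytes, same assumptions as above):

    import io
    import tokenize

    # A NUL byte right after the coding cookie is now rejected explicitly
    # instead of being passed through to later stages.
    nul = b'#coding:iso8859-15\x00\nprint(something)\n'
    try:
        tokenize.detect_encoding(io.BytesIO(nul).readline)
    except SyntaxError as exc:
        print(exc)  # source code cannot contain null bytes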
Lib/tokenize.py
@@ -36,7 +36,7 @@ from token import *
 from token import EXACT_TOKEN_TYPES
 import _tokenize
 
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
@@ -385,22 +385,23 @@ def detect_encoding(readline):
         except StopIteration:
             return b''
 
-    def find_cookie(line):
+    def check(line, encoding):
+        # Check if the line matches the encoding.
+        if 0 in line:
+            raise SyntaxError("source code cannot contain null bytes")
         try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
+            line.decode(encoding)
         except UnicodeDecodeError:
             msg = "invalid or missing encoding declaration"
             if filename is not None:
                 msg = '{} for {!r}'.format(msg, filename)
             raise SyntaxError(msg)
 
-        match = cookie_re.match(line_string)
+    def find_cookie(line):
+        match = cookie_re.match(line)
         if not match:
             return None
-        encoding = _get_normal_name(match.group(1))
+        encoding = _get_normal_name(match.group(1).decode())
         try:
             codec = lookup(encoding)
         except LookupError:
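In other words, decoding validation is split out of find_cookie() into a new check() helper that also rejects NUL bytes. A simplified standalone sketch of that logic (the real helper also adds the filename to the error message):

    def check(line, encoding):
        # Reject NUL bytes anywhere in the consumed source bytes.
        if 0 in line:
            raise SyntaxError("source code cannot contain null bytes")
        try:
            # Validate against the chosen encoding, not unconditionally UTF-8.
            line.decode(encoding)
        except UnicodeDecodeError:
            raise SyntaxError("invalid or missing encoding declaration")

    check(b'#!/home/\xa4/bin/python\n', 'iso-8859-15')  # passes: 0xa4 is valid here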
@@ -433,18 +434,23 @@ def detect_encoding(readline):
 
     encoding = find_cookie(first)
     if encoding:
+        check(first, encoding)
         return encoding, [first]
     if not blank_re.match(first):
+        check(first, default)
         return default, [first]
 
     second = read_or_stop()
     if not second:
+        check(first, default)
         return default, [first]
 
     encoding = find_cookie(second)
     if encoding:
+        check(first + second, encoding)
         return encoding, [first, second]
 
+    check(first + second, default)
     return default, [first, second]
 
 
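With check() called at every exit point, all consumed lines are validated against whichever encoding wins. A usage sketch mirroring test_non_utf8_shebang above:

    import io
    import tokenize

    src = (b'#!/home/\xa4/bin/python\n'   # non-UTF-8 shebang
           b'#coding:iso-8859-15\n'       # cookie on the second line
           b'print(something)\n')
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(src).readline)
    print(encoding)       # iso-8859-15
    print(len(consumed))  # 2: the shebang and the cookie line were consumed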
Misc/NEWS.d entry (new file)
@@ -0,0 +1,3 @@
+Fix :func:`tokenize.detect_encoding`. Support non-UTF-8 shebang and comments
+if non-UTF-8 encoding is specified. Detect decoding error for non-UTF-8
+encoding. Detect null bytes in source code.