Issue #14629: Raise SyntaxError in tokenize.detect_encoding

if the first two lines have non-UTF-8 characters without an encoding declaration.
2025-10-17 12:18:23 +00:00 · 2012-04-20 14:36:47 +02:00 · 2012-04-20 14:36:47 +02:00 · 63674f4b52
commit 63674f4b52
parent 8e6e0fdb7f
3 changed files with 18 additions and 2 deletions
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -825,6 +825,16 @@ class TestDetectEncoding(TestCase):
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

+    def test_syntaxerror_latin1(self):
+        # Issue 14629: detect_encoding() must raise SyntaxError when the first
+        # line(s) contain non-UTF-8 bytes and no encoding is declared.
+        lines = (
+            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
+            )
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline)
+
+
    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")