Mirror of https://github.com/python/cpython.git, synced 2025-08-31 22:18:28 +00:00
[3.12] gh-105390: Correctly raise TokenError instead of SyntaxError for tokenize errors (GH-105399) (#105439)
This commit is contained in:
parent c607551baf
commit c84d4d165d
6 changed files with 35 additions and 24 deletions
Doc/library/tokenize.rst

@@ -139,11 +139,6 @@ function it uses to do this is available:
     2,
     3
 
-Note that unclosed single-quoted strings do not cause an error to be
-raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
-tokenization of their contents.
-
-
 .. _tokenize-cli:
 
 Command-Line Usage
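The removed paragraph documented the old behavior; after this change an unclosed single-quoted string aborts tokenization instead. A minimal sketch of the new behavior, assuming Python 3.12 with this patch applied (illustrative only, not part of the commit):

    import io
    import tokenize

    source = 'greeting = "an unclosed single-quoted string'
    try:
        # The error is raised lazily, while the generator is consumed.
        list(tokenize.generate_tokens(io.StringIO(source).readline))
    except tokenize.TokenError as exc:
        print("TokenError:", exc)

Before this change the same input produced an ERRORTOKEN followed by the tokenization of the string's contents, as the removed note explains.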
Doc/whatsnew/3.12.rst

@@ -1489,14 +1489,15 @@ Changes in the Python API
 Additionally, there may be some minor behavioral changes as a consecuence of the
 changes required to support :pep:`701`. Some of these changes include:
 
-* Some final ``DEDENT`` tokens are now emitted within the bounds of the
-  input. This means that for a file containing 3 lines, the old version of the
-  tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
-  the token in line 3.
-
 * The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
   characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.
 
+* Incomplete single-line strings now also raise :exc:`tokenize.TokenError` as incomplete
+  multiline strings do.
+
+* Some incomplete or invalid Python code now raises :exc:`tokenize.TokenError` instead of
+  returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.
+
 Build Changes
 =============
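A short illustration of the second new bullet, assuming Python 3.12 with this change; it is a sketch, not code taken from the patch. Input that reaches end-of-file inside an unclosed bracket now surfaces as tokenize.TokenError rather than a stream of arbitrary ERRORTOKEN tokens:

    import io
    import tokenize

    incomplete = "values = [1, 2,"  # EOF arrives before the bracket closes
    try:
        for tok in tokenize.generate_tokens(io.StringIO(incomplete).readline):
            print(tok)
    except tokenize.TokenError as exc:
        print("TokenError:", exc)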
Lib/test/test_tokenize.py

@@ -3,7 +3,8 @@ from test.support import os_helper
 from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                       STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                       open as tokenize_open, Untokenizer, generate_tokens,
-                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
+                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
+                      TokenError)
 from io import BytesIO, StringIO
 import unittest
 from textwrap import dedent
@@ -286,7 +287,7 @@ def k(x):
         for lit in INVALID_UNDERSCORE_LITERALS:
             try:
                 number_token(lit)
-            except SyntaxError:
+            except TokenError:
                 continue
             self.assertNotEqual(number_token(lit), lit)
 
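For orientation only (not from the patch): a standalone sketch of what this updated test now tolerates under Python 3.12. Some invalid underscore literals make the tokenizer raise tokenize.TokenError, while others simply tokenize into unexpected pieces, so the snippet reports whichever outcome occurs instead of asserting one; the sample literals are hypothetical stand-ins for the test suite's INVALID_UNDERSCORE_LITERALS list.

    import io
    import tokenize

    for lit in ("1__0", "1_", "0b1_"):  # hypothetical samples, not the real list
        try:
            toks = list(tokenize.generate_tokens(io.StringIO(lit).readline))
        except tokenize.TokenError as exc:
            print(f"{lit!r}: TokenError: {exc}")
        else:
            print(f"{lit!r}: {[t.string for t in toks if t.string.strip()]}")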
@@ -1379,7 +1380,7 @@ class TestDetectEncoding(TestCase):
         self.assertEqual(found, "iso-8859-1")
 
     def test_syntaxerror_latin1(self):
-        # Issue 14629: need to raise SyntaxError if the first
+        # Issue 14629: need to raise TokenError if the first
         # line(s) have non-UTF-8 characters
         lines = (
             b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
@@ -2754,7 +2755,7 @@ async def f():
             "]",
             ]:
             with self.subTest(case=case):
-                self.assertRaises(SyntaxError, get_tokens, case)
+                self.assertRaises(TokenError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100
@@ -2773,7 +2774,7 @@ async def f():
 
         invalid = generate_source(MAXINDENT)
         the_input = StringIO(invalid)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
+        self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
Lib/tokenize.py

@@ -517,14 +517,30 @@ def main():
         perror("unexpected error: %s" % err)
         raise
 
+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
 def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     if encoding is None:
         it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
     else:
         it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
-    for info in it:
-        yield TokenInfo._make(info)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None
 
 
 if __name__ == "__main__":
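A usage sketch of the compatibility path added above, assuming Python 3.12 where the public generate_tokens() routes through _generate_tokens_from_c_tokenizer; the snippet is illustrative and not part of the commit. An unterminated triple-quoted string makes the C tokenizer raise SyntaxError, which the new try/except rewrites through _transform_msg and re-raises as tokenize.TokenError carrying a (lineno, offset) pair:

    import io
    import tokenize

    src = 'doc = """an unterminated triple-quoted string'
    try:
        list(tokenize.generate_tokens(io.StringIO(src).readline))
    except tokenize.TokenError as exc:
        # Per _transform_msg, the message should be the backwards-compatible
        # "EOF in multi-line string" form; args also carry the position.
        message, position = exc.args
        print(message, position)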
Misc NEWS entry (new file)

@@ -0,0 +1,3 @@
+Correctly raise :exc:`tokenize.TokenError` exceptions instead of
+:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
+Pablo Galindo
Python/Python-tokenize.c

@@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
             msg = "invalid token";
             break;
         case E_EOF:
-            if (tok->level > 0) {
-                PyErr_Format(PyExc_SyntaxError,
-                             "parenthesis '%c' was never closed",
-                             tok->parenstack[tok->level-1]);
-            } else {
-                PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
-            }
+            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+            PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
             return -1;
         case E_DEDENT:
             msg = "unindent does not match any outer indentation level";