Mirror of https://github.com/python/cpython.git, synced 2025-08-31 22:18:28 +00:00
[3.12] gh-105390: Correctly raise TokenError instead of SyntaxError for tokenize errors (GH-105399) (#105439)
This commit is contained in:
parent c607551baf
commit c84d4d165d
6 changed files with 35 additions and 24 deletions
Doc/library/tokenize.rst

@@ -139,11 +139,6 @@ function it uses to do this is available:
     2,
     3
 
-Note that unclosed single-quoted strings do not cause an error to be
-raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
-tokenization of their contents.
-
-
 .. _tokenize-cli:
 
 Command-Line Usage
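The removed paragraph documented the old behavior; after this change an unclosed single-quoted string aborts tokenization instead. A minimal sketch of the new behavior, assuming Python 3.12 with this patch applied (illustrative only, not part of the commit):

    import io
    import tokenize

    source = 'greeting = "an unclosed single-quoted string'
    try:
        # The error is raised lazily, while the generator is consumed.
        list(tokenize.generate_tokens(io.StringIO(source).readline))
    except tokenize.TokenError as exc:
        print("TokenError:", exc)

Before this change the same input produced an ERRORTOKEN followed by the tokenization of the string's contents, as the removed note explains.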
Doc/whatsnew/3.12.rst

@@ -1489,14 +1489,15 @@ Changes in the Python API
 Additionally, there may be some minor behavioral changes as a consecuence of the
 changes required to support :pep:`701`. Some of these changes include:
 
-* Some final ``DEDENT`` tokens are now emitted within the bounds of the
-  input. This means that for a file containing 3 lines, the old version of the
-  tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
-  the token in line 3.
-
 * The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
   characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.
 
+* Incomplete single-line strings now also raise :exc:`tokenize.TokenError` as incomplete
+  multiline strings do.
+
+* Some incomplete or invalid Python code now raises :exc:`tokenize.TokenError` instead of
+  returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.
+
 Build Changes
 =============
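A short illustration of the second new bullet, assuming Python 3.12 with this change; it is a sketch, not code taken from the patch. Input that reaches end-of-file inside an unclosed bracket now surfaces as tokenize.TokenError rather than a stream of arbitrary ERRORTOKEN tokens:

    import io
    import tokenize

    incomplete = "values = [1, 2,"  # EOF arrives before the bracket closes
    try:
        for tok in tokenize.generate_tokens(io.StringIO(incomplete).readline):
            print(tok)
    except tokenize.TokenError as exc:
        print("TokenError:", exc)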
Lib/test/test_tokenize.py

@@ -3,7 +3,8 @@ from test.support import os_helper
 from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                       STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                       open as tokenize_open, Untokenizer, generate_tokens,
-                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
+                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
+                      TokenError)
 from io import BytesIO, StringIO
 import unittest
 from textwrap import dedent
@@ -286,7 +287,7 @@ def k(x):
         for lit in INVALID_UNDERSCORE_LITERALS:
             try:
                 number_token(lit)
-            except SyntaxError:
+            except TokenError:
                 continue
             self.assertNotEqual(number_token(lit), lit)
 
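For orientation only (not from the patch): a standalone sketch of what this updated test now tolerates under Python 3.12. Some invalid underscore literals make the tokenizer raise tokenize.TokenError, while others simply tokenize into unexpected pieces, so the snippet reports whichever outcome occurs instead of asserting one; the sample literals are hypothetical stand-ins for the test suite's INVALID_UNDERSCORE_LITERALS list.

    import io
    import tokenize

    for lit in ("1__0", "1_", "0b1_"):  # hypothetical samples, not the real list
        try:
            toks = list(tokenize.generate_tokens(io.StringIO(lit).readline))
        except tokenize.TokenError as exc:
            print(f"{lit!r}: TokenError: {exc}")
        else:
            print(f"{lit!r}: {[t.string for t in toks if t.string.strip()]}")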
@@ -1379,7 +1380,7 @@ class TestDetectEncoding(TestCase):
         self.assertEqual(found, "iso-8859-1")
 
     def test_syntaxerror_latin1(self):
-        # Issue 14629: need to raise SyntaxError if the first
+        # Issue 14629: need to raise TokenError if the first
         # line(s) have non-UTF-8 characters
         lines = (
             b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
@@ -2754,7 +2755,7 @@ async def f():
             "]",
             ]:
             with self.subTest(case=case):
-                self.assertRaises(SyntaxError, get_tokens, case)
+                self.assertRaises(TokenError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100
@@ -2773,7 +2774,7 @@ async def f():
 
         invalid = generate_source(MAXINDENT)
         the_input = StringIO(invalid)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
+        self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
Lib/tokenize.py

@@ -517,14 +517,30 @@ def main():
         perror("unexpected error: %s" % err)
         raise
 
+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
 def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     if encoding is None:
         it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
     else:
         it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
-    for info in it:
-        yield TokenInfo._make(info)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None
 
 
 if __name__ == "__main__":
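A usage sketch of the compatibility path added above, assuming Python 3.12 where the public generate_tokens() routes through _generate_tokens_from_c_tokenizer; the snippet is illustrative and not part of the commit. An unterminated triple-quoted string makes the C tokenizer raise SyntaxError, which the new try/except rewrites through _transform_msg and re-raises as tokenize.TokenError carrying a (lineno, offset) pair:

    import io
    import tokenize

    src = 'doc = """an unterminated triple-quoted string'
    try:
        list(tokenize.generate_tokens(io.StringIO(src).readline))
    except tokenize.TokenError as exc:
        # Per _transform_msg, the message should be the backwards-compatible
        # "EOF in multi-line string" form; args also carry the position.
        message, position = exc.args
        print(message, position)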
Misc NEWS entry (new file)

@@ -0,0 +1,3 @@
+Correctly raise :exc:`tokenize.TokenError` exceptions instead of
+:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
+Pablo Galindo
Python/Python-tokenize.c

@@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
             msg = "invalid token";
             break;
         case E_EOF:
-            if (tok->level > 0) {
-                PyErr_Format(PyExc_SyntaxError,
-                             "parenthesis '%c' was never closed",
-                             tok->parenstack[tok->level-1]);
-            } else {
-                PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
-            }
+            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+            PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
             return -1;
         case E_DEDENT:
             msg = "unindent does not match any outer indentation level";