gh-104976: Ensure trailing dedent tokens are emitted as the previous tokenizer (#104980)

Signed-off-by: Pablo Galindo <pablogsal@gmail.com>
This commit is contained in:
Pablo Galindo Salgado 2023-05-26 22:02:26 +01:00 committed by GitHub
parent 402ee5a68b
commit 46b52e6e2b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 34 additions and 15 deletions

View file

@@ -82,7 +82,7 @@ class TokenizeTest(TestCase):
NAME 'False' (4, 11) (4, 16)
COMMENT '# NEWLINE' (4, 17) (4, 26)
NEWLINE '\\n' (4, 26) (4, 27)
DEDENT '' (4, 27) (4, 27)
DEDENT '' (5, 0) (5, 0)
""")
indent_error_file = b"""\
def k(x):
@@ -755,8 +755,8 @@ def"', """\
NEWLINE '\\n' (2, 5) (2, 6)
INDENT ' \\t' (3, 0) (3, 9)
NAME 'pass' (3, 9) (3, 13)
DEDENT '' (3, 14) (3, 14)
DEDENT '' (3, 14) (3, 14)
DEDENT '' (4, 0) (4, 0)
DEDENT '' (4, 0) (4, 0)
""")
def test_non_ascii_identifiers(self):
@@ -968,7 +968,7 @@ async def foo():
NUMBER '1' (2, 17) (2, 18)
OP ':' (2, 18) (2, 19)
NAME 'pass' (2, 20) (2, 24)
DEDENT '' (2, 25) (2, 25)
DEDENT '' (3, 0) (3, 0)
""")
self.check_tokenize('''async def foo(async): await''', """\
@@ -1016,7 +1016,7 @@ def f():
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (6, 12) (6, 12)
DEDENT '' (7, 0) (7, 0)
""")
self.check_tokenize('''\
@@ -1054,7 +1054,7 @@ async def f():
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
DEDENT '' (6, 12) (6, 12)
DEDENT '' (7, 0) (7, 0)
""")
def test_newline_after_parenthesized_block_with_comment(self):
@@ -2680,7 +2680,8 @@ async def f():
valid = generate_source(MAXINDENT - 1)
tokens = list(_generate_tokens_from_c_tokenizer(valid))
self.assertEqual(tokens[-1].type, DEDENT)
self.assertEqual(tokens[-2].type, DEDENT)
self.assertEqual(tokens[-1].type, ENDMARKER)
compile(valid, "<string>", "exec")
invalid = generate_source(MAXINDENT)

View file

@@ -447,13 +447,8 @@ def tokenize(readline):
def _tokenize(rl_gen, encoding):
source = b"".join(rl_gen).decode(encoding)
token = None
for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
yield token
if token is not None:
last_line, _ = token.start
yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
def generate_tokens(readline):
"""Tokenize a source reading Python code as unicode strings.

View file

@@ -0,0 +1,3 @@
Ensure that trailing ``DEDENT`` :class:`tokenize.TokenInfo` objects emitted
by the :mod:`tokenize` module are reported as in Python 3.11. Patch by Pablo
Galindo.

View file

@@ -30,6 +30,7 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
typedef struct
{
PyObject_HEAD struct tok_state *tok;
int done;
} tokenizeriterobject;
/*[clinic input]
@@ -63,6 +64,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
if (extra_tokens) {
self->tok->tok_extra_tokens = 1;
}
self->done = 0;
return (PyObject *)self;
}
@@ -179,8 +181,9 @@ tokenizeriter_next(tokenizeriterobject *it)
}
goto exit;
}
if (type == ERRORTOKEN || type == ENDMARKER) {
if (it->done || type == ERRORTOKEN) {
PyErr_SetString(PyExc_StopIteration, "EOF");
it->done = 1;
goto exit;
}
PyObject *str = NULL;
@@ -194,9 +197,19 @@ tokenizeriter_next(tokenizeriterobject *it)
goto exit;
}
int is_trailing_token = 0;
if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
is_trailing_token = 1;
}
const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
Py_ssize_t size = it->tok->inp - line_start;
PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
PyObject* line = NULL;
if (it->tok->tok_extra_tokens && is_trailing_token) {
line = PyUnicode_FromString("");
} else {
Py_ssize_t size = it->tok->inp - line_start;
line = PyUnicode_DecodeUTF8(line_start, size, "replace");
}
if (line == NULL) {
Py_DECREF(str);
goto exit;
@@ -214,6 +227,10 @@ tokenizeriter_next(tokenizeriterobject *it)
}
if (it->tok->tok_extra_tokens) {
if (is_trailing_token) {
lineno = end_lineno = lineno + 1;
col_offset = end_col_offset = 0;
}
// Necessary adjustments to match the original Python tokenize
// implementation
if (type > DEDENT && type < OP) {
@@ -231,6 +248,9 @@ tokenizeriter_next(tokenizeriterobject *it)
result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
_PyToken_Free(&token);
if (type == ENDMARKER) {
it->done = 1;
}
return result;
}