gh-104976: Ensure trailing dedent tokens are emitted as the previous tokenizer (#104980)

Signed-off-by: Pablo Galindo <pablogsal@gmail.com>
Author: Pablo Galindo Salgado
Date: 2023-05-26 22:02:26 +01:00 (committed by GitHub)
parent 402ee5a68b
commit 46b52e6e2b
4 changed files with 34 additions and 15 deletions
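In terms of the public tokenize module, the behaviour this change preserves is that a
source fragment ending while indentation is still open yields its trailing DEDENT
token(s) before ENDMARKER, exactly as the previous pure-Python tokenizer did. A minimal
sketch of that expectation (illustrative only; it uses only documented tokenize/token
APIs, and the exact token sequence depends on the input):

    import io
    import tokenize

    # Source ends while an indented block is still open.
    source = "if x:\n    pass\n"

    tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
    print([tokenize.tok_name[t.type] for t in tokens])
    # Expected tail: [..., 'NEWLINE', 'DEDENT', 'ENDMARKER'] -- the trailing
    # DEDENT is emitted before ENDMARKER, not dropped.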

Python/Python-tokenize.c

@@ -30,6 +30,7 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
 typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
+    int done;
 } tokenizeriterobject;
 
 /*[clinic input]
@@ -63,6 +64,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (extra_tokens) {
         self->tok->tok_extra_tokens = 1;
     }
+    self->done = 0;
     return (PyObject *)self;
 }
@@ -179,8 +181,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
         goto exit;
     }
-    if (type == ERRORTOKEN || type == ENDMARKER) {
+    if (it->done || type == ERRORTOKEN) {
         PyErr_SetString(PyExc_StopIteration, "EOF");
+        it->done = 1;
         goto exit;
     }
     PyObject *str = NULL;
@@ -194,9 +197,19 @@ tokenizeriter_next(tokenizeriterobject *it)
         goto exit;
     }
 
+    int is_trailing_token = 0;
+    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+        is_trailing_token = 1;
+    }
+
     const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
-    Py_ssize_t size = it->tok->inp - line_start;
-    PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+    PyObject* line = NULL;
+    if (it->tok->tok_extra_tokens && is_trailing_token) {
+        line = PyUnicode_FromString("");
+    } else {
+        Py_ssize_t size = it->tok->inp - line_start;
+        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+    }
     if (line == NULL) {
         Py_DECREF(str);
         goto exit;
@@ -214,6 +227,10 @@ tokenizeriter_next(tokenizeriterobject *it)
     }
 
     if (it->tok->tok_extra_tokens) {
+        if (is_trailing_token) {
+            lineno = end_lineno = lineno + 1;
+            col_offset = end_col_offset = 0;
+        }
         // Necessary adjustments to match the original Python tokenize
         // implementation
         if (type > DEDENT && type < OP) {
@@ -231,6 +248,9 @@ tokenizeriter_next(tokenizeriterobject *it)
     result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
+    if (type == ENDMARKER) {
+        it->done = 1;
+    }
     return result;
 }
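The termination logic relies on a small deferred-stop pattern: instead of raising
StopIteration as soon as ENDMARKER is produced, the iterator records it in the new
done field and only raises on the following call, so ENDMARKER itself (and any
trailing DEDENT tokens produced while the underlying tokenizer reports E_EOF) still
reach the caller. A minimal Python sketch of the same pattern (not CPython code;
names are illustrative):

    class TokenIter:
        """Yield pre-made tokens, stopping one step after ENDMARKER."""

        def __init__(self, tokens):
            self._tokens = iter(tokens)
            self._done = False

        def __iter__(self):
            return self

        def __next__(self):
            if self._done:
                raise StopIteration("EOF")
            tok = next(self._tokens)
            if tok == "ENDMARKER":
                # Defer StopIteration to the next call so ENDMARKER is delivered.
                self._done = True
            return tok

    print(list(TokenIter(["NAME", "NEWLINE", "DEDENT", "ENDMARKER"])))
    # ['NAME', 'NEWLINE', 'DEDENT', 'ENDMARKER']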