gh-102856: Python tokenizer implementation for PEP 701 (#104323)

This commit replaces the Python implementation of the tokenize module with an implementation
that reuses the real C tokenizer via a private extension module. The tokenize module now implements
a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward
compatibility.

As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer that currently is only used via
the extension module that exposes it to the Python layer. This new mode forces the C tokenizer to emit these new extra tokens and add the appropriate metadata that is needed to match the old Python implementation.

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
This commit is contained in:
Marta Gómez Macías 2023-05-21 02:03:02 +02:00 committed by GitHub
parent 3ed57e4995
commit 6715f91edc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 424 additions and 374 deletions

View file

@ -111,6 +111,8 @@ tok_new(void)
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
tok->report_warnings = 1;
tok->tok_extra_tokens = 0;
tok->comment_newline = 0;
tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
tok->tok_mode_stack_index = 0;
tok->tok_report_warnings = 1;
@ -980,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok)
PyMem_Free(tok);
}
/* Release the token's owned reference to its metadata.
   Py_XDECREF is a no-op when metadata is NULL, so this is safe to call
   on a token initialized by _PyToken_Init that never acquired metadata. */
void
_PyToken_Free(struct token *token) {
Py_XDECREF(token->metadata);
}
/* Initialize a freshly declared token: clear the metadata pointer so a
   later _PyToken_Free is always safe, even if no metadata is ever set. */
void
_PyToken_Init(struct token *token) {
token->metadata = NULL;
}
static int
tok_readline_raw(struct tok_state *tok)
{
@ -1636,6 +1648,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
return type;
}
static int
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
@ -1649,6 +1662,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = -1;
blankline = 0;
/* Get indentation level */
if (tok->atbol) {
int col = 0;
@ -1749,12 +1763,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = tok->col_offset;
/* Return pending indents/dedents */
if (tok->pendin != 0) {
if (tok->pendin != 0) {
if (tok->pendin < 0) {
if (tok->tok_extra_tokens) {
p_start = tok->cur;
p_end = tok->cur;
}
tok->pendin++;
return MAKE_TOKEN(DEDENT);
}
else {
if (tok->tok_extra_tokens) {
p_start = tok->buf;
p_end = tok->cur;
}
tok->pendin--;
return MAKE_TOKEN(INDENT);
}
@ -1803,13 +1825,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
}
const char *prefix, *p, *type_start;
const char* p = NULL;
const char *prefix, *type_start;
int current_starting_col_offset;
while (c != EOF && c != '\n') {
c = tok_nextc(tok);
}
if (tok->tok_extra_tokens) {
p = tok->start;
}
if (tok->type_comments) {
p = tok->start;
current_starting_col_offset = tok->starting_col_offset;
@ -1864,6 +1891,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
}
if (tok->tok_extra_tokens) {
tok_backup(tok, c); /* don't eat the newline or EOF */
p_start = p;
p_end = tok->cur;
tok->comment_newline = blankline;
return MAKE_TOKEN(COMMENT);
}
}
if (tok->done == E_INTERACT_STOP) {
@ -1949,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
struct tok_state ahead_tok;
struct token ahead_token;
_PyToken_Init(&ahead_token);
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
@ -1964,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
_PyToken_Free(&ahead_token);
return MAKE_TOKEN(ASYNC);
}
_PyToken_Free(&ahead_token);
}
}
@ -1976,8 +2013,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
if (c == '\n') {
tok->atbol = 1;
if (blankline || tok->level > 0) {
if (tok->tok_extra_tokens) {
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(NL);
}
goto nextline;
}
if (tok->comment_newline && tok->tok_extra_tokens) {
tok->comment_newline = 0;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(NL);
}
p_start = tok->start;
p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
@ -2563,6 +2611,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
f_string_middle:
// TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
// this.
tok->multi_line_start = tok->line_start;
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
if (tok->done == E_ERROR) {
@ -2788,7 +2839,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
// if fetching the encoding shows a warning.
tok->report_warnings = 0;
while (tok->lineno < 2 && tok->done == E_OK) {
_PyToken_Init(&token);
_PyTokenizer_Get(tok, &token);
_PyToken_Free(&token);
}
fclose(fp);
if (tok->encoding) {