mirror of
https://github.com/python/cpython.git
synced 2025-09-27 10:50:04 +00:00
gh-102856: Python tokenizer implementation for PEP 701 (#104323)
This commit replaces the Python implementation of the tokenize module with an implementation that reuses the real C tokenizer via a private extension module. The tokenize module now implements a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward compatibility.

As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer that is currently only used via the extension module that exposes it to the Python layer. This new mode forces the C tokenizer to emit these extra tokens and add the appropriate metadata needed to match the old Python implementation.

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
This commit is contained in:
parent 3ed57e4995
commit 6715f91edc

22 changed files with 424 additions and 374 deletions
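For context, the "extra tokens" the message refers to are the ones only the old pure-Python tokenizer produced, chiefly COMMENT and NL (non-semantic newlines). A minimal sketch of the behavior the compatibility layer has to preserve, using only the public tokenize API:

```python
import io
import tokenize

source = "x = 1  # a comment\n\ny = 2\n"

# The token stream must keep COMMENT and NL entries alongside the
# semantic tokens (NAME, OP, NUMBER, NEWLINE, ...), exactly as the
# old pure-Python tokenizer emitted them.
for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    print(tokenize.tok_name[tok.type], repr(tok.string))
```

The new tok_extra_tokens mode added below is what lets the C tokenizer produce this same stream.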
@@ -111,6 +111,8 @@ tok_new(void)
     tok->interactive_underflow = IUNDERFLOW_NORMAL;
     tok->str = NULL;
     tok->report_warnings = 1;
+    tok->tok_extra_tokens = 0;
+    tok->comment_newline = 0;
     tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
     tok->tok_mode_stack_index = 0;
     tok->tok_report_warnings = 1;
@@ -980,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok)
     PyMem_Free(tok);
 }
 
+void
+_PyToken_Free(struct token *token) {
+    Py_XDECREF(token->metadata);
+}
+
+void
+_PyToken_Init(struct token *token) {
+    token->metadata = NULL;
+}
+
 static int
 tok_readline_raw(struct tok_state *tok)
 {
@@ -1636,6 +1648,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
     return type;
 }
 
+
 static int
 tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
 {
@@ -1649,6 +1662,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     tok->starting_col_offset = -1;
     blankline = 0;
 
+
     /* Get indentation level */
     if (tok->atbol) {
         int col = 0;
@@ -1749,12 +1763,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     tok->starting_col_offset = tok->col_offset;
 
     /* Return pending indents/dedents */
     if (tok->pendin != 0) {
         if (tok->pendin < 0) {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->cur;
+                p_end = tok->cur;
+            }
             tok->pendin++;
             return MAKE_TOKEN(DEDENT);
         }
         else {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->buf;
+                p_end = tok->cur;
+            }
             tok->pendin--;
             return MAKE_TOKEN(INDENT);
         }
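The p_start/p_end assignments above determine the text and span that INDENT and DEDENT tokens report in extra-tokens mode: INDENT covers the leading whitespace from the start of the line buffer, while DEDENT is zero-width at the current position. A quick check against the Python-level tokenizer:

```python
import io
import tokenize

source = "if x:\n    pass\n"

for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    if tok.type in (tokenize.INDENT, tokenize.DEDENT):
        # INDENT carries the whitespace prefix ('    '); DEDENT has an
        # empty string with identical start and end coordinates.
        print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)
```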
@@ -1803,13 +1825,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
         }
 
-        const char *prefix, *p, *type_start;
+        const char* p = NULL;
+        const char *prefix, *type_start;
         int current_starting_col_offset;
 
         while (c != EOF && c != '\n') {
             c = tok_nextc(tok);
         }
 
+        if (tok->tok_extra_tokens) {
+            p = tok->start;
+        }
+
         if (tok->type_comments) {
             p = tok->start;
             current_starting_col_offset = tok->starting_col_offset;
@@ -1864,6 +1891,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                 }
             }
         }
+        if (tok->tok_extra_tokens) {
+            tok_backup(tok, c); /* don't eat the newline or EOF */
+            p_start = p;
+            p_end = tok->cur;
+            tok->comment_newline = blankline;
+            return MAKE_TOKEN(COMMENT);
+        }
     }
 
     if (tok->done == E_INTERACT_STOP) {
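Note the tok_backup call above: the newline is deliberately left in the buffer so the COMMENT token ends before it, and comment_newline records that the newline which follows should be downgraded to NL. The resulting Python-level stream for a comment-only line:

```python
import io
import tokenize

# A comment-only line yields a COMMENT token whose span stops before
# the newline, followed by a separate non-semantic NL token.
for tok in list(tokenize.generate_tokens(io.StringIO("# hi\n").readline))[:2]:
    print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)
# COMMENT '# hi' (1, 0) (1, 4)
# NL '\n' (1, 4) (1, 5)
```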
@@ -1949,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
 
         struct tok_state ahead_tok;
         struct token ahead_token;
+        _PyToken_Init(&ahead_token);
         int ahead_tok_kind;
 
         memcpy(&ahead_tok, tok, sizeof(ahead_tok));
@@ -1964,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                    returning a plain NAME token, return ASYNC. */
                 tok->async_def_indent = tok->indent;
                 tok->async_def = 1;
+                _PyToken_Free(&ahead_token);
                 return MAKE_TOKEN(ASYNC);
             }
+            _PyToken_Free(&ahead_token);
         }
     }
 
@@ -1976,8 +2013,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     if (c == '\n') {
         tok->atbol = 1;
         if (blankline || tok->level > 0) {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->start;
+                p_end = tok->cur;
+                return MAKE_TOKEN(NL);
+            }
             goto nextline;
         }
+        if (tok->comment_newline && tok->tok_extra_tokens) {
+            tok->comment_newline = 0;
+            p_start = tok->start;
+            p_end = tok->cur;
+            return MAKE_TOKEN(NL);
+        }
         p_start = tok->start;
         p_end = tok->cur - 1; /* Leave '\n' out of the string */
         tok->cont_line = 0;
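These branches encode the NEWLINE/NL distinction: a newline on a blank line or inside brackets (tok->level > 0), or one following a comment-only line (comment_newline), is emitted as a non-semantic NL, while a newline ending a logical line remains NEWLINE. Illustrated with the public module:

```python
import io
import tokenize

# Line 1 ends inside parentheses -> NL; line 2 ends the statement ->
# NEWLINE; line 3 is blank -> NL; line 4 ends a statement -> NEWLINE.
source = "x = (1 +\n     2)\n\ny = 3\n"

for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    if tok.type in (tokenize.NEWLINE, tokenize.NL):
        print(tok.start[0], tokenize.tok_name[tok.type])
```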
@@ -2563,6 +2611,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
 
 f_string_middle:
 
+    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
+    // this.
+    tok->multi_line_start = tok->line_start;
     while (end_quote_size != current_tok->f_string_quote_size) {
         int c = tok_nextc(tok);
         if (tok->done == E_ERROR) {
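f_string_middle is part of the PEP 701 tokenization that this commit exposes to Python code: on 3.12+, the tokenize module reports an f-string as FSTRING_START / FSTRING_MIDDLE / FSTRING_END tokens, with the replacement field tokenized normally in between. A small demonstration (requires Python 3.12 or newer; output abbreviated):

```python
import io
import tokenize

# f-strings are no longer opaque STRING tokens on 3.12+.
for tok in tokenize.generate_tokens(io.StringIO('f"hello {name}!"\n').readline):
    print(tokenize.tok_name[tok.type], repr(tok.string))
# Roughly: FSTRING_START 'f"', FSTRING_MIDDLE 'hello ', OP '{',
# NAME 'name', OP '}', FSTRING_MIDDLE '!', FSTRING_END '"', ...
```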
@@ -2788,7 +2839,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
     // if fetching the encoding shows a warning.
     tok->report_warnings = 0;
     while (tok->lineno < 2 && tok->done == E_OK) {
+        _PyToken_Init(&token);
         _PyTokenizer_Get(tok, &token);
+        _PyToken_Free(&token);
     }
     fclose(fp);
     if (tok->encoding) {
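_PyTokenizer_FindEncodingFilename only needs to tokenize the first two lines to find a PEP 263 coding cookie, so each loop iteration now initializes and frees the token's metadata. The Python-level counterpart of this routine is tokenize.detect_encoding:

```python
import io
import tokenize

# detect_encoding reads at most two lines, looking for a coding
# cookie or a UTF-8 BOM, and defaults to 'utf-8'.
data = b"# -*- coding: utf-8 -*-\nx = 1\n"
encoding, consumed = tokenize.detect_encoding(io.BytesIO(data).readline)
print(encoding)  # 'utf-8'
print(consumed)  # the raw byte lines read while sniffing
```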