gh-102856: Python tokenizer implementation for PEP 701 (#104323)

This commit replaces the Python implementation of the tokenize module with an implementation
that reuses the real C tokenizer via a private extension module. The tokenize module now implements
a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward
compatibility.

As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer that currently is only used via
the extension module that exposes it to the Python layer. This new mode forces the C tokenizer to emit these new extra tokens and add the appropriate metadata that is needed to match the old Python implementation.

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
This commit is contained in:
Marta Gómez Macías 2023-05-21 02:03:02 +02:00 committed by GitHub
parent 3ed57e4995
commit 6715f91edc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 424 additions and 374 deletions

View file

@ -111,6 +111,8 @@ tok_new(void)
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
tok->report_warnings = 1;
tok->tok_extra_tokens = 0;
tok->comment_newline = 0;
tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
tok->tok_mode_stack_index = 0;
tok->tok_report_warnings = 1;
@ -980,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok)
PyMem_Free(tok);
}
/* Release the token's owned reference to its metadata.
   Py_XDECREF is a no-op when metadata is NULL, so this is safe to call
   on a token initialized by _PyToken_Init that never acquired metadata. */
void
_PyToken_Free(struct token *token) {
Py_XDECREF(token->metadata);
}
/* Initialize a freshly declared token: clear the metadata pointer so a
   later _PyToken_Free is always safe, even if no metadata is ever set. */
void
_PyToken_Init(struct token *token) {
token->metadata = NULL;
}
static int
tok_readline_raw(struct tok_state *tok)
{
@ -1636,6 +1648,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
return type;
}
static int
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
@ -1649,6 +1662,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = -1;
blankline = 0;
/* Get indentation level */
if (tok->atbol) {
int col = 0;
@ -1749,12 +1763,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
tok->starting_col_offset = tok->col_offset;
/* Return pending indents/dedents */
if (tok->pendin != 0) {
if (tok->pendin != 0) {
if (tok->pendin < 0) {
if (tok->tok_extra_tokens) {
p_start = tok->cur;
p_end = tok->cur;
}
tok->pendin++;
return MAKE_TOKEN(DEDENT);
}
else {
if (tok->tok_extra_tokens) {
p_start = tok->buf;
p_end = tok->cur;
}
tok->pendin--;
return MAKE_TOKEN(INDENT);
}
@ -1803,13 +1825,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
}
const char *prefix, *p, *type_start;
const char* p = NULL;
const char *prefix, *type_start;
int current_starting_col_offset;
while (c != EOF && c != '\n') {
c = tok_nextc(tok);
}
if (tok->tok_extra_tokens) {
p = tok->start;
}
if (tok->type_comments) {
p = tok->start;
current_starting_col_offset = tok->starting_col_offset;
@ -1864,6 +1891,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
}
if (tok->tok_extra_tokens) {
tok_backup(tok, c); /* don't eat the newline or EOF */
p_start = p;
p_end = tok->cur;
tok->comment_newline = blankline;
return MAKE_TOKEN(COMMENT);
}
}
if (tok->done == E_INTERACT_STOP) {
@ -1949,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
struct tok_state ahead_tok;
struct token ahead_token;
_PyToken_Init(&ahead_token);
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
@ -1964,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
_PyToken_Free(&ahead_token);
return MAKE_TOKEN(ASYNC);
}
_PyToken_Free(&ahead_token);
}
}
@ -1976,8 +2013,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
if (c == '\n') {
tok->atbol = 1;
if (blankline || tok->level > 0) {
if (tok->tok_extra_tokens) {
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(NL);
}
goto nextline;
}
if (tok->comment_newline && tok->tok_extra_tokens) {
tok->comment_newline = 0;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(NL);
}
p_start = tok->start;
p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
@ -2563,6 +2611,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
f_string_middle:
// TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
// this.
tok->multi_line_start = tok->line_start;
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
if (tok->done == E_ERROR) {
@ -2788,7 +2839,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
// if fetching the encoding shows a warning.
tok->report_warnings = 0;
while (tok->lineno < 2 && tok->done == E_OK) {
_PyToken_Init(&token);
_PyTokenizer_Get(tok, &token);
_PyToken_Free(&token);
}
fclose(fp);
if (tok->encoding) {