Mirror of https://github.com/python/cpython.git, synced 2025-10-17 04:08:28 +00:00
gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory.
* The wrappers, one per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include the logic for creating a lexer instance and managing the buffer for each mode.

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
parent eb50cd37ea · commit 01481f2dc1
29 changed files with 3185 additions and 2988 deletions
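As a rough sketch of the split described in the commit message: a per-mode wrapper only creates a lexer state and sets up its buffer, while the lexeme-producing logic stays behind the lexer interface. The names below (lexer_state, make_lexer_from_string) are illustrative only, not the actual CPython API:

/* Illustrative sketch only -- not the CPython API. */
#include <stdlib.h>
#include <string.h>

struct lexer_state {                 /* would live under the lexer directory */
    char *buf, *cur, *inp, *end;     /* scanning buffer and cursors */
};

/* String-mode wrapper: create a lexer instance and hand it a buffer
   holding the whole source (other modes would fill the buffer lazily). */
struct lexer_state *
make_lexer_from_string(const char *src)
{
    struct lexer_state *tok = malloc(sizeof(*tok));
    if (tok == NULL) {
        return NULL;
    }
    size_t len = strlen(src);
    tok->buf = malloc(len + 1);
    if (tok->buf == NULL) {
        free(tok);
        return NULL;
    }
    memcpy(tok->buf, src, len + 1);
    tok->cur = tok->buf;             /* next char the lexer will consume */
    tok->inp = tok->buf + len;       /* end of valid input so far */
    tok->end = tok->buf + len + 1;   /* end of the allocation */
    return tok;
}

In the commit itself there is one such wrapper per input mode: file, string, utf-8, and readline.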
Parser/lexer/buffer.c (new file, 76 lines)
@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"

#include "state.h"

/* Traverse and remember all f-string buffers, in order to be able to restore
   them after reallocating tok->buf */
void
_PyLexer_remember_fstring_buffers(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;

    for (index = tok->tok_mode_stack_index; index >= 0; --index) {
        mode = &(tok->tok_mode_stack[index]);
        mode->f_string_start_offset = mode->f_string_start - tok->buf;
        mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
    }
}

/* Traverse and restore all f-string buffers after reallocating tok->buf */
void
_PyLexer_restore_fstring_buffers(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;

    for (index = tok->tok_mode_stack_index; index >= 0; --index) {
        mode = &(tok->tok_mode_stack[index]);
        mode->f_string_start = tok->buf + mode->f_string_start_offset;
        mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
    }
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline. tok->decoding_buffer has the overflow.
        In this case, tok_readline_recode is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to tok_reserve_buf.
*/
int
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        _PyLexer_remember_fstring_buffers(tok);
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
        _PyLexer_restore_fstring_buffers(tok);
    }
    return 1;
}
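The core trick in _PyLexer_tok_reserve_buf is that PyMem_Realloc may move the block, so every pointer into tok->buf (cur, inp, start, the f-string starts) is first flattened to an offset and then rebased against the new base address. A minimal standalone sketch of that same grow-and-rescue pattern, with illustrative names (growbuf, growbuf_reserve) that are not part of CPython:

/* Standalone sketch: save interior pointers as offsets, realloc, rebase. */
#include <stdio.h>
#include <stdlib.h>

struct growbuf {
    char *buf, *cur, *end;   /* cur points into buf; end is one past the allocation */
};

/* Ensure at least `need` free bytes after cur; returns 1 on success, 0 on OOM. */
static int
growbuf_reserve(struct growbuf *g, size_t need)
{
    size_t used = g->cur - g->buf;       /* interior pointer as an offset */
    size_t cap  = g->end - g->buf;
    if (cap - used >= need) {
        return 1;                        /* enough room already */
    }
    size_t grow = cap / 2 > need ? cap / 2 : need;   /* grow by at least 50% */
    size_t newcap = cap + grow;
    char *newbuf = realloc(g->buf, newcap);          /* may move the block */
    if (newbuf == NULL) {
        return 0;                        /* old buffer is still valid */
    }
    g->buf = newbuf;                     /* rebase every pointer from its offset */
    g->cur = newbuf + used;
    g->end = newbuf + newcap;
    return 1;
}

int
main(void)
{
    struct growbuf g;
    g.buf = malloc(8);
    if (g.buf == NULL) {
        return 1;
    }
    g.cur = g.buf;
    g.end = g.buf + 8;
    for (int i = 0; i < 100; i++) {      /* append well past the initial capacity */
        if (!growbuf_reserve(&g, 1)) {
            return 1;
        }
        *g.cur++ = 'x';
    }
    printf("%zu bytes written\n", (size_t)(g.cur - g.buf));
    free(g.buf);
    return 0;
}

The real function additionally encodes pointers that may legitimately be NULL (tok->start, tok->line_start) as a -1 offset so they survive the round trip, and delegates the f-string pointers to the remember/restore helpers above.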