gh-119118: Fix performance regression in tokenize module (#119615)

* gh-119118: Fix performance regression in tokenize module

- Cache the line object so that a new Unicode object is not created for
  every token on the same line (a sketch of the idea follows this list).
- Speed up byte offset to column offset conversion by measuring the
  difference over the smallest possible buffer (sketched after the diff
  below).
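
A minimal sketch of the line-caching idea, written against the CPython C
API. The struct and names here (LineCache, get_line) are illustrative
assumptions, not the identifiers used in the commit:

#include <Python.h>

/* Hypothetical cache, for illustration only: remember the Unicode
 * object for the most recently materialized line so that consecutive
 * tokens on that line share it instead of each decoding a new one. */
typedef struct {
    PyObject *last_line;     /* cached Unicode object, or NULL */
    Py_ssize_t last_lineno;  /* line number the cache is valid for */
} LineCache;

static PyObject *
get_line(LineCache *cache, Py_ssize_t lineno,
         const char *start, Py_ssize_t size)
{
    if (cache->last_line != NULL && cache->last_lineno == lineno) {
        /* Cache hit: hand out a new reference to the same object. */
        return Py_NewRef(cache->last_line);
    }
    PyObject *line = PyUnicode_DecodeUTF8(start, size, "replace");
    if (line == NULL) {
        return NULL;
    }
    /* Replace the cached line, keeping one reference for the cache. */
    Py_XSETREF(cache->last_line, Py_NewRef(line));
    cache->last_lineno = lineno;
    return line;
}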

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Lysandros Nikolaou authored on 2024-05-28 21:17:49 +02:00, committed by GitHub
Parent: ae9140f32a
Commit: d87b015106
4 changed files with 68 additions and 4 deletions


@@ -148,6 +148,7 @@ int _PyPegen_fill_token(Parser *p);
 expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
+Py_ssize_t _PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
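
The new declaration takes both a start and an end byte offset, so the
conversion can scan only the bytes between the two offsets instead of
the whole line. A hedged sketch of that technique in plain C (not the
commit's actual implementation): counting UTF-8 code points in a byte
span reduces to counting the bytes that are not continuation bytes.

#include <stddef.h>

/* Sketch, not the commit's code: count the UTF-8 code points between
 * two byte offsets by walking only that span. Bytes of the form
 * 10xxxxxx are continuation bytes and never start a character, so
 * every other byte marks the start of exactly one code point. */
static ptrdiff_t
utf8_span_char_count(const unsigned char *line,
                     ptrdiff_t col_offset, ptrdiff_t end_col_offset)
{
    ptrdiff_t chars = 0;
    for (ptrdiff_t i = col_offset; i < end_col_offset; i++) {
        if ((line[i] & 0xC0) != 0x80) {
            chars++;
        }
    }
    return chars;
}

Scanning only the slice between consecutive token boundaries keeps the
work proportional to the token's length rather than the line's length.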