gh-119118: Fix performance regression in tokenize module (#119615)

* gh-119118: Fix performance regression in tokenize module

- Cache the line object so that a new Unicode object is not created for
  every token on the same line (a sketch of the idea follows this list).
- Speed up byte offset to column offset conversion by measuring the
  difference over the smallest possible buffer (sketched after the diff
  below).
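
A minimal sketch of the line-caching idea, written against the CPython C
API. The struct and names here (LineCache, get_line) are illustrative
assumptions, not the identifiers used in the commit:

#include <Python.h>

/* Hypothetical cache, for illustration only: remember the Unicode
 * object for the most recently materialized line so that consecutive
 * tokens on that line share it instead of each decoding a new one. */
typedef struct {
    PyObject *last_line;     /* cached Unicode object, or NULL */
    Py_ssize_t last_lineno;  /* line number the cache is valid for */
} LineCache;

static PyObject *
get_line(LineCache *cache, Py_ssize_t lineno,
         const char *start, Py_ssize_t size)
{
    if (cache->last_line != NULL && cache->last_lineno == lineno) {
        /* Cache hit: hand out a new reference to the same object. */
        return Py_NewRef(cache->last_line);
    }
    PyObject *line = PyUnicode_DecodeUTF8(start, size, "replace");
    if (line == NULL) {
        return NULL;
    }
    /* Replace the cached line, keeping one reference for the cache. */
    Py_XSETREF(cache->last_line, Py_NewRef(line));
    cache->last_lineno = lineno;
    return line;
}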

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Lysandros Nikolaou authored on 2024-05-28 21:17:49 +02:00, committed by GitHub
Parent: ae9140f32a
Commit: d87b015106
4 changed files with 68 additions and 4 deletions


@@ -148,6 +148,7 @@ int _PyPegen_fill_token(Parser *p);
 expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
+Py_ssize_t _PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
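
The new declaration takes both a start and an end byte offset, so the
conversion can scan only the bytes between the two offsets instead of
the whole line. A hedged sketch of that technique in plain C (not the
commit's actual implementation): counting UTF-8 code points in a byte
span reduces to counting the bytes that are not continuation bytes.

#include <stddef.h>

/* Sketch, not the commit's code: count the UTF-8 code points between
 * two byte offsets by walking only that span. Bytes of the form
 * 10xxxxxx are continuation bytes and never start a character, so
 * every other byte marks the start of exactly one code point. */
static ptrdiff_t
utf8_span_char_count(const unsigned char *line,
                     ptrdiff_t col_offset, ptrdiff_t end_col_offset)
{
    ptrdiff_t chars = 0;
    for (ptrdiff_t i = col_offset; i < end_col_offset; i++) {
        if ((line[i] & 0xC0) != 0x80) {
            chars++;
        }
    }
    return chars;
}

Scanning only the slice between consecutive token boundaries keeps the
work proportional to the token's length rather than the line's length.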