gh-112943: Correctly compute end offsets for multiline tokens in the tokenize module (#112949)

2025-09-26 10:19:53 +00:00 · 2023-12-11 11:44:22 +00:00 · 2023-12-11 11:44:22 +00:00 · a135a6d2c6
commit a135a6d2c6
parent 4c5b9c107a
5 changed files with 25 additions and 6 deletions
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -19,12 +19,8 @@ _PyPegen_interactive_exit(Parser *p)
 }

 Py_ssize_t
-_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
 {
-    const char *str = PyUnicode_AsUTF8(line);
-    if (!str) {
-        return -1;
-    }
    Py_ssize_t len = strlen(str);
    if (col_offset > len + 1) {
        col_offset = len + 1;
@@ -39,6 +35,16 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
    return size;
 }

+Py_ssize_t
+_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+{
+    const char *str = PyUnicode_AsUTF8(line);  // C-string view of `line`; NULL when the conversion fails
+    if (!str) {
+        return -1;  // report the failed conversion to the caller as -1
+    }
+    return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);  // delegate to the const char * variant
+}
+
 // Here, mark is the start of the node, while p->mark is the end.
 // If node==NULL, they should be the same.
 int