gh-107450: Check for overflow in the tokenizer and fix overflow test (#110832)

Co-authored-by: Filipe Laíns <lains@riseup.net>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-11-20 02:50:14 +00:00 · 2023-10-16 16:42:49 +02:00 · 2023-10-16 16:42:49 +02:00 · a1ac5590e0
commit a1ac5590e0
parent b3c9faf056
4 changed files with 40 additions and 22 deletions
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -59,6 +59,10 @@ tok_nextc(struct tok_state *tok)
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
+            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
+                tok->done = E_COLUMNOVERFLOW;
+                return EOF;
+            }
            tok->col_offset++;
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -68,6 +68,7 @@ _Pypegen_tokenizer_error(Parser *p)
    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
+    p->error_indicator = 1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
@@ -103,6 +104,10 @@ _Pypegen_tokenizer_error(Parser *p)
            msg = "unexpected character after line continuation character";
            break;
        }
+        case E_COLUMNOVERFLOW:
+            PyErr_SetString(PyExc_OverflowError,
+                    "Parser column offset overflow - source line is too big");
+            return -1;
        default:
            msg = "unknown parsing error";
    }