[3.12] gh-107450: Check for overflow in the tokenizer and fix overflow test (GH-110832) (#110931)

(cherry picked from commit a1ac5590e0)

Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Co-authored-by: Filipe Laíns <lains@riseup.net>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Author: Lysandros Nikolaou, 2023-10-16 18:59:18 +02:00 (committed by GitHub)
parent b8e5b1b28a
commit 3b87e520fc
4 changed files with 40 additions and 29 deletions

Include/errcode.h

@@ -4,7 +4,6 @@
 extern "C" {
 #endif
 
-
 /* Error codes passed around between file input, tokenizer, parser and
    interpreter. This is necessary so we can turn them into Python
    exceptions at a higher level. Note that some errors have a
@@ -13,24 +12,25 @@ extern "C" {
    the parser only returns E_EOF when it hits EOF immediately, and it
    never returns E_OK. */
 
 #define E_OK 10 /* No error */
 #define E_EOF 11 /* End Of File */
 #define E_INTR 12 /* Interrupted */
 #define E_TOKEN 13 /* Bad token */
 #define E_SYNTAX 14 /* Syntax error */
 #define E_NOMEM 15 /* Ran out of memory */
 #define E_DONE 16 /* Parsing complete */
 #define E_ERROR 17 /* Execution error */
 #define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
 #define E_OVERFLOW 19 /* Node had too many children */
 #define E_TOODEEP 20 /* Too many indentation levels */
 #define E_DEDENT 21 /* No matching outer block for dedent */
 #define E_DECODE 22 /* Error in decoding into Unicode */
 #define E_EOFS 23 /* EOF in triple-quoted string */
 #define E_EOLS 24 /* EOL in single-quoted string */
 #define E_LINECONT 25 /* Unexpected characters after a line continuation */
 #define E_BADSINGLE 27 /* Ill-formed single statement input */
 #define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
+#define E_COLUMNOVERFLOW 29 /* Column offset overflow */
 
 #ifdef __cplusplus
 }
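For orientation, here is a minimal standalone sketch, not CPython code, of the convention these codes support: the tokenizer records an E_* value in its done field and returns EOF, and the caller decides which Python exception that value maps to. E_COLUMNOVERFLOW, added above, is the code that the parser error path (see the Parser/pegen_errors.c hunks below) turns into an OverflowError. struct demo_tok_state and demo_nextc are hypothetical stand-ins written only for this illustration.

#include <stdio.h>

/* A small subset of the codes above, copied here so the demo compiles on its own. */
#define E_OK             10
#define E_COLUMNOVERFLOW 29

/* Hypothetical, simplified stand-ins for CPython's tok_state and tok_nextc(). */
struct demo_tok_state {
    int done;        /* one of the E_* codes */
    int col_offset;  /* current column on the line being tokenized */
};

static int
demo_nextc(struct demo_tok_state *tok)
{
    if (tok->col_offset >= 3) {          /* artificially tiny column limit */
        tok->done = E_COLUMNOVERFLOW;    /* record the reason ... */
        return EOF;                      /* ... and signal "stop" to the caller */
    }
    tok->col_offset++;
    return 'x';                          /* pretend we produced a character */
}

int
main(void)
{
    struct demo_tok_state tok = { E_OK, 0 };
    while (demo_nextc(&tok) != EOF) {
        /* consume characters */
    }
    /* The caller inspects `done` and picks the exception type; in the real
       parser E_COLUMNOVERFLOW becomes an OverflowError, not a SyntaxError. */
    printf("stopped with done = %d\n", tok.done);
    return 0;
}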

Lib/test/test_exceptions.py

@@ -18,6 +18,12 @@ from test.support.os_helper import TESTFN, unlink
 from test.support.warnings_helper import check_warnings
 from test import support
 
+try:
+    from _testcapi import INT_MAX
+except ImportError:
+    INT_MAX = 2**31 - 1
+
 
 class NaiveException(Exception):
     def __init__(self, x):
@@ -318,11 +324,13 @@ class ExceptionTests(unittest.TestCase):
         check('(yield i) = 2', 1, 2)
         check('def f(*):\n pass', 1, 7)
 
+    @unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
     @support.requires_resource('cpu')
-    @support.bigmemtest(support._2G, memuse=1.5)
-    def testMemoryErrorBigSource(self, _size):
-        with self.assertRaises(OverflowError):
-            exec(f"if True:\n {' ' * 2**31}print('hello world')")
+    @support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
+    def testMemoryErrorBigSource(self, size):
+        src = b"if True:\n%*s" % (size, b"pass")
+        with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
+            compile(src, '<fragment>', 'exec')
 
     @cpython_only
     def testSettingException(self):

Parser/pegen_errors.c

@@ -66,6 +66,7 @@ _Pypegen_tokenizer_error(Parser *p)
     const char *msg = NULL;
     PyObject* errtype = PyExc_SyntaxError;
     Py_ssize_t col_offset = -1;
+    p->error_indicator = 1;
     switch (p->tok->done) {
         case E_TOKEN:
             msg = "invalid token";
@@ -101,6 +102,10 @@ _Pypegen_tokenizer_error(Parser *p)
            msg = "unexpected character after line continuation character";
            break;
        }
+        case E_COLUMNOVERFLOW:
+            PyErr_SetString(PyExc_OverflowError,
+                            "Parser column offset overflow - source line is too big");
+            return -1;
        default:
            msg = "unknown parsing error";
    }
@@ -233,12 +238,6 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *err
            col_offset = 0;
        } else {
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
-            if (p->tok->cur - start > INT_MAX) {
-                PyErr_SetString(PyExc_OverflowError,
-                                "Parser column offset overflow - source line is too big");
-                p->error_indicator = 1;
-                return NULL;
-            }
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
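With the tokenizer now reporting E_COLUMNOVERFLOW before the column offset can reach INT_MAX, the explicit range check in _PyPegen_raise_error is dropped and the remaining narrowing goes through Py_SAFE_DOWNCAST. Below is a minimal sketch of what that narrowing amounts to, assuming the usual CPython semantics of the macro (an assert in debug builds, a plain cast otherwise); narrow_to_int is a hypothetical helper written only for this illustration.

#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical helper mirroring Py_SAFE_DOWNCAST(value, intptr_t, int) in a
   debug build: assert that the value fits in an int, then cast. */
static int
narrow_to_int(ptrdiff_t value)
{
    assert(INT_MIN <= value && value <= INT_MAX);
    return (int)value;
}

int
main(void)
{
    char line[] = "x = 1 +";
    const char *start = line;       /* start of the offending line */
    const char *cur = line + 7;     /* position the error should point at */

    /* After this change the tokenizer stops with E_COLUMNOVERFLOW before the
       column offset reaches INT_MAX, which is the premise that makes this
       narrowing safe without a separate check here. */
    int col_offset = narrow_to_int(cur - start);
    printf("col_offset = %d\n", col_offset);
    return 0;
}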

Parser/tokenizer.c

@@ -1366,6 +1366,10 @@ tok_nextc(struct tok_state *tok)
     int rc;
     for (;;) {
         if (tok->cur != tok->inp) {
+            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
+                tok->done = E_COLUMNOVERFLOW;
+                return EOF;
+            }
             tok->col_offset++;
             return Py_CHARMASK(*tok->cur++); /* Fast path */
         }
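The guard sits on the tokenizer's fast path, immediately before col_offset is incremented. The following standalone sketch shows the same pattern; advance_column is a hypothetical helper written only for this illustration, and the unsigned casts are kept to show their effect: a value that has somehow gone negative converts to a very large unsigned number and also refuses to advance.

#include <limits.h>
#include <stdio.h>

/* Hypothetical helper showing the guard pattern used in tok_nextc(): only
   advance the column counter when the increment cannot overflow an int. */
static int
advance_column(int *col_offset)
{
    /* The unsigned casts make a negative (already corrupted) value compare
       as a huge number, so it trips the check as well. */
    if ((unsigned int)*col_offset >= (unsigned int)INT_MAX) {
        return 0;   /* caller should stop and report E_COLUMNOVERFLOW */
    }
    (*col_offset)++;
    return 1;
}

int
main(void)
{
    int col = INT_MAX - 2;
    while (advance_column(&col)) {
        /* tokenize one more character on the same logical line */
    }
    printf("stopped at col = %d (INT_MAX = %d)\n", col, INT_MAX);

    int corrupted = -1;
    printf("advance on -1 allowed? %d\n", advance_column(&corrupted));
    return 0;
}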