[3.13] gh-116042: Fix location for SyntaxErrors of invalid escapes in the tokenizer (GH-116049) (#130066)

(cherry picked from commit 56eda25633) (cherry picked from commit 369704b428)
2025-07-19 17:25:54 +00:00 · 2025-02-13 01:49:25 +00:00 · 2025-02-13 01:49:25 +00:00 · 8d1d36b742
commit 8d1d36b742
parent 4c2a59b7b8
5 changed files with 82 additions and 18 deletions
--- a/Lib/test/test_cmd_line_script.py
+++ b/Lib/test/test_cmd_line_script.py
@ -660,7 +660,7 @@ class CmdLineTest(unittest.TestCase):
            self.assertEqual(
                stderr.splitlines()[-3:],
                [   b'    foo = """\\q"""',
-                    b'          ^^^^^^^^',
+                    b'             ^^',
                    b'SyntaxError: invalid escape sequence \'\\q\''
                ],
            )
--- a/Lib/test/test_string_literals.py
+++ b/Lib/test/test_string_literals.py
@ -118,7 +118,7 @@ class TestLiterals(unittest.TestCase):
        self.assertEqual(len(w), 1)
        self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'")
        self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('error', category=SyntaxWarning)
@ -128,7 +128,7 @@ class TestLiterals(unittest.TestCase):
        self.assertEqual(w, [])
        self.assertEqual(exc.msg, r"invalid escape sequence '\z'")
        self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
        self.assertEqual(exc.offset, 1)
        # Check that the warning is raised only once if there are syntax errors
@ -155,7 +155,7 @@ class TestLiterals(unittest.TestCase):
        self.assertEqual(str(w[0].message),
                         r"invalid octal escape sequence '\407'")
        self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('error', category=SyntaxWarning)
@ -165,9 +165,32 @@ class TestLiterals(unittest.TestCase):
        self.assertEqual(w, [])
        self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'")
        self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
        self.assertEqual(exc.offset, 1)
    def test_invalid_escape_locations_with_offset(self):
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('error', category=SyntaxWarning)
            with self.assertRaises(SyntaxError) as cm:
                eval("\"'''''''''''''''''''''invalid\ Escape\"")
            exc = cm.exception
        self.assertEqual(w, [])
        self.assertEqual(exc.msg, r"invalid escape sequence '\ '")
        self.assertEqual(exc.filename, '<string>')
        self.assertEqual(exc.lineno, 1)
        self.assertEqual(exc.offset, 30)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('error', category=SyntaxWarning)
            with self.assertRaises(SyntaxError) as cm:
                eval("\"''Incorrect \ logic?\"")
            exc = cm.exception
        self.assertEqual(w, [])
        self.assertEqual(exc.msg, r"invalid escape sequence '\ '")
        self.assertEqual(exc.filename, '<string>')
        self.assertEqual(exc.lineno, 1)
        self.assertEqual(exc.offset, 14)
    def test_eval_str_raw(self):
        self.assertEqual(eval(""" r'x' """), 'x')
        self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
@ -207,7 +230,7 @@ class TestLiterals(unittest.TestCase):
        self.assertEqual(len(w), 1)
        self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'")
        self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('error', category=SyntaxWarning)
@ -217,7 +240,7 @@ class TestLiterals(unittest.TestCase):
        self.assertEqual(w, [])
        self.assertEqual(exc.msg, r"invalid escape sequence '\z'")
        self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
    def test_eval_bytes_invalid_octal_escape(self):
        for i in range(0o400, 0o1000):
@ -231,7 +254,7 @@ class TestLiterals(unittest.TestCase):
        self.assertEqual(str(w[0].message),
                         r"invalid octal escape sequence '\407'")
        self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('error', category=SyntaxWarning)
@ -241,7 +264,7 @@ class TestLiterals(unittest.TestCase):
        self.assertEqual(w, [])
        self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'")
        self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
    def test_eval_bytes_raw(self):
        self.assertEqual(eval(""" br'x' """), b'x')
--- a/Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst
+++ b/Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst
@ -0,0 +1,2 @@
 Fix location for SyntaxErrors of invalid escapes in the tokenizer. Patch by
 Pablo Galindo.
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@ -352,8 +352,8 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
-            Py_ssize_t size = p->tok->inp - p->tok->buf;
+            Py_ssize_t size = p->tok->inp - p->tok->line_start;
-            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
+            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@ -11,7 +11,7 @@
 //// STRING HANDLING FUNCTIONS ////
 static int
-warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
+warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t)
 {
    if (p->call_invalid_rules) {
        // Do not report warnings if we are in the second pass of the parser
@ -41,8 +41,46 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
    else {
        category = PyExc_DeprecationWarning;
    }
    // Calculate the lineno and the col_offset of the invalid escape sequence
    const char *start = buffer;
    const char *end = first_invalid_escape;
    int lineno = t->lineno;
    int col_offset = t->col_offset;
    while (start < end) {
        if (*start == '\n') {
            lineno++;
            col_offset = 0;
        }
        else {
            col_offset++;
        }
        start++;
    }
    // Count the number of quotes in the token
    char first_quote = 0;
    if (lineno == t->lineno) {
        int quote_count = 0;
        char* tok = PyBytes_AsString(t->bytes);
        for (int i = 0; i < PyBytes_Size(t->bytes); i++) {
            if (tok[i] == '\'' || tok[i] == '\"') {
                if (quote_count == 0) {
                    first_quote = tok[i];
                }
                if (tok[i] == first_quote) {
                    quote_count++;
                }
            } else {
                break;
            }
        }
        col_offset += quote_count;
    }
    if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
-                                 t->lineno, NULL, NULL) < 0) {
+                                 lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(category)) {
            /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
@ -53,11 +91,12 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
               error location, if p->known_err_token is not set. */
            p->known_err_token = t;
            if (octal) {
-                RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
-                                   first_invalid_escape);
+                "invalid octal escape sequence '\\%.3s'", first_invalid_escape);
            }
            else {
-                RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
                "invalid escape sequence '\\%c'", c);
            }
        }
        Py_DECREF(msg);
@ -151,7 +190,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
    // HACK: later we can simply pass the line no, since we don't preserve the tokens
    // when we are decoding the string but we preserve the line numbers.
    if (v != NULL && first_invalid_escape != NULL && t != NULL) {
-        if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) {
            /* We have not decref u before because first_invalid_escape points
               inside u. */
            Py_XDECREF(u);
@ -173,7 +212,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
    }
    if (first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) {
            Py_DECREF(result);
            return NULL;
        }
@ -0,0 +1,2 @@
 Fix location for SyntaxErrors of invalid escapes in the tokenizer. Patch by
 Pablo Galindo.