[3.12] gh-116042: Fix location for SyntaxErrors of invalid escapes in the tokenizer (GH-116049) (#130065)

(cherry picked from commit 56eda25633)
2025-07-24 11:44:31 +00:00 · 2025-02-13 01:42:24 +00:00 · 2025-02-13 01:42:24 +00:00 · 5e8a9eb13d
commit 5e8a9eb13d
parent 719d08cccf
5 changed files with 82 additions and 18 deletions
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@ -9,7 +9,7 @@
 //// STRING HANDLING FUNCTIONS ////

 static int
-warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
+warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t)
 {
    if (p->call_invalid_rules) {
        // Do not report warnings if we are in the second pass of the parser
@ -38,8 +38,46 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
    else {
        category = PyExc_DeprecationWarning;
    }
+
+    // Calculate the lineno and the col_offset of the invalid escape sequence
+    const char *start = buffer;
+    const char *end = first_invalid_escape;
+    int lineno = t->lineno;
+    int col_offset = t->col_offset;
+    while (start < end) {
+        if (*start == '\n') {
+            lineno++;
+            col_offset = 0;
+        }
+        else {
+            col_offset++;
+        }
+        start++;
+    }
+
+    // Count the number of quotes in the token
+    char first_quote = 0;
+    if (lineno == t->lineno) {
+        int quote_count = 0;
+        char* tok = PyBytes_AsString(t->bytes);
+        for (int i = 0; i < PyBytes_Size(t->bytes); i++) {
+            if (tok[i] == '\'' || tok[i] == '\"') {
+                if (quote_count == 0) {
+                    first_quote = tok[i];
+                }
+                if (tok[i] == first_quote) {
+                    quote_count++;
+                }
+            } else {
+                break;
+            }
+        }
+
+        col_offset += quote_count;
+    }
+
    if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
-                                 t->lineno, NULL, NULL) < 0) {
+                                 lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(category)) {
            /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
@ -50,11 +88,12 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
               error location, if p->known_err_token is not set. */
            p->known_err_token = t;
            if (octal) {
-                RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
-                                   first_invalid_escape);
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
+                "invalid octal escape sequence '\\%.3s'", first_invalid_escape);
            }
            else {
-                RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
+                "invalid escape sequence '\\%c'", c);
            }
        }
        Py_DECREF(msg);
@ -148,7 +187,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
    // HACK: later we can simply pass the line no, since we don't preserve the tokens
    // when we are decoding the string but we preserve the line numbers.
    if (v != NULL && first_invalid_escape != NULL && t != NULL) {
-        if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) {
            /* We have not decref u before because first_invalid_escape points
               inside u. */
            Py_XDECREF(u);
@ -170,7 +209,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
    }

    if (first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) {
            Py_DECREF(result);
            return NULL;
        }