bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)

2025-07-29 14:15:07 +00:00 · 2020-05-12 12:42:04 +03:00 · 2020-05-12 12:42:04 +03:00 · 74ea6b5a75
commit 74ea6b5a75
parent f3a5b7ada0
10 changed files with 92 additions and 45 deletions
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
        case E_TOKEN:
            msg = "invalid token";
            break;
-        case E_IDENTIFIER:
-            msg = "invalid character in identifier";
-            break;
        case E_EOFS:
            RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
            return -1;
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1101,25 +1101,53 @@ static int
 verify_identifier(struct tok_state *tok)
 {
    PyObject *s;
-    int result;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
-            PyErr_Clear();
-            tok->done = E_IDENTIFIER;
-        } else {
+            tok->done = E_DECODE;
+        }
+        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
-    result = PyUnicode_IsIdentifier(s);
-    Py_DECREF(s);
-    if (result == 0) {
-        tok->done = E_IDENTIFIER;
+    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
+    if (invalid < 0) {
+        Py_DECREF(s);
+        tok->done = E_ERROR;
+        return 0;
    }
-    return result;
+    assert(PyUnicode_GET_LENGTH(s) > 0);
+    if (invalid < PyUnicode_GET_LENGTH(s)) {
+        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
+        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
+            /* Determine the offset in UTF-8 encoded input */
+            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
+            if (s != NULL) {
+                Py_SETREF(s, PyUnicode_AsUTF8String(s));
+            }
+            if (s == NULL) {
+                tok->done = E_ERROR;
+                return 0;
+            }
+            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
+        }
+        Py_DECREF(s);
+        // PyUnicode_FromFormatV() does not support %X
+        char hex[9];
+        snprintf(hex, sizeof(hex), "%04X", ch);
+        if (Py_UNICODE_ISPRINTABLE(ch)) {
+            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+        }
+        else {
+            syntaxerror(tok, "invalid non-printable character U+%s", hex);
+        }
+        return 0;
+    }
+    Py_DECREF(s);
+    return 1;
 }

 static int