bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)

This commit is contained in:
Serhiy Storchaka 2020-05-12 12:42:04 +03:00 committed by GitHub
parent f3a5b7ada0
commit 74ea6b5a75
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 92 additions and 45 deletions

View file

@@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
case E_TOKEN:
msg = "invalid token";
break;
case E_IDENTIFIER:
msg = "invalid character in identifier";
break;
case E_EOFS:
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
return -1;

View file

@@ -1101,25 +1101,53 @@ static int
verify_identifier(struct tok_state *tok)
{
/* Check the token bytes in [tok->start, tok->cur) as an identifier.
 *
 * Returns 1 when the scan accepts the whole decoded string. Returns 0
 * otherwise, after recording the failure either in tok->done or through
 * syntaxerror().
 *
 * NOTE(review): this listing is a diff hunk rendered without +/- markers,
 * so removed and added lines appear interleaved and the text does not
 * compile as shown. Lines that look like they belong to the pre-commit
 * version are flagged below -- confirm against the real diff.
 */
PyObject *s;
/* NOTE(review): `result` is only used by the PyUnicode_IsIdentifier() path
 * further down; presumably a removed (pre-commit) line. */
int result;
/* Bail out without touching tok->done when tok->decoding_erred is set. */
if (tok->decoding_erred)
return 0;
/* Decode the raw token bytes as UTF-8 (NULL selects the default error
 * handling). */
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
if (s == NULL) {
/* Tell a UnicodeDecodeError apart from any other failure. */
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
/* NOTE(review): the next three lines (PyErr_Clear, E_IDENTIFIER and
 * `} else {`) are presumably the removed pre-commit branch; the lines
 * after them set E_DECODE instead and leave the exception set. */
PyErr_Clear();
tok->done = E_IDENTIFIER;
} else {
tok->done = E_DECODE;
}
else {
tok->done = E_ERROR;
}
return 0;
}
/* NOTE(review): the PyUnicode_IsIdentifier() check below (through
 * `tok->done = E_IDENTIFIER;`) is presumably the removed pre-commit
 * implementation, which only reported a generic E_IDENTIFIER status. */
result = PyUnicode_IsIdentifier(s);
Py_DECREF(s);
if (result == 0) {
tok->done = E_IDENTIFIER;
/* Judging by how the result is used below, _PyUnicode_ScanIdentifier()
 * yields a negative value on failure and otherwise an index into s; an
 * index smaller than the length of s marks the offending character. */
Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
if (invalid < 0) {
Py_DECREF(s);
tok->done = E_ERROR;
return 0;
}
/* NOTE(review): presumably the tail of the removed pre-commit version. */
return result;
assert(PyUnicode_GET_LENGTH(s) > 0);
if (invalid < PyUnicode_GET_LENGTH(s)) {
/* Copy the offending code point out now; s is released or replaced
 * before the error message is formatted. */
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
/* When the offending character is not the last one in the token, move
 * tok->cur back so that it points just past that character. */
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
/* Determine the offset in UTF-8 encoded input */
/* Take the prefix up to and including the offending character and
 * re-encode it as UTF-8; the byte length of that encoding is the
 * offset from tok->start. Each Py_SETREF stores the call's result in s,
 * so s is NULL if either call fails. */
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
if (s != NULL) {
Py_SETREF(s, PyUnicode_AsUTF8String(s));
}
if (s == NULL) {
tok->done = E_ERROR;
return 0;
}
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
}
Py_DECREF(s);
// PyUnicode_FromFormatV() does not support %X
/* Pre-format the code point as at least four upper-case hex digits; the
 * buffer holds up to eight digits plus the terminating NUL. */
char hex[9];
snprintf(hex, sizeof(hex), "%04X", ch);
/* Show the character itself only when it is printable. */
if (Py_UNICODE_ISPRINTABLE(ch)) {
syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
}
else {
syntaxerror(tok, "invalid non-printable character U+%s", hex);
}
return 0;
}
/* The scan consumed the whole string: the identifier is accepted. */
Py_DECREF(s);
return 1;
}
static int