[3.9] bpo-45461: Fix IncrementalDecoder and StreamReader in the "unicode-escape" codec (GH-28939) (GH-28945)

They now support escape sequences that are split between input chunks.

Add a third parameter, "final", to codecs.unicode_escape_decode().
It is True by default to match the former behavior.
(cherry picked from commit c96d1546b1)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Serhiy Storchaka 2021-10-14 20:03:29 +03:00 committed by GitHub
parent 38fadbc5b9
commit 7c722e32bf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 9836 additions and 4890 deletions

View file

@ -6271,9 +6271,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char **first_invalid_escape)
{
const char *starts = s;
@ -6286,6 +6287,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
*first_invalid_escape = NULL;
if (size == 0) {
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY();
}
/* Escaped strings will always be longer than the resulting
@ -6336,7 +6340,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
/* \ - Escapes */
if (s >= end) {
message = "\\ at end of string";
goto error;
goto incomplete;
}
c = (unsigned char) *s++;
@ -6390,7 +6394,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
count = 8;
message = "truncated \\UXXXXXXXX escape";
hexescape:
for (ch = 0; count && s < end; ++s, --count) {
for (ch = 0; count; ++s, --count) {
if (s >= end) {
goto incomplete;
}
c = (unsigned char)*s;
ch <<= 4;
if (c >= '0' && c <= '9') {
@ -6403,12 +6410,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
ch += c - ('A' - 10);
}
else {
break;
goto error;
}
}
if (count) {
goto error;
}
/* when we get here, ch is a 32-bit unicode character */
if (ch > MAX_UNICODE) {
@ -6435,14 +6439,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
}
message = "malformed \\N character escape";
if (s < end && *s == '{') {
if (s >= end) {
goto incomplete;
}
if (*s == '{') {
const char *start = ++s;
size_t namelen;
/* look for the closing brace */
while (s < end && *s != '}')
s++;
if (s >= end) {
goto incomplete;
}
namelen = s - start;
if (namelen && s < end) {
if (namelen) {
/* found a name. look it up in the unicode database */
s++;
ch = 0xffffffff; /* in case 'getcode' messes up */
@ -6468,6 +6478,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
continue;
}
incomplete:
if (consumed) {
*consumed = startinpos;
break;
}
error:
endinpos = s-starts;
writer.min_length = end - s + writer.pos;
@ -6496,12 +6511,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
}
PyObject *
PyUnicode_DecodeUnicodeEscape(const char *s,
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
Py_ssize_t size,
const char *errors)
const char *errors,
Py_ssize_t *consumed)
{
const char *first_invalid_escape;
PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
consumed,
&first_invalid_escape);
if (result == NULL)
return NULL;
@ -6516,6 +6533,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
return result;
}
/* Decode `size` bytes of unicode-escape encoded data starting at `s`,
   using the error handler named by `errors`.

   Thin wrapper around _PyUnicode_DecodeUnicodeEscapeStateful() that passes
   NULL for `consumed`.  With no `consumed` pointer the decoder's
   "incomplete" path falls through to its error handling, so an escape
   sequence truncated at the end of the input is treated as an error
   rather than being left unconsumed for a later chunk. */
PyObject *
PyUnicode_DecodeUnicodeEscape(const char *s,
Py_ssize_t size,
const char *errors)
{
return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
}
/* Return a Unicode-Escape string version of the Unicode object. */
PyObject *