[3.10] bpo-45461: Fix IncrementalDecoder and StreamReader in the "unicode-escape" codec (GH-28939) (GH-28943)

They now support splitting escape sequences between input chunks.

Add a third parameter, "final", to codecs.unicode_escape_decode().
It is True by default to match the former behavior.
(cherry picked from commit c96d1546b1)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2021-10-14 10:02:20 -07:00 committed by GitHub
parent 70b150a366
commit 0bff4ccbfd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 10949 additions and 5895 deletions

View file

@ -6408,9 +6408,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char **first_invalid_escape)
{
const char *starts = s;
@ -6423,6 +6424,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
*first_invalid_escape = NULL;
if (size == 0) {
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY();
}
/* Escaped strings will always be longer than the resulting
@ -6473,7 +6477,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
/* \ - Escapes */
if (s >= end) {
message = "\\ at end of string";
goto error;
goto incomplete;
}
c = (unsigned char) *s++;
@ -6527,7 +6531,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
count = 8;
message = "truncated \\UXXXXXXXX escape";
hexescape:
for (ch = 0; count && s < end; ++s, --count) {
for (ch = 0; count; ++s, --count) {
if (s >= end) {
goto incomplete;
}
c = (unsigned char)*s;
ch <<= 4;
if (c >= '0' && c <= '9') {
@ -6540,12 +6547,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
ch += c - ('A' - 10);
}
else {
break;
goto error;
}
}
if (count) {
goto error;
}
/* when we get here, ch is a 32-bit unicode character */
if (ch > MAX_UNICODE) {
@ -6572,14 +6576,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
}
message = "malformed \\N character escape";
if (s < end && *s == '{') {
if (s >= end) {
goto incomplete;
}
if (*s == '{') {
const char *start = ++s;
size_t namelen;
/* look for the closing brace */
while (s < end && *s != '}')
s++;
if (s >= end) {
goto incomplete;
}
namelen = s - start;
if (namelen && s < end) {
if (namelen) {
/* found a name. look it up in the unicode database */
s++;
ch = 0xffffffff; /* in case 'getcode' messes up */
@ -6605,6 +6615,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
continue;
}
incomplete:
if (consumed) {
*consumed = startinpos;
break;
}
error:
endinpos = s-starts;
writer.min_length = end - s + writer.pos;
@ -6633,12 +6648,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
}
PyObject *
PyUnicode_DecodeUnicodeEscape(const char *s,
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
Py_ssize_t size,
const char *errors)
const char *errors,
Py_ssize_t *consumed)
{
const char *first_invalid_escape;
PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
consumed,
&first_invalid_escape);
if (result == NULL)
return NULL;
@ -6653,6 +6670,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
return result;
}
/* Decode the `size` bytes starting at `s` as Unicode-Escape data, using
   `errors` as the error-handling mode.

   Thin wrapper: forwards to _PyUnicode_DecodeUnicodeEscapeStateful() with
   a NULL `consumed` pointer and returns its result unchanged.  In the
   decoder shown in this change, the "incomplete" path only stops early
   when `consumed` is non-NULL; with NULL it falls through to the error
   path, so a truncated escape sequence is handled as a decoding error
   rather than being left unconsumed for a later chunk. */
PyObject *
PyUnicode_DecodeUnicodeEscape(const char *s,
Py_ssize_t size,
const char *errors)
{
return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
}
/* Return a Unicode-Escape string version of the Unicode object. */
PyObject *