bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944) (GH-28953)

They now support escape sequences that are split between input chunks.

Add a third parameter, "final", to codecs.raw_unicode_escape_decode().
It is True by default to match the former behavior.

(cherry picked from commit 39aa98346d)
This commit is contained in:
Serhiy Storchaka 2021-10-14 21:23:52 +03:00 committed by GitHub
parent 7c722e32bf
commit 6848602806
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 444 additions and 352 deletions

View file

@ -6308,8 +6308,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *message;
#define WRITE_ASCII_CHAR(ch) \
@ -6336,7 +6334,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
continue;
}
startinpos = s - starts - 1;
Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
message = "\\ at end of string";
@ -6483,8 +6481,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
*consumed = startinpos;
break;
}
error:
endinpos = s-starts;
error:;
Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
@ -6679,9 +6677,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
/* --- Raw Unicode Escape Codec ------------------------------------------- */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_ssize_t size,
const char *errors)
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
const char *starts = s;
_PyUnicodeWriter writer;
@ -6690,6 +6689,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *exc = NULL;
if (size == 0) {
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY();
}
@ -6708,8 +6710,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *message;
#define WRITE_CHAR(ch) \
@ -6724,11 +6724,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
} while(0)
/* Non-escape characters are interpreted as Unicode ordinals */
if (c != '\\' || s >= end) {
if (c != '\\' || (s >= end && !consumed)) {
WRITE_CHAR(c);
continue;
}
Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
assert(consumed);
// Set message to silence a compiler warning.
// It is never actually used.
message = "\\ at end of string";
goto incomplete;
}
c = (unsigned char) *s++;
if (c == 'u') {
count = 4;
@ -6744,10 +6754,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
WRITE_CHAR(c);
continue;
}
startinpos = s - starts - 2;
/* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
for (ch = 0; count && s < end; ++s, --count) {
for (ch = 0; count; ++s, --count) {
if (s >= end) {
goto incomplete;
}
c = (unsigned char)*s;
ch <<= 4;
if (c >= '0' && c <= '9') {
@ -6760,18 +6772,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
ch += c - ('A' - 10);
}
else {
break;
goto error;
}
}
if (!count) {
if (ch <= MAX_UNICODE) {
WRITE_CHAR(ch);
continue;
}
if (ch > MAX_UNICODE) {
message = "\\Uxxxxxxxx out of range";
goto error;
}
WRITE_CHAR(ch);
continue;
endinpos = s-starts;
incomplete:
if (consumed) {
*consumed = startinpos;
break;
}
error:;
Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
@ -6793,7 +6810,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
}
/* Decode a raw-unicode-escape byte string in one non-incremental call.

   Thin wrapper: forwards s, size and errors unchanged to
   _PyUnicode_DecodeRawUnicodeEscapeStateful() and returns whatever that
   function returns.  No consumed-byte count is requested or reported. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_ssize_t size,
const char *errors)
{
/* NULL is passed for the "consumed" out-parameter, so the stateful decoder
   has nowhere to report a partial-input position and processes the buffer
   as complete input. */
return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
}