mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944) (GH-28953)
They now support splitting escape sequences between input chunks.
Add a third parameter, "final", to codecs.raw_unicode_escape_decode().
It is True by default to match the former behavior.
(cherry picked from commit 39aa98346d)
This commit is contained in:
parent
7c722e32bf
commit
6848602806
8 changed files with 444 additions and 352 deletions
|
@ -6308,8 +6308,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
|
|||
unsigned char c = (unsigned char) *s++;
|
||||
Py_UCS4 ch;
|
||||
int count;
|
||||
Py_ssize_t startinpos;
|
||||
Py_ssize_t endinpos;
|
||||
const char *message;
|
||||
|
||||
#define WRITE_ASCII_CHAR(ch) \
|
||||
|
@ -6336,7 +6334,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
|
|||
continue;
|
||||
}
|
||||
|
||||
startinpos = s - starts - 1;
|
||||
Py_ssize_t startinpos = s - starts - 1;
|
||||
/* \ - Escapes */
|
||||
if (s >= end) {
|
||||
message = "\\ at end of string";
|
||||
|
@ -6483,8 +6481,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
|
|||
*consumed = startinpos;
|
||||
break;
|
||||
}
|
||||
error:
|
||||
endinpos = s-starts;
|
||||
error:;
|
||||
Py_ssize_t endinpos = s-starts;
|
||||
writer.min_length = end - s + writer.pos;
|
||||
if (unicode_decode_call_errorhandler_writer(
|
||||
errors, &errorHandler,
|
||||
|
@ -6679,9 +6677,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
|
|||
/* --- Raw Unicode Escape Codec ------------------------------------------- */
|
||||
|
||||
PyObject *
|
||||
PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors)
|
||||
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors,
|
||||
Py_ssize_t *consumed)
|
||||
{
|
||||
const char *starts = s;
|
||||
_PyUnicodeWriter writer;
|
||||
|
@ -6690,6 +6689,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||
PyObject *exc = NULL;
|
||||
|
||||
if (size == 0) {
|
||||
if (consumed) {
|
||||
*consumed = 0;
|
||||
}
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
|
@ -6708,8 +6710,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||
unsigned char c = (unsigned char) *s++;
|
||||
Py_UCS4 ch;
|
||||
int count;
|
||||
Py_ssize_t startinpos;
|
||||
Py_ssize_t endinpos;
|
||||
const char *message;
|
||||
|
||||
#define WRITE_CHAR(ch) \
|
||||
|
@ -6724,11 +6724,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||
} while(0)
|
||||
|
||||
/* Non-escape characters are interpreted as Unicode ordinals */
|
||||
if (c != '\\' || s >= end) {
|
||||
if (c != '\\' || (s >= end && !consumed)) {
|
||||
WRITE_CHAR(c);
|
||||
continue;
|
||||
}
|
||||
|
||||
Py_ssize_t startinpos = s - starts - 1;
|
||||
/* \ - Escapes */
|
||||
if (s >= end) {
|
||||
assert(consumed);
|
||||
// Set message to silence the compiler warning.
|
||||
// Actually it is never used.
|
||||
message = "\\ at end of string";
|
||||
goto incomplete;
|
||||
}
|
||||
|
||||
c = (unsigned char) *s++;
|
||||
if (c == 'u') {
|
||||
count = 4;
|
||||
|
@ -6744,10 +6754,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||
WRITE_CHAR(c);
|
||||
continue;
|
||||
}
|
||||
startinpos = s - starts - 2;
|
||||
|
||||
/* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
|
||||
for (ch = 0; count && s < end; ++s, --count) {
|
||||
for (ch = 0; count; ++s, --count) {
|
||||
if (s >= end) {
|
||||
goto incomplete;
|
||||
}
|
||||
c = (unsigned char)*s;
|
||||
ch <<= 4;
|
||||
if (c >= '0' && c <= '9') {
|
||||
|
@ -6760,18 +6772,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||
ch += c - ('A' - 10);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
if (!count) {
|
||||
if (ch <= MAX_UNICODE) {
|
||||
WRITE_CHAR(ch);
|
||||
continue;
|
||||
}
|
||||
if (ch > MAX_UNICODE) {
|
||||
message = "\\Uxxxxxxxx out of range";
|
||||
goto error;
|
||||
}
|
||||
WRITE_CHAR(ch);
|
||||
continue;
|
||||
|
||||
endinpos = s-starts;
|
||||
incomplete:
|
||||
if (consumed) {
|
||||
*consumed = startinpos;
|
||||
break;
|
||||
}
|
||||
error:;
|
||||
Py_ssize_t endinpos = s-starts;
|
||||
writer.min_length = end - s + writer.pos;
|
||||
if (unicode_decode_call_errorhandler_writer(
|
||||
errors, &errorHandler,
|
||||
|
@ -6793,7 +6810,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||
Py_XDECREF(errorHandler);
|
||||
Py_XDECREF(exc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Decode the `size` bytes at `s` with the raw-unicode-escape codec in a
 * single call.
 *
 * Thin wrapper: forwards `s`, `size` and `errors` unchanged to
 * _PyUnicode_DecodeRawUnicodeEscapeStateful() with `consumed` == NULL and
 * returns whatever that function returns.
 *
 * With `consumed` == NULL the stateful decoder does not defer truncated
 * input: a trailing backslash is written out as an ordinary character, and
 * a \u / \U escape cut short by the end of the buffer takes the error path
 * (handled according to `errors`) instead of being reported as unconsumed.
 */
PyObject *
|
||||
PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors)
|
||||
{
|
||||
return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue