bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944) (GH-28953)

They now support escape sequences that are split between input chunks. Add a third parameter, "final", to codecs.raw_unicode_escape_decode(). It is True by default to match the former behavior. (cherry picked from commit 39aa98346d)
2025-08-04 00:48:58 +00:00 · 2021-10-14 21:23:52 +03:00 · 2021-10-14 21:23:52 +03:00 · 6848602806
commit 6848602806
parent 7c722e32bf
8 changed files with 444 additions and 352 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6308,8 +6308,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
-        Py_ssize_t startinpos;
-        Py_ssize_t endinpos;
        const char *message;

 #define WRITE_ASCII_CHAR(ch)                                                  \
@ -6336,7 +6334,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
            continue;
        }

-        startinpos = s - starts - 1;
+        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
@ -6483,8 +6481,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
            *consumed = startinpos;
            break;
        }
-      error:
-        endinpos = s-starts;
+      error:;
+        Py_ssize_t endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
@ -6679,9 +6677,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
 /* --- Raw Unicode Escape Codec ------------------------------------------- */

 PyObject *
-PyUnicode_DecodeRawUnicodeEscape(const char *s,
-                                 Py_ssize_t size,
-                                 const char *errors)
+_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
+                                          Py_ssize_t size,
+                                          const char *errors,
+                                          Py_ssize_t *consumed)
 {
    const char *starts = s;
    _PyUnicodeWriter writer;
@ -6690,6 +6689,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
    PyObject *exc = NULL;

    if (size == 0) {
+        if (consumed) {
+            *consumed = 0;
+        }
        _Py_RETURN_UNICODE_EMPTY();
    }

@ -6708,8 +6710,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
-        Py_ssize_t startinpos;
-        Py_ssize_t endinpos;
        const char *message;

 #define WRITE_CHAR(ch)                                                        \
@ -6724,11 +6724,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
-        if (c != '\\' || s >= end) {
+        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }

+        Py_ssize_t startinpos = s - starts - 1;
+        /* \ - Escapes */
+        if (s >= end) {
+            assert(consumed);
+            // Set message to silence a compiler warning.
+            // It is never actually used.
+            message = "\\ at end of string";
+            goto incomplete;
+        }
+
        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
@ -6744,10 +6754,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
            WRITE_CHAR(c);
            continue;
        }
-        startinpos = s - starts - 2;

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
-        for (ch = 0; count && s < end; ++s, --count) {
+        for (ch = 0; count; ++s, --count) {
+            if (s >= end) {
+                goto incomplete;
+            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
@ -6760,18 +6772,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
                ch += c - ('A' - 10);
            }
            else {
-                break;
+                goto error;
            }
        }
-        if (!count) {
-            if (ch <= MAX_UNICODE) {
-                WRITE_CHAR(ch);
-                continue;
-            }
+        if (ch > MAX_UNICODE) {
            message = "\\Uxxxxxxxx out of range";
+            goto error;
        }
+        WRITE_CHAR(ch);
+        continue;

-        endinpos = s-starts;
+      incomplete:
+        if (consumed) {
+            *consumed = startinpos;
+            break;
+        }
+      error:;
+        Py_ssize_t endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
@ -6793,7 +6810,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
+}

+PyObject *
+PyUnicode_DecodeRawUnicodeEscape(const char *s,
+                                 Py_ssize_t size,
+                                 const char *errors)
+{   /* consumed == NULL: never defer a truncated escape to a later call */
+    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
 }