[3.10] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (GH-134345)

If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after that temporary bytes object is destroyed. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have such an issue, because it does not use the error handler registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58623) (cherry picked from commit 6279eb8c07) (cherry picked from commit a75953b347) (cherry picked from commit 0c33e5baed) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-07-12 13:55:34 +00:00 · 2025-06-02 18:55:48 +03:00 · 2025-06-02 18:55:48 +03:00 · ab9893c406
commit ab9893c406
parent f85e71a008
8 changed files with 163 additions and 40 deletions
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@ -1089,10 +1089,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
 }

 /* Unescape a backslash-escaped string. */
-PyObject *_PyBytes_DecodeEscape(const char *s,
+PyObject *_PyBytes_DecodeEscape2(const char *s,
                                Py_ssize_t len,
                                const char *errors,
-                                const char **first_invalid_escape)
+                                int *first_invalid_escape_char,
+                                const char **first_invalid_escape_ptr)
 {
    int c;
    char *p;
@ -1106,7 +1107,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
        return NULL;
    writer.overallocate = 1;

-    *first_invalid_escape = NULL;
+    *first_invalid_escape_char = -1;
+    *first_invalid_escape_ptr = NULL;

    end = s + len;
    while (s < end) {
@ -1181,9 +1183,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
            break;

        default:
-            if (*first_invalid_escape == NULL) {
-                *first_invalid_escape = s-1; /* Back up one char, since we've
-                                                already incremented s. */
+            if (*first_invalid_escape_char == -1) {
+                *first_invalid_escape_char = (unsigned char)s[-1];
+                /* Back up one char, since we've already incremented s. */
+                *first_invalid_escape_ptr = s - 1;
            }
            *p++ = '\\';
            s--;
@ -1197,21 +1200,36 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
    return NULL;
 }

+// Export for binary compatibility: forwards to _PyBytes_DecodeEscape2() and reports only the first-invalid-escape pointer.
+PyObject *_PyBytes_DecodeEscape(const char *s,
+                                Py_ssize_t len,
+                                const char *errors,
+                                const char **first_invalid_escape)
+{
+    int first_invalid_escape_char;  /* written by the callee; not read here */
+    return _PyBytes_DecodeEscape2(
+            s, len, errors,
+            &first_invalid_escape_char,
+            first_invalid_escape);
+}
+
 PyObject *PyBytes_DecodeEscape(const char *s,
                                Py_ssize_t len,
                                const char *errors,
                                Py_ssize_t Py_UNUSED(unicode),
                                const char *Py_UNUSED(recode_encoding))
 {
-    const char* first_invalid_escape;
-    PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
-                                             &first_invalid_escape);
+    int first_invalid_escape_char;
+    const char *first_invalid_escape_ptr;
+    PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
+                                             &first_invalid_escape_char,
+                                             &first_invalid_escape_ptr);
    if (result == NULL)
        return NULL;
-    if (first_invalid_escape != NULL) {
+    if (first_invalid_escape_char != -1) {
        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                             "invalid escape sequence '\\%c'",
-                             (unsigned char)*first_invalid_escape) < 0) {
+                             first_invalid_escape_char) < 0) {
            Py_DECREF(result);
            return NULL;
        }