[3.12] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (#134337)

If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will became invalid after destroying that temporary bytes object. So we need other way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have such issue, because it does not use the error handlers registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58623) (cherry picked from commit 6279eb8c07)
2025-07-12 05:45:15 +00:00 · 2025-05-26 06:33:22 +03:00 · 2025-05-26 06:33:22 +03:00 · 4398b788ff
commit 4398b788ff
parent 310cd8943a
8 changed files with 194 additions and 57 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6046,13 +6046,15 @@ PyUnicode_AsUTF16String(PyObject *unicode)
 /* --- Unicode Escape Codec ----------------------------------------------- */

 PyObject *
-_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
+_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
-                               const char **first_invalid_escape)
+                               int *first_invalid_escape_char,
+                               const char **first_invalid_escape_ptr)
 {
    const char *starts = s;
+    const char *initial_starts = starts;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
@ -6061,7 +6063,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
    PyInterpreterState *interp = _PyInterpreterState_Get();

    // so we can remember if we've seen an invalid escape char or not
-    *first_invalid_escape = NULL;
+    *first_invalid_escape_char = -1;
+    *first_invalid_escape_ptr = NULL;

    if (size == 0) {
        if (consumed) {
@ -6149,9 +6152,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                }
            }
            if (ch > 0377) {
-                if (*first_invalid_escape == NULL) {
-                    *first_invalid_escape = s-3; /* Back up 3 chars, since we've
-                                                    already incremented s. */
+                if (*first_invalid_escape_char == -1) {
+                    *first_invalid_escape_char = ch;
+                    if (starts == initial_starts) {
+                        /* Back up 3 chars, since we've already incremented s. */
+                        *first_invalid_escape_ptr = s - 3;
+                    }
                }
            }
            WRITE_CHAR(ch);
@ -6252,9 +6258,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
            goto error;

        default:
-            if (*first_invalid_escape == NULL) {
-                *first_invalid_escape = s-1; /* Back up one char, since we've
-                                                already incremented s. */
+            if (*first_invalid_escape_char == -1) {
+                *first_invalid_escape_char = c;
+                if (starts == initial_starts) {
+                    /* Back up one char, since we've already incremented s. */
+                    *first_invalid_escape_ptr = s - 1;
+                }
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
@ -6293,24 +6302,40 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
    return NULL;
 }

+// Export for binary compatibility.
+PyObject *
+_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
+                               Py_ssize_t size,
+                               const char *errors,
+                               Py_ssize_t *consumed,
+                               const char **first_invalid_escape)
+{
+    int first_invalid_escape_char;
+    return _PyUnicode_DecodeUnicodeEscapeInternal2(
+            s, size, errors, consumed,
+            &first_invalid_escape_char,
+            first_invalid_escape);
+}
+
 PyObject *
 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              Py_ssize_t *consumed)
 {
-    const char *first_invalid_escape;
-    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
+    int first_invalid_escape_char;
+    const char *first_invalid_escape_ptr;
+    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
                                                      consumed,
-                                                      &first_invalid_escape);
+                                                      &first_invalid_escape_char,
+                                                      &first_invalid_escape_ptr);
    if (result == NULL)
        return NULL;
-    if (first_invalid_escape != NULL) {
-        unsigned char c = *first_invalid_escape;
-        if ('4' <= c && c <= '7') {
+    if (first_invalid_escape_char != -1) {
+        if (first_invalid_escape_char > 0xff) {
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
-                                 "invalid octal escape sequence '\\%.3s'",
-                                 first_invalid_escape) < 0)
+                                 "invalid octal escape sequence '\\%o'",
+                                 first_invalid_escape_char) < 0)
            {
                Py_DECREF(result);
                return NULL;
@ -6319,7 +6344,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
        else {
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                                 "invalid escape sequence '\\%c'",
-                                 c) < 0)
+                                 first_invalid_escape_char) < 0)
            {
                Py_DECREF(result);
                return NULL;