[3.9] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (#134346)

* [3.9] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after destroying that temporary bytes object. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have such an issue, because it does not use the error handlers registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58623) (cherry picked from commit 6279eb8c07) (cherry picked from commit a75953b347) (cherry picked from commit 0c33e5baed) (cherry picked from commit 8b528cacbb) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-08-04 08:59:19 +00:00 · 2025-06-02 18:58:01 +03:00 · 2025-06-02 18:58:01 +03:00 · 8d35fd1b34
commit 8d35fd1b34
parent d4df3c55e4
8 changed files with 163 additions and 40 deletions
--- a/Include/cpython/bytesobject.h
+++ b/Include/cpython/bytesobject.h
@ -25,6 +25,10 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
    int use_bytearray);
 /* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
 PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
                                             const char *,
                                             int *, const char **);
 // Export for binary compatibility.
 PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
                                             const char *, const char **);
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@ -866,6 +866,19 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
 );
 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
   chars. */
 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
    const char *string,     /* Unicode-Escape encoded string */
    Py_ssize_t length,      /* size of string */
    const char *errors,     /* error handling */
    Py_ssize_t *consumed,   /* bytes consumed */
    int *first_invalid_escape_char, /* on return, if not -1, contains the first
                                       invalid escaped char (<= 0xff) or invalid
                                       octal escape (> 0xff) in string. */
    const char **first_invalid_escape_ptr); /* on return, if not NULL, may
                                        point to the first invalid escaped
                                        char in string.
                                        May be NULL if errors is not NULL. */
 // Export for binary compatibility.
 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@ -1124,7 +1124,7 @@ class CodecCallbackTest(unittest.TestCase):
            text = 'abc<def>ghi'*n
            text.translate(charmap)
-    def test_mutatingdecodehandler(self):
+    def test_mutating_decode_handler(self):
        baddata = [
            ("ascii", b"\xff"),
            ("utf-7", b"++"),
@ -1159,6 +1159,40 @@ class CodecCallbackTest(unittest.TestCase):
        for (encoding, data) in baddata:
            self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
    def test_mutating_decode_handler_unicode_escape(self):
        decode = codecs.unicode_escape_decode
        def mutating(exc):
            if isinstance(exc, UnicodeDecodeError):
                r = data.get(exc.object[:exc.end])
                if r is not None:
                    exc.object = r[0] + exc.object[exc.end:]
                    return ('\u0404', r[1])
            raise AssertionError("don't know how to handle %r" % exc)
        codecs.register_error('test.mutating2', mutating)
        data = {
            br'\x0': (b'\\', 0),
            br'\x3': (b'xxx\\', 3),
            br'\x5': (b'x\\', 1),
        }
        def check(input, expected, msg):
            with self.assertWarns(DeprecationWarning) as cm:
                self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
            self.assertIn(msg, str(cm.warning))
        check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
        check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
        check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
        check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
        check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
        check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
        check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
        check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
        check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
        check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
    # issue32583
    def test_crashing_decode_handler(self):
        # better generating one more character to fill the extra space slot
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -1178,20 +1178,32 @@ class EscapeDecodeTest(unittest.TestCase):
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
    def test_warnings(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
                        r"invalid escape sequence '\\%c'" % i):
                    check(b"\\" + b, b"\\" + b)
-            with self.assertWarns(DeprecationWarning):
+            with self.assertWarnsRegex(DeprecationWarning,
                    r"invalid escape sequence '\\%c'" % (i-32)):
                check(b"\\" + b.upper(), b"\\" + b.upper())
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
                r"invalid escape sequence '\\8'"):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
                r"invalid escape sequence '\\\xfa'") as cm:
            check(b"\\\xfa", b"\\\xfa")
        with self.assertWarnsRegex(DeprecationWarning,
                r"invalid escape sequence '\\z'"):
            self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
    def test_errors(self):
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
@ -2393,20 +2405,31 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase):
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
    def test_decode_warnings(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
                        r"invalid escape sequence '\\%c'" % i):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarnsRegex(DeprecationWarning,
                        r"invalid escape sequence '\\%c'" % (i-32)):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
                r"invalid escape sequence '\\8'"):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
-        with self.assertWarns(DeprecationWarning):
+        with self.assertWarnsRegex(DeprecationWarning,
                r"invalid escape sequence '\\\xfa'") as cm:
            check(b"\\\xfa", "\\\xfa")
        with self.assertWarnsRegex(DeprecationWarning,
                r"invalid escape sequence '\\z'"):
            self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
--- a/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
+++ b/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
@ -0,0 +1,2 @@
 Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
 handler.
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@ -1060,10 +1060,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
 }
 /* Unescape a backslash-escaped string. */
-PyObject *_PyBytes_DecodeEscape(const char *s,
+PyObject *_PyBytes_DecodeEscape2(const char *s,
                                Py_ssize_t len,
                                const char *errors,
-                                const char **first_invalid_escape)
+                                int *first_invalid_escape_char,
                                const char **first_invalid_escape_ptr)
 {
    int c;
    char *p;
@ -1077,7 +1078,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
        return NULL;
    writer.overallocate = 1;
-    *first_invalid_escape = NULL;
+    *first_invalid_escape_char = -1;
    *first_invalid_escape_ptr = NULL;
    end = s + len;
    while (s < end) {
@ -1152,9 +1154,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
            break;
        default:
-            if (*first_invalid_escape == NULL) {
+            if (*first_invalid_escape_char == -1) {
-                *first_invalid_escape = s-1; /* Back up one char, since we've
+                *first_invalid_escape_char = (unsigned char)s[-1];
-                                                already incremented s. */
+                /* Back up one char, since we've already incremented s. */
                *first_invalid_escape_ptr = s - 1;
            }
            *p++ = '\\';
            s--;
@ -1168,21 +1171,36 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
    return NULL;
 }
 // Export for binary compatibility.
 PyObject *_PyBytes_DecodeEscape(const char *s,
                                Py_ssize_t len,
                                const char *errors,
                                const char **first_invalid_escape)
 {
    int first_invalid_escape_char;
    return _PyBytes_DecodeEscape2(
            s, len, errors,
            &first_invalid_escape_char,
            first_invalid_escape);
 }
 PyObject *PyBytes_DecodeEscape(const char *s,
                                Py_ssize_t len,
                                const char *errors,
                                Py_ssize_t Py_UNUSED(unicode),
                                const char *Py_UNUSED(recode_encoding))
 {
-    const char* first_invalid_escape;
+    int first_invalid_escape_char;
-    PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
+    const char *first_invalid_escape_ptr;
-                                             &first_invalid_escape);
+    PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
                                             &first_invalid_escape_char,
                                             &first_invalid_escape_ptr);
    if (result == NULL)
        return NULL;
-    if (first_invalid_escape != NULL) {
+    if (first_invalid_escape_char != -1) {
        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                             "invalid escape sequence '\\%c'",
-                             (unsigned char)*first_invalid_escape) < 0) {
+                             first_invalid_escape_char) < 0) {
            Py_DECREF(result);
            return NULL;
        }
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6278,20 +6278,23 @@ PyUnicode_AsUTF16String(PyObject *unicode)
 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
 PyObject *
-_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
+_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
-                               const char **first_invalid_escape)
+                               int *first_invalid_escape_char,
                               const char **first_invalid_escape_ptr)
 {
    const char *starts = s;
    const char *initial_starts = starts;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    // so we can remember if we've seen an invalid escape char or not
-    *first_invalid_escape = NULL;
+    *first_invalid_escape_char = -1;
    *first_invalid_escape_ptr = NULL;
    if (size == 0) {
        if (consumed) {
@ -6474,9 +6477,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
            goto error;
        default:
-            if (*first_invalid_escape == NULL) {
+            if (*first_invalid_escape_char == -1) {
-                *first_invalid_escape = s-1; /* Back up one char, since we've
+                *first_invalid_escape_char = c;
-                                                already incremented s. */
+                if (starts == initial_starts) {
                    /* Back up one char, since we've already incremented s. */
                    *first_invalid_escape_ptr = s - 1;
                }
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
@ -6515,22 +6521,39 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
    return NULL;
 }
 // Export for binary compatibility.
 PyObject *
 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
                               const char **first_invalid_escape)
 {
    int first_invalid_escape_char;
    return _PyUnicode_DecodeUnicodeEscapeInternal2(
            s, size, errors, consumed,
            &first_invalid_escape_char,
            first_invalid_escape);
 }
 PyObject *
 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              Py_ssize_t *consumed)
 {
-    const char *first_invalid_escape;
+    int first_invalid_escape_char;
-    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
+    const char *first_invalid_escape_ptr;
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
                                                      consumed,
-                                                      &first_invalid_escape);
+                                                      &first_invalid_escape_char,
                                                      &first_invalid_escape_ptr);
    if (result == NULL)
        return NULL;
-    if (first_invalid_escape != NULL) {
+    if (first_invalid_escape_char != -1) {
        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
                             "invalid escape sequence '\\%c'",
-                             (unsigned char)*first_invalid_escape) < 0) {
+                             first_invalid_escape_char) < 0) {
            Py_DECREF(result);
            return NULL;
        }
--- a/Parser/pegen/parse_string.c
+++ b/Parser/pegen/parse_string.c
@ -119,12 +119,15 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
    len = p - buf;
    s = buf;
-    const char *first_invalid_escape;
+    int first_invalid_escape_char;
-    v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
+    const char *first_invalid_escape_ptr;
    v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
                                                &first_invalid_escape_char,
                                                &first_invalid_escape_ptr);
-    if (v != NULL && first_invalid_escape != NULL) {
+    if (v != NULL && first_invalid_escape_ptr != NULL) {
-        if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(parser, *first_invalid_escape_ptr, t) < 0) {
-            /* We have not decref u before because first_invalid_escape points
+            /* We have not decref u before because first_invalid_escape_ptr points
               inside u. */
            Py_XDECREF(u);
            Py_DECREF(v);
@ -138,14 +141,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
 static PyObject *
 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
 {
-    const char *first_invalid_escape;
+    int first_invalid_escape_char;
-    PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
+    const char *first_invalid_escape_ptr;
    PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
                                              &first_invalid_escape_char,
                                              &first_invalid_escape_ptr);
    if (result == NULL) {
        return NULL;
    }
-    if (first_invalid_escape != NULL) {
+    if (first_invalid_escape_ptr != NULL) {
-        if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(p, *first_invalid_escape_ptr, t) < 0) {
            Py_DECREF(result);
            return NULL;
        }
		`@ -0,0 +1,2 @@`
							`Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error`
							`handler.`