bpo-32583: Fix possible crashing in builtin Unicode decoders (#5325)

When a custom decode error handler is used, the built-in decoders can write out of bounds and crash.
2025-10-12 01:43:12 +00:00 · 2018-01-31 20:48:05 +08:00 · 2018-01-31 20:48:05 +08:00 · 2c7fd46e11
commit 2c7fd46e11
parent 84521047e4
3 changed files with 74 additions and 2 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -4190,7 +4190,10 @@ unicode_decode_call_errorhandler_writer(
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
+    Py_ssize_t remain;
    PyObject *inputobj = NULL;
+    int need_to_grow = 0;
+    const char *new_inptr;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
@ -4221,6 +4224,7 @@ unicode_decode_call_errorhandler_writer(
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
+    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
@ -4238,6 +4242,19 @@ unicode_decode_call_errorhandler_writer(
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
+        need_to_grow = 1;
+    }
+    new_inptr = *input + newpos;
+    if (*inend - new_inptr > remain) {
+        /* We don't know the decoding algorithm here, so we make the
+           worst-case assumption that one byte decodes to one Unicode
+           character.  If one byte could decode to more than one Unicode
+           character, the decoder may still write out of bounds.  Is that
+           possible for the algorithms using this function? */
+        writer->min_length += *inend - new_inptr - remain;
+        need_to_grow = 1;
+    }
+    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@ -4247,7 +4264,7 @@ unicode_decode_call_errorhandler_writer(
        goto onError;

    *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
@ -5572,7 +5589,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 #endif

    /* Note: size will always be longer than the resulting Unicode
-       character count */
+       character count normally.  The error handler will take care of
+       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)