[3.13] gh-123378: fix a crash in UnicodeError.__str__ (GH-124935) (#125099)

gh-123378: fix a crash in `UnicodeError.__str__` (GH-124935) (cherry picked from commit ba14dfafd9) Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
2025-07-23 11:15:24 +00:00 · 2024-10-08 14:06:57 +02:00 · 2024-10-08 14:06:57 +02:00 · 84991153da
commit 84991153da
parent 4eab6e8d29
3 changed files with 93 additions and 45 deletions
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@ -8,6 +8,7 @@ import pickle
 import weakref
 import errno
 from codecs import BOM_UTF8
 from itertools import product
 from textwrap import dedent
 from test.support import (captured_stderr, check_impl_detail,
@ -1336,6 +1337,29 @@ class ExceptionTests(unittest.TestCase):
        for klass in klasses:
            self.assertEqual(str(klass.__new__(klass)), "")
    def test_unicode_error_str_does_not_crash(self):
        # Test that str(UnicodeError(...)) does not crash.
        # See https://github.com/python/cpython/issues/123378.
        for start, end, objlen in product(
            range(-5, 5),
            range(-5, 5),
            range(7),
        ):
            obj = 'a' * objlen
            with self.subTest('encode', objlen=objlen, start=start, end=end):
                exc = UnicodeEncodeError('utf-8', obj, start, end, '')
                self.assertIsInstance(str(exc), str)
            with self.subTest('translate', objlen=objlen, start=start, end=end):
                exc = UnicodeTranslateError(obj, start, end, '')
                self.assertIsInstance(str(exc), str)
            encoded = obj.encode()
            with self.subTest('decode', objlen=objlen, start=start, end=end):
                exc = UnicodeDecodeError('utf-8', encoded, start, end, '')
                self.assertIsInstance(str(exc), str)
    @no_tracing
    def test_badisinstance(self):
        # Bug #2542: if issubclass(e, MyException) raises an exception,
--- a/Misc/NEWS.d/next/Core_and_Builtins/2024-10-03-14-39-41.gh-issue-123378.dCxANf.rst
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-10-03-14-39-41.gh-issue-123378.dCxANf.rst
@ -0,0 +1,3 @@
 Fix a crash in the :meth:`~object.__str__` method of :exc:`UnicodeError`
 objects when the :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
 values are invalid or out-of-range. Patch by Bénédikt Tran.
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@ -2959,46 +2959,55 @@ UnicodeEncodeError_init(PyObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 UnicodeEncodeError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
    PyObject *result = NULL;
    PyObject *reason_str = NULL;
    PyObject *encoding_str = NULL;
-    if (!uself->object)
+    if (exc->object == NULL) {
        /* Not properly initialized. */
        return PyUnicode_FromString("");
    }
    /* Get reason and encoding as strings, which they might not be if
       they've been modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
+    reason_str = PyObject_Str(exc->reason);
-    if (reason_str == NULL)
+    if (reason_str == NULL) {
        goto done;
-    encoding_str = PyObject_Str(uself->encoding);
+    }
-    if (encoding_str == NULL)
+    encoding_str = PyObject_Str(exc->encoding);
    if (encoding_str == NULL) {
        goto done;
    }
-    if (uself->start < PyUnicode_GET_LENGTH(uself->object) && uself->end == uself->start+1) {
+    Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);
-        Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
+    Py_ssize_t start = exc->start, end = exc->end;
    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
        Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
        const char *fmt;
-        if (badchar <= 0xff)
+        if (badchar <= 0xff) {
            fmt = "'%U' codec can't encode character '\\x%02x' in position %zd: %U";
-        else if (badchar <= 0xffff)
+        }
        else if (badchar <= 0xffff) {
            fmt = "'%U' codec can't encode character '\\u%04x' in position %zd: %U";
-        else
+        }
        else {
            fmt = "'%U' codec can't encode character '\\U%08x' in position %zd: %U";
        }
        result = PyUnicode_FromFormat(
            fmt,
            encoding_str,
            (int)badchar,
-            uself->start,
+            start,
            reason_str);
    }
    else {
        result = PyUnicode_FromFormat(
            "'%U' codec can't encode characters in position %zd-%zd: %U",
            encoding_str,
-            uself->start,
+            start,
-            uself->end-1,
+            end - 1,
            reason_str);
    }
 done:
@ -3072,41 +3081,46 @@ error:
 static PyObject *
 UnicodeDecodeError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
    PyObject *result = NULL;
    PyObject *reason_str = NULL;
    PyObject *encoding_str = NULL;
-    if (!uself->object)
+    if (exc->object == NULL) {
        /* Not properly initialized. */
        return PyUnicode_FromString("");
    }
    /* Get reason and encoding as strings, which they might not be if
       they've been modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
+    reason_str = PyObject_Str(exc->reason);
-    if (reason_str == NULL)
+    if (reason_str == NULL) {
        goto done;
-    encoding_str = PyObject_Str(uself->encoding);
+    }
-    if (encoding_str == NULL)
+    encoding_str = PyObject_Str(exc->encoding);
    if (encoding_str == NULL) {
        goto done;
    }
-    if (uself->start < PyBytes_GET_SIZE(uself->object) && uself->end == uself->start+1) {
+    Py_ssize_t len = PyBytes_GET_SIZE(exc->object);
-        int byte = (int)(PyBytes_AS_STRING(((PyUnicodeErrorObject *)self)->object)[uself->start]&0xff);
+    Py_ssize_t start = exc->start, end = exc->end;
    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
        int badbyte = (int)(PyBytes_AS_STRING(exc->object)[start] & 0xff);
        result = PyUnicode_FromFormat(
            "'%U' codec can't decode byte 0x%02x in position %zd: %U",
            encoding_str,
-            byte,
+            badbyte,
-            uself->start,
+            start,
            reason_str);
    }
    else {
        result = PyUnicode_FromFormat(
            "'%U' codec can't decode bytes in position %zd-%zd: %U",
            encoding_str,
-            uself->start,
+            start,
-            uself->end-1,
+            end - 1,
-            reason_str
+            reason_str);
            );
    }
 done:
    Py_XDECREF(reason_str);
@ -3169,42 +3183,49 @@ UnicodeTranslateError_init(PyUnicodeErrorObject *self, PyObject *args,
 static PyObject *
 UnicodeTranslateError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
    PyObject *result = NULL;
    PyObject *reason_str = NULL;
-    if (!uself->object)
+    if (exc->object == NULL) {
        /* Not properly initialized. */
        return PyUnicode_FromString("");
    }
    /* Get reason as a string, which it might not be if it's been
       modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
+    reason_str = PyObject_Str(exc->reason);
-    if (reason_str == NULL)
+    if (reason_str == NULL) {
        goto done;
    }
-    if (uself->start < PyUnicode_GET_LENGTH(uself->object) && uself->end == uself->start+1) {
+    Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);
-        Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
+    Py_ssize_t start = exc->start, end = exc->end;
    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
        Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
        const char *fmt;
-        if (badchar <= 0xff)
+        if (badchar <= 0xff) {
            fmt = "can't translate character '\\x%02x' in position %zd: %U";
-        else if (badchar <= 0xffff)
+        }
        else if (badchar <= 0xffff) {
            fmt = "can't translate character '\\u%04x' in position %zd: %U";
-        else
+        }
        else {
            fmt = "can't translate character '\\U%08x' in position %zd: %U";
        }
        result = PyUnicode_FromFormat(
            fmt,
            (int)badchar,
-            uself->start,
+            start,
-            reason_str
+            reason_str);
-        );
+    }
-    } else {
+    else {
        result = PyUnicode_FromFormat(
            "can't translate characters in position %zd-%zd: %U",
-            uself->start,
+            start,
-            uself->end-1,
+            end - 1,
-            reason_str
+            reason_str);
            );
    }
 done:
    Py_XDECREF(reason_str);