gh-126004: Fix positions handling in codecs.xmlcharrefreplace_errors (#127675)

This fixes how `PyCodec_XMLCharRefReplaceErrors` handles the `start` and `end` attributes of `UnicodeError` objects via the `_PyUnicodeError_GetParams` helper.
2025-09-26 18:29:57 +00:00 · 2025-01-23 11:42:38 +01:00 · 2025-01-23 11:42:38 +01:00 · 70dcc847df
commit 70dcc847df
parent a10f99375e
3 changed files with 111 additions and 94 deletions
--- a/Lib/test/test_capi/test_codecs.py
+++ b/Lib/test/test_capi/test_codecs.py
@@ -843,7 +843,8 @@ class CAPICodecErrors(unittest.TestCase):
    def test_codec_xmlcharrefreplace_errors_handler(self):
        handler = _testcapi.codec_xmlcharrefreplace_errors
-        self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
+        self.do_test_codec_errors_handler(handler, self.unicode_encode_errors,
+                                          safe=True)
    def test_codec_backslashreplace_errors_handler(self):
        handler = _testcapi.codec_backslashreplace_errors
@@ -853,12 +854,12 @@ class CAPICodecErrors(unittest.TestCase):
        handler = _testlimitedcapi.codec_namereplace_errors
        self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
-    def do_test_codec_errors_handler(self, handler, exceptions):
+    def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False):
        at_least_one = False
        for exc in exceptions:
            # See https://github.com/python/cpython/issues/123378 and related
            # discussion and issues for details.
-            if self._exception_may_crash(exc):
+            if not safe and self._exception_may_crash(exc):
                continue
            at_least_one = True
--- a/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst
@@ -0,0 +1,3 @@
+Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
+values in the :func:`codecs.xmlcharrefreplace_errors` error handler.
+Patch by Bénédikt Tran.
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -755,56 +755,70 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
 {
-    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
+    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
-        PyObject *restuple;
+        wrong_exception_type(exc);
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        Py_UCS1 *outp;
        Py_ssize_t ressize;
        Py_UCS4 ch;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
        return NULL;
-        if (PyUnicodeEncodeError_GetEnd(exc, &end))
+    }
    PyObject *obj;
    Py_ssize_t objlen, start, end, slen;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, &objlen,
                                  &start, &end, &slen, false) < 0)
    {
        return NULL;
-        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+    }
-            return NULL;
+
-        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
+    // The number of characters that each character 'ch' contributes
    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
    // range is below 10^7, each "block" requires at most 2 + 7 + 1
    // characters.
    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
-        for (i = start, ressize = 0; i < end; ++i) {
+        end = Py_MIN(end, objlen);
        slen = Py_MAX(0, end - start);
    }
    Py_ssize_t ressize = 0;
    for (Py_ssize_t i = start; i < end; ++i) {
        /* object is guaranteed to be "ready" */
-            ch = PyUnicode_READ_CHAR(object, i);
+        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
-            if (ch<10)
+        if (ch < 10) {
            ressize += 2 + 1 + 1;
-            else if (ch<100)
+        }
        else if (ch < 100) {
            ressize += 2 + 2 + 1;
-            else if (ch<1000)
+        }
        else if (ch < 1000) {
            ressize += 2 + 3 + 1;
-            else if (ch<10000)
+        }
        else if (ch < 10000) {
            ressize += 2 + 4 + 1;
-            else if (ch<100000)
+        }
        else if (ch < 100000) {
            ressize += 2 + 5 + 1;
-            else if (ch<1000000)
+        }
        else if (ch < 1000000) {
            ressize += 2 + 6 + 1;
-            else
+        }
        else {
            assert(ch < 10000000);
            ressize += 2 + 7 + 1;
        }
    }
    /* allocate replacement */
-        res = PyUnicode_New(ressize, 127);
+    PyObject *res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
-            Py_DECREF(object);
+        Py_DECREF(obj);
        return NULL;
    }
-        outp = PyUnicode_1BYTE_DATA(res);
+    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
    /* generate replacement */
-        for (i = start; i < end; ++i) {
+    for (Py_ssize_t i = start; i < end; ++i) {
-            int digits;
+        int digits, base;
-            int base;
+        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
            ch = PyUnicode_READ_CHAR(object, i);
            *outp++ = '&';
            *outp++ = '#';
        if (ch < 10) {
            digits = 1;
            base = 1;
@@ -830,10 +844,14 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
            base = 100000;
        }
        else {
            assert(ch < 10000000);
            digits = 7;
            base = 1000000;
        }
        *outp++ = '&';
        *outp++ = '#';
        while (digits-- > 0) {
            assert(base >= 1);
            *outp++ = '0' + ch / base;
            ch %= base;
            base /= 10;
@@ -841,15 +859,10 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
        *outp++ = ';';
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
-        restuple = Py_BuildValue("(Nn)", res, end);
+    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
-        Py_DECREF(object);
+    Py_DECREF(obj);
    return restuple;
 }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
 }
 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
 {