gh-126004: Fix positions handling in codecs.xmlcharrefreplace_errors (#127675)

This fixes how `PyCodec_XMLCharRefReplaceErrors` handles the `start` and `end` attributes of `UnicodeError` objects: they are now retrieved through the `_PyUnicodeError_GetParams` helper.
2025-07-07 19:35:27 +00:00 · 2025-01-23 11:42:38 +01:00 · 2025-01-23 11:42:38 +01:00 · 70dcc847df
commit 70dcc847df
parent a10f99375e
3 changed files with 111 additions and 94 deletions
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -755,100 +755,113 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)

 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
 {
-    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
-        PyObject *restuple;
-        PyObject *object;
-        Py_ssize_t i;
-        Py_ssize_t start;
-        Py_ssize_t end;
-        PyObject *res;
-        Py_UCS1 *outp;
-        Py_ssize_t ressize;
-        Py_UCS4 ch;
-        if (PyUnicodeEncodeError_GetStart(exc, &start))
-            return NULL;
-        if (PyUnicodeEncodeError_GetEnd(exc, &end))
-            return NULL;
-        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
-            return NULL;
-        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
-            end = start + PY_SSIZE_T_MAX / (2+7+1);
-        for (i = start, ressize = 0; i < end; ++i) {
-            /* object is guaranteed to be "ready" */
-            ch = PyUnicode_READ_CHAR(object, i);
-            if (ch<10)
-                ressize += 2+1+1;
-            else if (ch<100)
-                ressize += 2+2+1;
-            else if (ch<1000)
-                ressize += 2+3+1;
-            else if (ch<10000)
-                ressize += 2+4+1;
-            else if (ch<100000)
-                ressize += 2+5+1;
-            else if (ch<1000000)
-                ressize += 2+6+1;
-            else
-                ressize += 2+7+1;
-        }
-        /* allocate replacement */
-        res = PyUnicode_New(ressize, 127);
-        if (res == NULL) {
-            Py_DECREF(object);
-            return NULL;
-        }
-        outp = PyUnicode_1BYTE_DATA(res);
-        /* generate replacement */
-        for (i = start; i < end; ++i) {
-            int digits;
-            int base;
-            ch = PyUnicode_READ_CHAR(object, i);
-            *outp++ = '&';
-            *outp++ = '#';
-            if (ch<10) {
-                digits = 1;
-                base = 1;
-            }
-            else if (ch<100) {
-                digits = 2;
-                base = 10;
-            }
-            else if (ch<1000) {
-                digits = 3;
-                base = 100;
-            }
-            else if (ch<10000) {
-                digits = 4;
-                base = 1000;
-            }
-            else if (ch<100000) {
-                digits = 5;
-                base = 10000;
-            }
-            else if (ch<1000000) {
-                digits = 6;
-                base = 100000;
-            }
-            else {
-                digits = 7;
-                base = 1000000;
-            }
-            while (digits-->0) {
-                *outp++ = '0' + ch/base;
-                ch %= base;
-                base /= 10;
-            }
-            *outp++ = ';';
-        }
-        assert(_PyUnicode_CheckConsistency(res, 1));
-        restuple = Py_BuildValue("(Nn)", res, end);
-        Py_DECREF(object);
-        return restuple;
-    }
-    else {
+    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        wrong_exception_type(exc);
        return NULL;
    }
+
+    PyObject *obj;
+    Py_ssize_t objlen, start, end, slen;
+    if (_PyUnicodeError_GetParams(exc,
+                                  &obj, &objlen,
+                                  &start, &end, &slen, false) < 0)
+    {
+        return NULL;
+    }
+
+    // The number of characters that each character 'ch' contributes
+    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
+    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
+    // range is below 10^7, each "block" requires at most 2 + 7 + 1
+    // characters.
+    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
+        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
+        end = Py_MIN(end, objlen);
+        slen = Py_MAX(0, end - start);
+    }
+
+    Py_ssize_t ressize = 0;
+    for (Py_ssize_t i = start; i < end; ++i) {
+        /* object is guaranteed to be "ready" */
+        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
+        if (ch < 10) {
+            ressize += 2 + 1 + 1;
+        }
+        else if (ch < 100) {
+            ressize += 2 + 2 + 1;
+        }
+        else if (ch < 1000) {
+            ressize += 2 + 3 + 1;
+        }
+        else if (ch < 10000) {
+            ressize += 2 + 4 + 1;
+        }
+        else if (ch < 100000) {
+            ressize += 2 + 5 + 1;
+        }
+        else if (ch < 1000000) {
+            ressize += 2 + 6 + 1;
+        }
+        else {
+            assert(ch < 10000000);
+            ressize += 2 + 7 + 1;
+        }
+    }
+
+    /* allocate replacement */
+    PyObject *res = PyUnicode_New(ressize, 127);
+    if (res == NULL) {
+        Py_DECREF(obj);
+        return NULL;
+    }
+    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
+    /* generate replacement */
+    for (Py_ssize_t i = start; i < end; ++i) {
+        int digits, base;
+        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
+        if (ch < 10) {
+            digits = 1;
+            base = 1;
+        }
+        else if (ch < 100) {
+            digits = 2;
+            base = 10;
+        }
+        else if (ch < 1000) {
+            digits = 3;
+            base = 100;
+        }
+        else if (ch < 10000) {
+            digits = 4;
+            base = 1000;
+        }
+        else if (ch < 100000) {
+            digits = 5;
+            base = 10000;
+        }
+        else if (ch < 1000000) {
+            digits = 6;
+            base = 100000;
+        }
+        else {
+            assert(ch < 10000000);
+            digits = 7;
+            base = 1000000;
+        }
+        *outp++ = '&';
+        *outp++ = '#';
+        while (digits-- > 0) {
+            assert(base >= 1);
+            *outp++ = '0' + ch / base;
+            ch %= base;
+            base /= 10;
+        }
+        *outp++ = ';';
+    }
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
+    Py_DECREF(obj);
+    return restuple;
 }

 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)