gh-126004: Fix positions handling in codecs.xmlcharrefreplace_errors (#127675)

This fixes how `PyCodec_XMLCharRefReplaceErrors` handles the `start` and `end`
attributes of `UnicodeError` objects by retrieving them through the
`_PyUnicodeError_GetParams` helper.
This commit is contained in:
Bénédikt Tran 2025-01-23 11:42:38 +01:00 committed by GitHub
parent a10f99375e
commit 70dcc847df
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 111 additions and 94 deletions

View file

@ -843,7 +843,8 @@ class CAPICodecErrors(unittest.TestCase):
def test_codec_xmlcharrefreplace_errors_handler(self): def test_codec_xmlcharrefreplace_errors_handler(self):
handler = _testcapi.codec_xmlcharrefreplace_errors handler = _testcapi.codec_xmlcharrefreplace_errors
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors) self.do_test_codec_errors_handler(handler, self.unicode_encode_errors,
safe=True)
def test_codec_backslashreplace_errors_handler(self): def test_codec_backslashreplace_errors_handler(self):
handler = _testcapi.codec_backslashreplace_errors handler = _testcapi.codec_backslashreplace_errors
@ -853,12 +854,12 @@ class CAPICodecErrors(unittest.TestCase):
handler = _testlimitedcapi.codec_namereplace_errors handler = _testlimitedcapi.codec_namereplace_errors
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors) self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
def do_test_codec_errors_handler(self, handler, exceptions): def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False):
at_least_one = False at_least_one = False
for exc in exceptions: for exc in exceptions:
# See https://github.com/python/cpython/issues/123378 and related # See https://github.com/python/cpython/issues/123378 and related
# discussion and issues for details. # discussion and issues for details.
if self._exception_may_crash(exc): if not safe and self._exception_may_crash(exc):
continue continue
at_least_one = True at_least_one = True

View file

@ -0,0 +1,3 @@
Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
values in the :func:`codecs.xmlcharrefreplace_errors` error handler.
Patch by Bénédikt Tran.

View file

@ -755,56 +755,70 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{ {
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
PyObject *restuple; wrong_exception_type(exc);
PyObject *object;
Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
Py_UCS1 *outp;
Py_ssize_t ressize;
Py_UCS4 ch;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL; return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end)) }
PyObject *obj;
Py_ssize_t objlen, start, end, slen;
if (_PyUnicodeError_GetParams(exc,
&obj, &objlen,
&start, &end, &slen, false) < 0)
{
return NULL; return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc))) }
return NULL;
if (end - start > PY_SSIZE_T_MAX / (2+7+1)) // The number of characters that each character 'ch' contributes
// in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
// and will be formatted as "&#" + DIGITS + ";". Since the Unicode
// range is below 10^7, each "block" requires at most 2 + 7 + 1
// characters.
if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
for (i = start, ressize = 0; i < end; ++i) { end = Py_MIN(end, objlen);
slen = Py_MAX(0, end - start);
}
Py_ssize_t ressize = 0;
for (Py_ssize_t i = start; i < end; ++i) {
/* object is guaranteed to be "ready" */ /* object is guaranteed to be "ready" */
ch = PyUnicode_READ_CHAR(object, i); Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
if (ch<10) if (ch < 10) {
ressize += 2 + 1 + 1; ressize += 2 + 1 + 1;
else if (ch<100) }
else if (ch < 100) {
ressize += 2 + 2 + 1; ressize += 2 + 2 + 1;
else if (ch<1000) }
else if (ch < 1000) {
ressize += 2 + 3 + 1; ressize += 2 + 3 + 1;
else if (ch<10000) }
else if (ch < 10000) {
ressize += 2 + 4 + 1; ressize += 2 + 4 + 1;
else if (ch<100000) }
else if (ch < 100000) {
ressize += 2 + 5 + 1; ressize += 2 + 5 + 1;
else if (ch<1000000) }
else if (ch < 1000000) {
ressize += 2 + 6 + 1; ressize += 2 + 6 + 1;
else }
else {
assert(ch < 10000000);
ressize += 2 + 7 + 1; ressize += 2 + 7 + 1;
} }
}
/* allocate replacement */ /* allocate replacement */
res = PyUnicode_New(ressize, 127); PyObject *res = PyUnicode_New(ressize, 127);
if (res == NULL) { if (res == NULL) {
Py_DECREF(object); Py_DECREF(obj);
return NULL; return NULL;
} }
outp = PyUnicode_1BYTE_DATA(res); Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
/* generate replacement */ /* generate replacement */
for (i = start; i < end; ++i) { for (Py_ssize_t i = start; i < end; ++i) {
int digits; int digits, base;
int base; Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
ch = PyUnicode_READ_CHAR(object, i);
*outp++ = '&';
*outp++ = '#';
if (ch < 10) { if (ch < 10) {
digits = 1; digits = 1;
base = 1; base = 1;
@ -830,10 +844,14 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
base = 100000; base = 100000;
} }
else { else {
assert(ch < 10000000);
digits = 7; digits = 7;
base = 1000000; base = 1000000;
} }
*outp++ = '&';
*outp++ = '#';
while (digits-- > 0) { while (digits-- > 0) {
assert(base >= 1);
*outp++ = '0' + ch / base; *outp++ = '0' + ch / base;
ch %= base; ch %= base;
base /= 10; base /= 10;
@ -841,15 +859,10 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
*outp++ = ';'; *outp++ = ';';
} }
assert(_PyUnicode_CheckConsistency(res, 1)); assert(_PyUnicode_CheckConsistency(res, 1));
restuple = Py_BuildValue("(Nn)", res, end); PyObject *restuple = Py_BuildValue("(Nn)", res, end);
Py_DECREF(object); Py_DECREF(obj);
return restuple; return restuple;
} }
else {
wrong_exception_type(exc);
return NULL;
}
}
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{ {