gh-129173: simplify PyCodec_XMLCharRefReplaceErrors logic (#129894)

Writing the decimal representation of a Unicode codepoint only requires knowing the number of digits.

---------

Co-authored-by: Petr Viktorin <encukou@gmail.com>
This commit is contained in:
Bénédikt Tran 2025-03-03 12:43:22 +01:00 committed by GitHub
parent efbc5929ca
commit f693f84227
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -730,6 +730,25 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
} }
/*
 * Return how many decimal digits are needed to write the Unicode
 * codepoint 'ch'.
 *
 * By design, Unicode codepoints are limited to 1114111, so the result
 * is always between 1 and 7; anything larger is treated as unreachable.
 */
static inline int
n_decimal_digits_for_codepoint(Py_UCS4 ch)
{
    int ndigits = 1;
    /* 'bound' tracks 10**ndigits: the smallest value needing one more digit. */
    for (Py_UCS4 bound = 10; ndigits <= 7; bound *= 10, ++ndigits) {
        if (ch < bound) {
            return ndigits;
        }
    }
    // Unicode codepoints are limited to 1114111 (7 decimal digits)
    Py_UNREACHABLE();
}
/* /*
* Create a Unicode string containing 'count' copies of the official * Create a Unicode string containing 'count' copies of the official
* Unicode REPLACEMENT CHARACTER (0xFFFD). * Unicode REPLACEMENT CHARACTER (0xFFFD).
@ -867,9 +886,12 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
} }
} }
// --- handler: 'xmlcharrefreplace' -------------------------------------------
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{ {
if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { if (!_PyIsUnicodeEncodeError(exc)) {
wrong_exception_type(exc); wrong_exception_type(exc);
return NULL; return NULL;
} }
@ -896,30 +918,11 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
Py_ssize_t ressize = 0; Py_ssize_t ressize = 0;
for (Py_ssize_t i = start; i < end; ++i) { for (Py_ssize_t i = start; i < end; ++i) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
if (ch < 10) { int k = n_decimal_digits_for_codepoint(ch);
ressize += 2 + 1 + 1; assert(k != 0);
} assert(k <= 7);
else if (ch < 100) { ressize += 2 + k + 1;
ressize += 2 + 2 + 1;
}
else if (ch < 1000) {
ressize += 2 + 3 + 1;
}
else if (ch < 10000) {
ressize += 2 + 4 + 1;
}
else if (ch < 100000) {
ressize += 2 + 5 + 1;
}
else if (ch < 1000000) {
ressize += 2 + 6 + 1;
}
else {
assert(ch < 10000000);
ressize += 2 + 7 + 1;
}
} }
/* allocate replacement */ /* allocate replacement */
@ -931,45 +934,20 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
/* generate replacement */ /* generate replacement */
for (Py_ssize_t i = start; i < end; ++i) { for (Py_ssize_t i = start; i < end; ++i) {
int digits, base;
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
if (ch < 10) { /*
digits = 1; * Write the decimal representation of 'ch' to the buffer pointed by 'p'
base = 1; * using at most 7 characters prefixed by '&#' and suffixed by ';'.
} */
else if (ch < 100) {
digits = 2;
base = 10;
}
else if (ch < 1000) {
digits = 3;
base = 100;
}
else if (ch < 10000) {
digits = 4;
base = 1000;
}
else if (ch < 100000) {
digits = 5;
base = 10000;
}
else if (ch < 1000000) {
digits = 6;
base = 100000;
}
else {
assert(ch < 10000000);
digits = 7;
base = 1000000;
}
*outp++ = '&'; *outp++ = '&';
*outp++ = '#'; *outp++ = '#';
while (digits-- > 0) { Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
assert(base >= 1); for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
*outp++ = '0' + ch / base; *p_digit = '0' + (ch % 10);
ch %= base; ch /= 10;
base /= 10;
} }
assert(ch == 0);
outp = digit_end;
*outp++ = ';'; *outp++ = ';';
} }
assert(_PyUnicode_CheckConsistency(res, 1)); assert(_PyUnicode_CheckConsistency(res, 1));
@ -1517,7 +1495,8 @@ replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
} }
static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) static inline PyObject *
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{ {
return PyCodec_XMLCharRefReplaceErrors(exc); return PyCodec_XMLCharRefReplaceErrors(exc);
} }