mirror of
https://github.com/python/cpython.git
synced 2025-08-22 17:55:18 +00:00
gh-129173: simplify PyCodec_XMLCharRefReplaceErrors
logic (#129894)
Writing the decimal representation of a Unicode codepoint only requires knowing the number of digits. --------- Co-authored-by: Petr Viktorin <encukou@gmail.com>
This commit is contained in:
parent
efbc5929ca
commit
f693f84227
1 changed file with 39 additions and 60 deletions
|
@ -730,6 +730,25 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Determine the number of digits for a decimal representation of Unicode
|
||||||
|
* codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
|
||||||
|
*/
|
||||||
|
static inline int
|
||||||
|
n_decimal_digits_for_codepoint(Py_UCS4 ch)
|
||||||
|
{
|
||||||
|
if (ch < 10) return 1;
|
||||||
|
if (ch < 100) return 2;
|
||||||
|
if (ch < 1000) return 3;
|
||||||
|
if (ch < 10000) return 4;
|
||||||
|
if (ch < 100000) return 5;
|
||||||
|
if (ch < 1000000) return 6;
|
||||||
|
if (ch < 10000000) return 7;
|
||||||
|
// Unicode codepoints are limited to 1114111 (7 decimal digits)
|
||||||
|
Py_UNREACHABLE();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create a Unicode string containing 'count' copies of the official
|
* Create a Unicode string containing 'count' copies of the official
|
||||||
* Unicode REPLACEMENT CHARACTER (0xFFFD).
|
* Unicode REPLACEMENT CHARACTER (0xFFFD).
|
||||||
|
@ -867,9 +886,12 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// --- handler: 'xmlcharrefreplace' -------------------------------------------
|
||||||
|
|
||||||
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
|
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
|
||||||
{
|
{
|
||||||
if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
|
if (!_PyIsUnicodeEncodeError(exc)) {
|
||||||
wrong_exception_type(exc);
|
wrong_exception_type(exc);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -896,30 +918,11 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
|
||||||
|
|
||||||
Py_ssize_t ressize = 0;
|
Py_ssize_t ressize = 0;
|
||||||
for (Py_ssize_t i = start; i < end; ++i) {
|
for (Py_ssize_t i = start; i < end; ++i) {
|
||||||
/* object is guaranteed to be "ready" */
|
|
||||||
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
|
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
|
||||||
if (ch < 10) {
|
int k = n_decimal_digits_for_codepoint(ch);
|
||||||
ressize += 2 + 1 + 1;
|
assert(k != 0);
|
||||||
}
|
assert(k <= 7);
|
||||||
else if (ch < 100) {
|
ressize += 2 + k + 1;
|
||||||
ressize += 2 + 2 + 1;
|
|
||||||
}
|
|
||||||
else if (ch < 1000) {
|
|
||||||
ressize += 2 + 3 + 1;
|
|
||||||
}
|
|
||||||
else if (ch < 10000) {
|
|
||||||
ressize += 2 + 4 + 1;
|
|
||||||
}
|
|
||||||
else if (ch < 100000) {
|
|
||||||
ressize += 2 + 5 + 1;
|
|
||||||
}
|
|
||||||
else if (ch < 1000000) {
|
|
||||||
ressize += 2 + 6 + 1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
assert(ch < 10000000);
|
|
||||||
ressize += 2 + 7 + 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* allocate replacement */
|
/* allocate replacement */
|
||||||
|
@ -931,45 +934,20 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
|
||||||
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
|
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
|
||||||
/* generate replacement */
|
/* generate replacement */
|
||||||
for (Py_ssize_t i = start; i < end; ++i) {
|
for (Py_ssize_t i = start; i < end; ++i) {
|
||||||
int digits, base;
|
|
||||||
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
|
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
|
||||||
if (ch < 10) {
|
/*
|
||||||
digits = 1;
|
* Write the decimal representation of 'ch' to the buffer pointed by 'p'
|
||||||
base = 1;
|
* using at most 7 characters prefixed by '&#' and suffixed by ';'.
|
||||||
}
|
*/
|
||||||
else if (ch < 100) {
|
|
||||||
digits = 2;
|
|
||||||
base = 10;
|
|
||||||
}
|
|
||||||
else if (ch < 1000) {
|
|
||||||
digits = 3;
|
|
||||||
base = 100;
|
|
||||||
}
|
|
||||||
else if (ch < 10000) {
|
|
||||||
digits = 4;
|
|
||||||
base = 1000;
|
|
||||||
}
|
|
||||||
else if (ch < 100000) {
|
|
||||||
digits = 5;
|
|
||||||
base = 10000;
|
|
||||||
}
|
|
||||||
else if (ch < 1000000) {
|
|
||||||
digits = 6;
|
|
||||||
base = 100000;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
assert(ch < 10000000);
|
|
||||||
digits = 7;
|
|
||||||
base = 1000000;
|
|
||||||
}
|
|
||||||
*outp++ = '&';
|
*outp++ = '&';
|
||||||
*outp++ = '#';
|
*outp++ = '#';
|
||||||
while (digits-- > 0) {
|
Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
|
||||||
assert(base >= 1);
|
for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
|
||||||
*outp++ = '0' + ch / base;
|
*p_digit = '0' + (ch % 10);
|
||||||
ch %= base;
|
ch /= 10;
|
||||||
base /= 10;
|
|
||||||
}
|
}
|
||||||
|
assert(ch == 0);
|
||||||
|
outp = digit_end;
|
||||||
*outp++ = ';';
|
*outp++ = ';';
|
||||||
}
|
}
|
||||||
assert(_PyUnicode_CheckConsistency(res, 1));
|
assert(_PyUnicode_CheckConsistency(res, 1));
|
||||||
|
@ -1517,7 +1495,8 @@ replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
|
static inline PyObject *
|
||||||
|
xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
|
||||||
{
|
{
|
||||||
return PyCodec_XMLCharRefReplaceErrors(exc);
|
return PyCodec_XMLCharRefReplaceErrors(exc);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue