mirror of
https://github.com/python/cpython.git
synced 2025-10-09 16:34:44 +00:00
Optimize the error handlers of the ASCII and Latin1 encoders when the
replacement string is pure ASCII: write it with _PyBytesWriter_WriteBytes() instead of checking each character individually.
Clean up unicode_encode_ucs1():
* Rename repunicode to rep
* Clear the rep object on error
* Factor out the code shared between the bytes and unicode paths
This commit is contained in:
parent
ce179bf6ba
commit
6bd525b656
2 changed files with 48 additions and 44 deletions
|
@ -311,7 +311,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
||||||
#if STRINGLIB_SIZEOF_CHAR > 1
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
||||||
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||||
Py_ssize_t startpos, endpos, newpos;
|
Py_ssize_t startpos, endpos, newpos;
|
||||||
Py_ssize_t repsize, k;
|
Py_ssize_t k;
|
||||||
if (error_handler == _Py_ERROR_UNKNOWN)
|
if (error_handler == _Py_ERROR_UNKNOWN)
|
||||||
error_handler = get_error_handler(errors);
|
error_handler = get_error_handler(errors);
|
||||||
|
|
||||||
|
@ -392,20 +392,12 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
||||||
p = _PyBytesWriter_WriteBytes(&writer, p,
|
p = _PyBytesWriter_WriteBytes(&writer, p,
|
||||||
PyBytes_AS_STRING(rep),
|
PyBytes_AS_STRING(rep),
|
||||||
PyBytes_GET_SIZE(rep));
|
PyBytes_GET_SIZE(rep));
|
||||||
if (p == NULL)
|
|
||||||
goto error;
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* rep is unicode */
|
/* rep is unicode */
|
||||||
if (PyUnicode_READY(rep) < 0)
|
if (PyUnicode_READY(rep) < 0)
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
repsize = PyUnicode_GET_LENGTH(rep);
|
|
||||||
|
|
||||||
p = _PyBytesWriter_Prepare(&writer, p, repsize);
|
|
||||||
if (p == NULL)
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
if (!PyUnicode_IS_ASCII(rep)) {
|
if (!PyUnicode_IS_ASCII(rep)) {
|
||||||
raise_encode_exception(&exc, "utf-8",
|
raise_encode_exception(&exc, "utf-8",
|
||||||
unicode,
|
unicode,
|
||||||
|
@ -415,9 +407,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
||||||
memcpy(p, PyUnicode_DATA(rep), repsize);
|
p = _PyBytesWriter_WriteBytes(&writer, p,
|
||||||
p += repsize;
|
PyUnicode_DATA(rep),
|
||||||
|
PyUnicode_GET_LENGTH(rep));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
Py_CLEAR(rep);
|
Py_CLEAR(rep);
|
||||||
|
|
||||||
i = newpos;
|
i = newpos;
|
||||||
|
|
|
@ -6599,6 +6599,7 @@ unicode_encode_ucs1(PyObject *unicode,
|
||||||
PyObject *error_handler_obj = NULL;
|
PyObject *error_handler_obj = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
|
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
|
||||||
|
PyObject *rep = NULL;
|
||||||
/* output object */
|
/* output object */
|
||||||
_PyBytesWriter writer;
|
_PyBytesWriter writer;
|
||||||
|
|
||||||
|
@ -6627,8 +6628,7 @@ unicode_encode_ucs1(PyObject *unicode,
|
||||||
++pos;
|
++pos;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
PyObject *repunicode;
|
Py_ssize_t newpos, i;
|
||||||
Py_ssize_t repsize, newpos, i;
|
|
||||||
/* startpos for collecting unencodable chars */
|
/* startpos for collecting unencodable chars */
|
||||||
Py_ssize_t collstart = pos;
|
Py_ssize_t collstart = pos;
|
||||||
Py_ssize_t collend = collstart + 1;
|
Py_ssize_t collend = collstart + 1;
|
||||||
|
@ -6694,52 +6694,59 @@ unicode_encode_ucs1(PyObject *unicode,
|
||||||
/* fallback to general error handling */
|
/* fallback to general error handling */
|
||||||
|
|
||||||
default:
|
default:
|
||||||
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
|
rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
|
||||||
encoding, reason, unicode, &exc,
|
encoding, reason, unicode, &exc,
|
||||||
collstart, collend, &newpos);
|
collstart, collend, &newpos);
|
||||||
if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
|
if (rep == NULL)
|
||||||
PyUnicode_READY(repunicode) == -1))
|
|
||||||
goto onError;
|
goto onError;
|
||||||
|
|
||||||
/* subtract preallocated bytes */
|
/* subtract preallocated bytes */
|
||||||
writer.min_size -= 1;
|
writer.min_size -= 1;
|
||||||
|
|
||||||
if (PyBytes_Check(repunicode)) {
|
if (PyBytes_Check(rep)) {
|
||||||
/* Directly copy bytes result to output. */
|
/* Directly copy bytes result to output. */
|
||||||
str = _PyBytesWriter_WriteBytes(&writer, str,
|
str = _PyBytesWriter_WriteBytes(&writer, str,
|
||||||
PyBytes_AS_STRING(repunicode),
|
PyBytes_AS_STRING(rep),
|
||||||
PyBytes_GET_SIZE(repunicode));
|
PyBytes_GET_SIZE(rep));
|
||||||
if (str == NULL)
|
if (str == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
|
|
||||||
pos = newpos;
|
|
||||||
Py_DECREF(repunicode);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
assert(PyUnicode_Check(rep));
|
||||||
|
|
||||||
/* need more space? (at least enough for what we
|
if (PyUnicode_READY(rep) < 0)
|
||||||
have+the replacement+the rest of the string, so
|
goto onError;
|
||||||
we won't have to check space for encodable characters) */
|
|
||||||
repsize = PyUnicode_GET_LENGTH(repunicode);
|
if (PyUnicode_IS_ASCII(rep)) {
|
||||||
|
/* Fast path: all characters are smaller than limit */
|
||||||
|
assert(limit >= 128);
|
||||||
|
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
||||||
|
str = _PyBytesWriter_WriteBytes(&writer, str,
|
||||||
|
PyUnicode_DATA(rep),
|
||||||
|
PyUnicode_GET_LENGTH(rep));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
|
||||||
|
|
||||||
str = _PyBytesWriter_Prepare(&writer, str, repsize);
|
str = _PyBytesWriter_Prepare(&writer, str, repsize);
|
||||||
if (str == NULL)
|
if (str == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
|
|
||||||
/* check if there is anything unencodable in the replacement
|
/* check if there is anything unencodable in the
|
||||||
and copy it to the output */
|
replacement and copy it to the output */
|
||||||
for (i = 0; repsize-->0; ++i, ++str) {
|
for (i = 0; repsize-->0; ++i, ++str) {
|
||||||
ch = PyUnicode_READ_CHAR(repunicode, i);
|
ch = PyUnicode_READ_CHAR(rep, i);
|
||||||
if (ch >= limit) {
|
if (ch >= limit) {
|
||||||
raise_encode_exception(&exc, encoding, unicode,
|
raise_encode_exception(&exc, encoding, unicode,
|
||||||
pos, pos+1, reason);
|
pos, pos+1, reason);
|
||||||
Py_DECREF(repunicode);
|
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
*str = (char)ch;
|
*str = (char)ch;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
pos = newpos;
|
pos = newpos;
|
||||||
Py_DECREF(repunicode);
|
Py_CLEAR(rep);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If overallocation was disabled, ensure that it was the last
|
/* If overallocation was disabled, ensure that it was the last
|
||||||
|
@ -6753,6 +6760,7 @@ unicode_encode_ucs1(PyObject *unicode,
|
||||||
return _PyBytesWriter_Finish(&writer, str);
|
return _PyBytesWriter_Finish(&writer, str);
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
|
Py_XDECREF(rep);
|
||||||
_PyBytesWriter_Dealloc(&writer);
|
_PyBytesWriter_Dealloc(&writer);
|
||||||
Py_XDECREF(error_handler_obj);
|
Py_XDECREF(error_handler_obj);
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue