mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
handlers: ``ignore``, ``replace`` and ``surrogateescape``.
This commit is contained in:
parent
7dbe6dd963
commit
1d65d9192d
4 changed files with 57 additions and 9 deletions
|
@ -123,6 +123,9 @@ Optimizations
|
||||||
* The UTF-8 encoder is now up to 75 times as fast for error handlers:
|
* The UTF-8 encoder is now up to 75 times as fast for error handlers:
|
||||||
``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
|
``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
|
||||||
|
|
||||||
|
* The UTF-8 decoder is now up to 15 times as fast for error handlers:
|
||||||
|
``ignore``, ``replace`` and ``surrogateescape``.
|
||||||
|
|
||||||
|
|
||||||
Build and C API Changes
|
Build and C API Changes
|
||||||
=======================
|
=======================
|
||||||
|
|
|
@ -788,6 +788,18 @@ class UTF8Test(ReadTest, unittest.TestCase):
|
||||||
self.check_state_handling_decode(self.encoding,
|
self.check_state_handling_decode(self.encoding,
|
||||||
u, u.encode(self.encoding))
|
u, u.encode(self.encoding))
|
||||||
|
|
||||||
|
def test_decode_error(self):
|
||||||
|
for data, error_handler, expected in (
|
||||||
|
(b'[\x80\xff]', 'ignore', '[]'),
|
||||||
|
(b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
|
||||||
|
(b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
|
||||||
|
(b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
|
||||||
|
):
|
||||||
|
with self.subTest(data=data, error_handler=error_handler,
|
||||||
|
expected=expected):
|
||||||
|
self.assertEqual(data.decode(self.encoding, error_handler),
|
||||||
|
expected)
|
||||||
|
|
||||||
def test_lone_surrogates(self):
|
def test_lone_surrogates(self):
|
||||||
super().test_lone_surrogates()
|
super().test_lone_surrogates()
|
||||||
# not sure if this is making sense for
|
# not sure if this is making sense for
|
||||||
|
|
|
@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
|
||||||
|
handlers: ``ignore``, ``replace`` and ``surrogateescape``.
|
||||||
|
|
||||||
- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
|
- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
|
||||||
|
|
||||||
- Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
|
- Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
|
||||||
|
|
|
@ -4714,8 +4714,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
Py_ssize_t startinpos;
|
Py_ssize_t startinpos;
|
||||||
Py_ssize_t endinpos;
|
Py_ssize_t endinpos;
|
||||||
const char *errmsg = "";
|
const char *errmsg = "";
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *error_handler_obj = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
|
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
|
||||||
|
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
if (consumed)
|
if (consumed)
|
||||||
|
@ -4740,6 +4741,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
while (s < end) {
|
while (s < end) {
|
||||||
Py_UCS4 ch;
|
Py_UCS4 ch;
|
||||||
int kind = writer.kind;
|
int kind = writer.kind;
|
||||||
|
|
||||||
if (kind == PyUnicode_1BYTE_KIND) {
|
if (kind == PyUnicode_1BYTE_KIND) {
|
||||||
if (PyUnicode_IS_ASCII(writer.buffer))
|
if (PyUnicode_IS_ASCII(writer.buffer))
|
||||||
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
|
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
|
||||||
|
@ -4778,24 +4780,52 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (error_handler == _Py_ERROR_UNKNOWN)
|
||||||
|
error_handler = get_error_handler(errors);
|
||||||
|
|
||||||
|
switch (error_handler) {
|
||||||
|
case _Py_ERROR_IGNORE:
|
||||||
|
s += (endinpos - startinpos);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case _Py_ERROR_REPLACE:
|
||||||
|
if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
|
||||||
|
goto onError;
|
||||||
|
s += (endinpos - startinpos);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case _Py_ERROR_SURROGATEESCAPE:
|
||||||
|
if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
|
||||||
|
goto onError;
|
||||||
|
for (Py_ssize_t i=startinpos; i<endinpos; i++) {
|
||||||
|
ch = (Py_UCS4)(unsigned char)(starts[i]);
|
||||||
|
PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
|
||||||
|
ch + 0xdc00);
|
||||||
|
writer.pos++;
|
||||||
|
}
|
||||||
|
s += (endinpos - startinpos);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
if (unicode_decode_call_errorhandler_writer(
|
if (unicode_decode_call_errorhandler_writer(
|
||||||
errors, &errorHandler,
|
errors, &error_handler_obj,
|
||||||
"utf-8", errmsg,
|
"utf-8", errmsg,
|
||||||
&starts, &end, &startinpos, &endinpos, &exc, &s,
|
&starts, &end, &startinpos, &endinpos, &exc, &s,
|
||||||
&writer))
|
&writer))
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
End:
|
End:
|
||||||
if (consumed)
|
if (consumed)
|
||||||
*consumed = s - starts;
|
*consumed = s - starts;
|
||||||
|
|
||||||
Py_XDECREF(errorHandler);
|
Py_XDECREF(error_handler_obj);
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
return _PyUnicodeWriter_Finish(&writer);
|
return _PyUnicodeWriter_Finish(&writer);
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
Py_XDECREF(errorHandler);
|
Py_XDECREF(error_handler_obj);
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
_PyUnicodeWriter_Dealloc(&writer);
|
_PyUnicodeWriter_Dealloc(&writer);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue