Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
handlers: ``ignore``, ``replace`` and ``surrogateescape``.
2025-09-26 18:29:57 +00:00 · 2015-10-05 13:43:50 +02:00 · 2015-10-05 13:43:50 +02:00 · 1d65d9192d
commit 1d65d9192d
parent 7dbe6dd963
4 changed files with 57 additions and 9 deletions
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -123,6 +123,9 @@ Optimizations
 * The UTF-8 encoder is now up to 75 times as fast for error handlers:
  ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
 * The UTF-8 decoder is now up to 15 times as fast for error handlers:
  ``ignore``, ``replace`` and ``surrogateescape``.
 Build and C API Changes
 =======================
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -788,6 +788,18 @@ class UTF8Test(ReadTest, unittest.TestCase):
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))
    def test_decode_error(self):
        """Check decoding of undecodable bytes under several error handlers.

        Every case decodes the same input with ``self.encoding`` and a
        different error handler name, then compares the result with the
        expected string for that handler.
        """
        # Each row is (input bytes, error handler name, expected str).
        # The input wraps the two bytes 0x80 and 0xFF in ASCII brackets so
        # the treatment of each offending byte is visible in the result.
        # NOTE(review): presumably self.encoding is "utf-8" here (the class
        # is UTF8Test) -- confirm against the class definition.
        for data, error_handler, expected in (
            # 'ignore': both bytes are dropped, only the brackets remain.
            (b'[\x80\xff]', 'ignore', '[]'),
            # 'replace': each byte becomes U+FFFD.
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            # 'surrogateescape': each byte 0xNN becomes U+DCNN.
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            # 'backslashreplace': each byte becomes a literal \xNN escape.
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            # subTest reports each handler separately, so one failing case
            # does not hide the results of the others.
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)
    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
 Core and Builtins
 -----------------
 - Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error
  handlers: ``ignore``, ``replace`` and ``surrogateescape``.
 - Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
 - Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4714,8 +4714,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    if (size == 0) {
        if (consumed)
@@ -4740,6 +4741,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4778,24 +4780,52 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
            continue;
        }
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);
        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            s += (endinpos - startinpos);
            break;
        case _Py_ERROR_REPLACE:
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;
        case _Py_ERROR_SURROGATEESCAPE:
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (Py_ssize_t i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        default:
            if (unicode_decode_call_errorhandler_writer(
-                errors, &errorHandler,
+                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }
 End:
    if (consumed)
        *consumed = s - starts;
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);
 onError:
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;