Move the slowest UTF-8 decoder to its own subfunction

* Create decode_utf8_errors()
* Reuse unicode_fromascii()
* decode_utf8_errors() doesn't refit at the beginning
* Remove refit_partial_string(), use unicode_adjust_maxchar() instead
2025-10-20 21:52:10 +00:00 · 2011-12-11 20:09:03 +01:00 · 2011-12-11 20:09:03 +01:00 · 785938eebd
commit 785938eebd
parent 84def3774d
1 changed files with 98 additions and 128 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1784,7 +1784,7 @@ _PyUnicode_ClearStaticStrings()
 static PyObject*
 unicode_fromascii(const unsigned char* s, Py_ssize_t size)
 {
-    PyObject *res;
+    PyObject *unicode;
 #ifdef Py_DEBUG
    const unsigned char *p;
    const unsigned char *end = s + size;
@ -1794,11 +1794,12 @@ unicode_fromascii(const unsigned char* s, Py_ssize_t size)
 #endif
    if (size == 1)
        return get_latin1_char(s[0]);
-    res = PyUnicode_New(size, 127);
+    unicode = PyUnicode_New(size, 127);
-    if (!res)
+    if (!unicode)
        return NULL;
-    memcpy(PyUnicode_1BYTE_DATA(res), s, size);
+    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
-    return res;
+    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
 }
 static Py_UCS4
@ -4320,126 +4321,38 @@ _ucs4loop:
    return 65537;
 }
 /* Rebuild *unicode after hitting an error that the initial scan did not
    catch (for example an encoded surrogate character).  The maxchar that
    was estimated up front may be wrong, so it is recomputed here from the
    first n characters already stored in data, and those characters are
    copied into a freshly allocated string of the same length.

    Returns 0 after replacing *unicode with the new string, or -1 if the
    new string could not be allocated (*unicode is then left as it was). */
 static int
 refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
 {
    Py_UCS4 widest = 0;
    Py_ssize_t pos;
    PyObject *refit;
    /* Find the largest code point among the characters decoded so far. */
    for (pos = 0; pos < n; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (ch > widest)
            widest = ch;
    }
    refit = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), widest);
    if (refit == NULL)
        return -1;
    PyUnicode_CopyCharacters(refit, 0, *unicode, 0, n);
    /* Drop the old string and hand the refitted one back to the caller. */
    Py_DECREF(*unicode);
    *unicode = refit;
    return 0;
 }
 /* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
-   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
+   in case of errors. Implicit parameters: unicode, kind, data, onError.
-   onError. Potential resizing overallocates, so the result needs to shrink
+   Potential resizing overallocates, so the result needs to shrink at the end.
   at the end.
 */
-#define WRITE_MAYBE_FAIL(index, value)                                  \
+#define WRITE_MAYBE_FAIL(index, value)                              \
-    do {                                                                \
+    do {                                                            \
-        if (has_errors) {                                               \
+        Py_ssize_t pos = index;                                     \
-            Py_ssize_t pos = index;                                     \
+        if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
-            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
+            unicode_resize(&unicode, pos + pos/8) < 0)              \
-                unicode_resize(&unicode, pos + pos/8) < 0)              \
+            goto onError;                                           \
-                goto onError;                                           \
+        if (unicode_putchar(&unicode, &pos, value) < 0)             \
-            if (unicode_putchar(&unicode, &pos, value) < 0)             \
+            goto onError;                                           \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, index, value);                  \
    } while (0)
 PyObject *
-PyUnicode_DecodeUTF8Stateful(const char *s,
+decode_utf8_errors(const char *starts,
-                             Py_ssize_t size,
+                   Py_ssize_t size,
-                             const char *errors,
+                   const char *errors,
-                             Py_ssize_t *consumed)
+                   Py_ssize_t *consumed,
                   const char *s,
                   PyObject *unicode,
                   Py_ssize_t i)
 {
    const char *starts = s;
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
-    const char *e, *aligned_end;
+    const char *e = starts + size;
-    PyObject *unicode;
+    const char *aligned_end;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UCS4 maxchar = 0;
    Py_ssize_t unicode_size;
    Py_ssize_t i;
    int kind;
    void *data;
    int has_errors = 0;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)PyUnicode_New(0, 0);
    }
    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
    /* When the string is ASCII only, just use memcpy and return.
       unicode_size may be != size if there is an incomplete UTF-8
       sequence at the end of the ASCII block.  */
    if (maxchar < 128 && size == unicode_size) {
        if (consumed)
            *consumed = size;
        if (size == 1)
            return get_latin1_char((unsigned char)s[0]);
        unicode = PyUnicode_New(unicode_size, maxchar);
        if (!unicode)
            return NULL;
        Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
        assert(_PyUnicode_CheckConsistency(unicode, 1));
        return unicode;
    }
    /* In case of errors, maxchar and size computation might be incorrect;
       code below refits and resizes as necessary. */
    unicode = PyUnicode_New(unicode_size, maxchar);
    if (!unicode)
        return NULL;
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* Unpack UTF-8 encoded data */
    i = 0;
    e = s + size;
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
        break;
    case PyUnicode_2BYTE_KIND:
        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
        break;
    case PyUnicode_4BYTE_KIND:
        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
        break;
    }
    if (!has_errors) {
        /* Ensure the unicode size calculation was correct */
        assert(i == unicode_size);
        assert(s == e);
        if (consumed)
            *consumed = s-starts;
        return unicode;
    }
    /* Fall through to the generic decoding loop for the rest of
       the string */
    if (refit_partial_string(&unicode, kind, data, i) < 0)
        goto onError;
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
@ -4591,11 +4504,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
        continue;
      utf8Error:
        if (!has_errors) {
            if (refit_partial_string(&unicode, kind, data, i) < 0)
                goto onError;
            has_errors = 1;
        }
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
@ -4604,22 +4512,18 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
            goto onError;
        /* Update data because unicode_decode_call_errorhandler might have
           re-created or resized the unicode object. */
        data = PyUnicode_DATA(unicode);
        kind = PyUnicode_KIND(unicode);
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    /* Ensure the unicode_size calculation above was correct: */
    assert(has_errors || i == unicode_size);
    if (consumed)
        *consumed = s-starts;
    /* Adjust length and ready string when it contained errors and
       is of the old resizable kind. */
-    if (has_errors) {
+    if (unicode_resize(&unicode, i) < 0)
-        if (PyUnicode_Resize(&unicode, i) < 0)
+        goto onError;
-            goto onError;
+    unicode_adjust_maxchar(&unicode);
-    }
+    if (unicode == NULL)
        goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
@ -4629,12 +4533,78 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
-    Py_DECREF(unicode);
+    Py_XDECREF(unicode);
    return NULL;
 }
 #undef WRITE_MAYBE_FAIL
 /* Decode size bytes of UTF-8 starting at s into a new unicode object.

    errors   - error handler name; only used on the slow path, where it is
               forwarded to decode_utf8_errors().
    consumed - if non-NULL, receives the number of input bytes consumed.

    Paths, fastest first:
      1. size == 0: return an empty string.
      2. Input is pure ASCII: copy the bytes with unicode_fromascii().
      3. Otherwise run the kind-specific *_utf8_try_decode() fast decoder;
         if it reports an error, pass the partially filled string, the
         current input position and the current output index to
         decode_utf8_errors(), which finishes the job.

    Returns NULL if the result string cannot be allocated. */
 PyObject *
 PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
 {
    Py_UCS4 maxchar = 0;
    Py_ssize_t unicode_size;
    int has_errors = 0;
    PyObject *unicode;
    int kind;
    void *data;
    const char *starts = s;
    const char *e;
    Py_ssize_t i;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)PyUnicode_New(0, 0);
    }
    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
    /* When the string is ASCII only, just use memcpy and return.
       unicode_size may be != size if there is an incomplete UTF-8
       sequence at the end of the ASCII block.  */
    if (maxchar < 128 && size == unicode_size) {
        if (consumed)
            *consumed = size;
        /* unicode_fromascii() takes unsigned bytes; "const char *" does not
           convert implicitly to "const unsigned char *", so cast here. */
        return unicode_fromascii((const unsigned char *)s, size);
    }
    unicode = PyUnicode_New(unicode_size, maxchar);
    if (!unicode)
        return NULL;
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* Unpack UTF-8 encoded data */
    i = 0;
    e = starts + size;
    /* Each fast decoder advances s and i to where it stopped, so the slow
       path below can resume from that point. */
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
        break;
    case PyUnicode_2BYTE_KIND:
        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
        break;
    case PyUnicode_4BYTE_KIND:
        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
        break;
    }
    if (!has_errors) {
        /* Ensure the unicode size calculation was correct */
        assert(i == unicode_size);
        assert(s == e);
        if (consumed)
            *consumed = size;
        return unicode;
    }
    /* In case of errors, maxchar and size computation might be incorrect;
       decode_utf8_errors() refits and resizes the string as necessary. */
    return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
 }
 #ifdef __APPLE__
 /* Simplified UTF-8 decoder using surrogateescape error handler,