Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster.

Patch written by Serhiy Storchaka.
2025-07-24 19:54:21 +00:00 · 2012-10-30 23:12:47 +01:00 · 2012-10-30 23:12:47 +01:00 · e64322e034
commit e64322e034
parent d4156c1693
3 changed files with 73 additions and 74 deletions
--- a/Doc/whatsnew/3.4.rst
+++ b/Doc/whatsnew/3.4.rst
@@ -157,7 +157,7 @@ Optimizations
 Major performance enhancements have been added:
-* None yet.
+* The UTF-32 decoder is now 3x to 4x faster.
 Build and C API Changes
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
 Core and Builtins
 -----------------
 - Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
  written by Serhiy Storchaka.
 - Issue #16197: Update winreg docstrings and documentation to match code.
  Patch by Zachary Ware.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
    Py_ssize_t outpos;
    PyObject *unicode;
    const unsigned char *q, *e;
-    int bo = 0;       /* assume native ordering by default */
+    int le, bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
    /* Offsets from q for retrieving bytes in the right order. */
 #if PY_LITTLE_ENDIAN
    int iorder[] = {0, 1, 2, 3};
 #else
    int iorder[] = {3, 2, 1, 0};
 #endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
@@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
-    if (bo == 0) {
+    if (bo == 0 && size >= 4) {
-        if (size >= 4) {
+        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
                (q[iorder[1]] << 8) | q[iorder[0]];
 #if PY_LITTLE_ENDIAN
        if (bom == 0x0000FEFF) {
                q += 4;
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
                q += 4;
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }
    if (q == e) {
        if (consumed)
            *consumed = size;
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }
 #ifdef WORDS_BIGENDIAN
    le = bo < 0;
 #else
-            if (bom == 0x0000FEFF) {
+    le = bo <= 0;
                q += 4;
                bo = 1;
            }
            else if (bom == 0xFFFE0000) {
                q += 4;
                bo = -1;
            }
 #endif
        }
    }
-    if (bo == -1) {
+    unicode = PyUnicode_New((e - q + 3) / 4, 127);
        /* force LE */
        iorder[0] = 0;
        iorder[1] = 1;
        iorder[2] = 2;
        iorder[3] = 3;
    }
    else if (bo == 1) {
        /* force BE */
        iorder[0] = 3;
        iorder[1] = 2;
        iorder[2] = 1;
        iorder[3] = 0;
    }
    /* This might be one too much, because of a BOM */
    unicode = PyUnicode_New((size+3)/4, 127);
    if (!unicode)
        return NULL;
    if (size == 0)
        return unicode;
    outpos = 0;
-    while (q < e) {
+    outpos = 0;
-        Py_UCS4 ch;
+    while (1) {
-        /* remaining bytes at the end? (size should be divisible by 4) */
+        Py_UCS4 ch = 0;
-        if (e-q<4) {
+        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
-            if (consumed)
+
        if (e - q >= 4) {
            enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
            void *data = PyUnicode_DATA(unicode);
            const unsigned char *last = e - 4;
            if (le) {
                do {
                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    PyUnicode_WRITE(kind, data, outpos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    PyUnicode_WRITE(kind, data, outpos++, ch);
                    q += 4;
                } while (q <= last);
            }
        }
        if (ch <= maxch) {
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            goto utf32Error;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
            (q[iorder[1]] << 8) | q[iorder[0]];
        if (ch >= 0x110000)
        {
            errmsg = "codepoint not in range(0x110000)";
            startinpos = ((const char *)q)-starts;
            endinpos = startinpos+4;
            goto utf32Error;
        }
        else {
            if (ch < 0x110000) {
                if (unicode_putchar(&unicode, &outpos, ch) < 0)
                    goto onError;
                q += 4;
                continue;
-      utf32Error:
+            }
            errmsg = "codepoint not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf32", errmsg,
@@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
            goto onError;
    }
    if (byteorder)
        *byteorder = bo;
    if (consumed)
        *consumed = (const char *)q-starts;