Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy Storchaka.

2025-11-03 11:23:31 +00:00 · 2012-05-10 16:36:02 +02:00 · 2012-05-10 16:36:02 +02:00 · ca5f91b888
commit ca5f91b888
parent fda08b0860
8 changed files with 336 additions and 572 deletions
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
 Core and Builtins
 -----------------
 - Issue #14738: Speed-up UTF-8 decoding on non-ASCII data.  Patch by Serhiy
  Storchaka.
 - Issue #14700: Fix two broken and undefined-behaviour-inducing overflow checks
  in old-style string formatting.
--- a/Objects/stringlib/asciilib.h
+++ b/Objects/stringlib/asciilib.h
@ -7,6 +7,7 @@
 #define STRINGLIB(F)             asciilib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    1
 #define STRINGLIB_MAX_CHAR       0x7Fu
 #define STRINGLIB_CHAR           Py_UCS1
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@ -15,19 +15,18 @@
 # error C 'long' size should be either 4 or 8!
 #endif
-Py_LOCAL_INLINE(int)
+Py_LOCAL_INLINE(Py_UCS4)
-STRINGLIB(utf8_try_decode)(const char *start, const char *end,
+STRINGLIB(utf8_decode)(const char **inptr, const char *end,
-                           STRINGLIB_CHAR *dest,
+                       STRINGLIB_CHAR *dest,
-                           const char **src_pos, Py_ssize_t *dest_index)
+                       Py_ssize_t *outpos)
 {
-    int ret;
+    Py_UCS4 ch;
-    Py_ssize_t n;
+    const char *s = *inptr;
    const char *s = start;
    const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
-    STRINGLIB_CHAR *p = dest;
+    STRINGLIB_CHAR *p = dest + *outpos;
    while (s < end) {
-        Py_UCS4 ch = (unsigned char)*s;
+        ch = (unsigned char)*s;
        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
@ -48,15 +47,33 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
                    unsigned long value = *(unsigned long *) _s;
                    if (value & ASCII_CHAR_MASK)
                        break;
-                    _p[0] = _s[0];
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-                    _p[1] = _s[1];
+                    _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
-                    _p[2] = _s[2];
+                    _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
-                    _p[3] = _s[3];
+                    _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
-#if (SIZEOF_LONG == 8)
+                    _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
-                    _p[4] = _s[4];
+# if SIZEOF_LONG == 8
-                    _p[5] = _s[5];
+                    _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
-                    _p[6] = _s[6];
+                    _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
-                    _p[7] = _s[7];
+                    _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
                    _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
 # endif
 #else
 # if SIZEOF_LONG == 8
                    _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
                    _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
                    _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
 # else
                    _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
 # endif
 #endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
@ -67,87 +84,135 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
                    break;
                ch = (unsigned char)*s;
            }
            if (ch < 0x80) {
                s++;
                *p++ = ch;
                continue;
            }
        }
-        if (ch < 0x80) {
+        if (ch < 0xC2) {
-            s++;
+            /* invalid sequence
               \x80-\xBF -- continuation byte
               \xC0-\xC1 -- fake 0000-007F */
            goto InvalidStart;
        }
        if (ch < 0xE0) {
            /* \xC2\x80-\xDF\xBF -- 0080-07FF */
            Py_UCS4 ch2;
            if (end - s < 2) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                break;
            }
            ch2 = (unsigned char)s[1];
            if ((ch2 & 0xC0) != 0x80)
                /* invalid continuation byte */
                goto InvalidContinuation;
            ch = (ch << 6) + ch2 -
                 ((0xC0 << 6) + 0x80);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            s += 2;
            if (STRINGLIB_MAX_CHAR <= 0x007F ||
                (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
                goto Overflow;
            *p++ = ch;
            continue;
        }
-        n = utf8_code_length[ch];
+        if (ch < 0xF0) {
-
+            /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
-        if (s + n > end) {
+            Py_UCS4 ch2, ch3;
-            /* unexpected end of data: the caller will decide whether
+            if (end - s < 3) {
-               it's an error or not */
+                /* unexpected end of data: the caller will decide whether
-            goto _error;
+                   it's an error or not */
-        }
+                break;
        switch (n) {
        case 0:
            /* invalid start byte */
            goto _error;
        case 1:
            /* internal error */
            goto _error;
        case 2:
            if ((s[1] & 0xc0) != 0x80)
                /* invalid continuation byte */
                goto _error;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            s += 2;
            *p++ = ch;
            break;
        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                /* invalid continuation byte */
                goto _error;
            }
-            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            ch2 = (unsigned char)s[1];
            ch3 = (unsigned char)s[2];
            if ((ch2 & 0xC0) != 0x80 ||
                (ch3 & 0xC0) != 0x80) {
                /* invalid continuation byte */
                goto InvalidContinuation;
            }
            if (ch == 0xE0) {
                if (ch2 < 0xA0)
                    /* invalid sequence
                       \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-07FF */
                    goto InvalidContinuation;
            }
            else if (ch == 0xED && ch2 > 0x9F) {
                /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
                   will result in surrogates in range D800-DFFF. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                goto InvalidContinuation;
            }
            ch = (ch << 12) + (ch2 << 6) + ch3 -
                 ((0xE0 << 12) + (0x80 << 6) + 0x80);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            s += 3;
            if (STRINGLIB_MAX_CHAR <= 0x07FF ||
                (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
                goto Overflow;
            *p++ = ch;
-            break;
+            continue;
        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                /* invalid continuation byte */
                goto _error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
            s += 4;
            *p++ = ch;
            break;
        }
        if (ch < 0xF5) {
            /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
            Py_UCS4 ch2, ch3, ch4;
            if (end - s < 4) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                break;
            }
            ch2 = (unsigned char)s[1];
            ch3 = (unsigned char)s[2];
            ch4 = (unsigned char)s[3];
            if ((ch2 & 0xC0) != 0x80 ||
                (ch3 & 0xC0) != 0x80 ||
                (ch4 & 0xC0) != 0x80) {
                /* invalid continuation byte */
                goto InvalidContinuation;
            }
            if (ch == 0xF0) {
                if (ch2 < 0x90)
                    /* invalid sequence
                       \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
                    goto InvalidContinuation;
            }
            else if (ch == 0xF4 && ch2 > 0x8F) {
                /* invalid sequence
                   \xF4\x90\x80\x80- -- 110000- overflow */
                goto InvalidContinuation;
            }
            ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
                 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
            assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
            s += 4;
            if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
                (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
                goto Overflow;
            *p++ = ch;
            continue;
        }
        goto InvalidStart;
    }
-    ret = 0;
+    ch = 0;
-    goto _ok;
+Overflow:
-_error:
+Return:
-    ret = -1;
+    *inptr = s;
-_ok:
+    *outpos = p - dest;
-    *src_pos = s;
+    return ch;
-    *dest_index = p - dest;
+InvalidStart:
-    return ret;
+    ch = 1;
    goto Return;
 InvalidContinuation:
    ch = 2;
    goto Return;
 }
 #undef LONG_PTR_MASK
--- a/Objects/stringlib/ucs1lib.h
+++ b/Objects/stringlib/ucs1lib.h
@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs1lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    1
 #define STRINGLIB_MAX_CHAR       0xFFu
 #define STRINGLIB_CHAR           Py_UCS1
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
--- a/Objects/stringlib/ucs2lib.h
+++ b/Objects/stringlib/ucs2lib.h
@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs2lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    2
 #define STRINGLIB_MAX_CHAR       0xFFFFu
 #define STRINGLIB_CHAR           Py_UCS2
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
--- a/Objects/stringlib/ucs4lib.h
+++ b/Objects/stringlib/ucs4lib.h
@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs4lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    4
 #define STRINGLIB_MAX_CHAR       0x10FFFFu
 #define STRINGLIB_CHAR           Py_UCS4
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
--- a/Objects/stringlib/undef.h
+++ b/Objects/stringlib/undef.h
@ -1,6 +1,7 @@
 #undef  FASTSEARCH
 #undef  STRINGLIB
 #undef  STRINGLIB_SIZEOF_CHAR
 #undef  STRINGLIB_MAX_CHAR
 #undef  STRINGLIB_CHAR
 #undef  STRINGLIB_STR
 #undef  STRINGLIB_LEN
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -4615,28 +4615,6 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s,
 /* --- UTF-8 Codec -------------------------------------------------------- */
 static
 char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.
       The table is indexed by the first byte of a sequence: 00-7F are
       single-byte (ASCII) characters, 80-BF are continuation bytes and
       so cannot start a sequence, C0-C1 could only start an overlong
       two-byte form, and F5-FF would start a sequence above 10FFFF. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 (illegal) + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF (illegal) */
 };
 PyObject *
 PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
@ -4645,6 +4623,10 @@ PyUnicode_DecodeUTF8(const char *s,
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 #include "stringlib/asciilib.h"
 #include "stringlib/codecs.h"
 #include "stringlib/undef.h"
 #include "stringlib/ucs1lib.h"
 #include "stringlib/codecs.h"
 #include "stringlib/undef.h"
@ -4670,310 +4652,60 @@ PyUnicode_DecodeUTF8(const char *s,
 # error C 'long' size should be either 4 or 8!
 #endif
-/* Scans a UTF-8 string and returns the maximum character to be expected
+static Py_ssize_t
-   and the size of the decoded unicode string.
+ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
   This function doesn't check for errors, these checks are performed in
   PyUnicode_DecodeUTF8Stateful.
   */
 static Py_UCS4
 utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
 {
-    Py_ssize_t char_count = 0;
+    const char *p = start;
-    const unsigned char *end = p + string_size;
+    const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
-    assert(unicode_size != NULL);
+#if SIZEOF_LONG <= SIZEOF_VOID_P
-
+    assert(!((size_t) dest & LONG_PTR_MASK));
-    /* By having a cascade of independent loops which fallback onto each
+    if (!((size_t) p & LONG_PTR_MASK)) {
-       other, we minimize the amount of work done in the average loop
+        /* Fast path, see in STRINGLIB(utf8_decode) for
-       iteration, and we also maximize the CPU's ability to predict
+           an explanation. */
-       branches correctly (because a given condition will have always the
+        /* Help register allocation */
-       same boolean outcome except perhaps in the last iteration of the
+        register const char *_p = p;
-       corresponding loop).
+        register Py_UCS1 * q = dest;
-       In the general case this brings us rather close to decoding
+        while (_p < aligned_end) {
-       performance pre-PEP 393, despite the two-pass decoding.
+            unsigned long value = *(const unsigned long *) _p;
-
+            if (value & ASCII_CHAR_MASK)
       Note that the pure ASCII loop is not duplicated once a non-ASCII
       character has been encountered. It is actually a pessimization (by
       a significant factor) to use this loop on text with many non-ASCII
       characters, and it is important to avoid bad performance on valid
       utf-8 data (invalid utf-8 being a different can of worms).
    */
    /* ASCII */
    for (; p < end; ++p) {
        /* Only check value if it's not an ASCII char... */
        if (*p < 0x80) {
            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
               an explanation. */
            if (!((size_t) p & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const unsigned char *_p = p;
                while (_p < aligned_end) {
                    unsigned long value = *(unsigned long *) _p;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    _p += SIZEOF_LONG;
                    char_count += SIZEOF_LONG;
                }
                p = _p;
                if (p == end)
                    break;
            }
        }
        if (*p < 0x80)
            ++char_count;
        else
            goto _ucs1loop;
    }
    *unicode_size = char_count;
    return 127;
 _ucs1loop:
    for (; p < end; ++p) {
        if (*p < 0xc4)
            char_count += ((*p & 0xc0) != 0x80);
        else
            goto _ucs2loop;
    }
    *unicode_size = char_count;
    return 255;
 _ucs2loop:
    for (; p < end; ++p) {
        if (*p < 0xf0)
            char_count += ((*p & 0xc0) != 0x80);
        else
            goto _ucs4loop;
    }
    *unicode_size = char_count;
    return 65535;
 _ucs4loop:
    for (; p < end; ++p) {
        char_count += ((*p & 0xc0) != 0x80);
    }
    *unicode_size = char_count;
    return 65537;
 }
 /* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, onError
   (the expansion below only names `unicode` and the `onError` label).
   `index` is evaluated once into a local.  When it is greater than the
   current length of `unicode`, the string is first resized to
   index + index/8; `value` is then stored through unicode_putchar().
   A failure of either step jumps to onError.
   Potential resizing overallocates, so the result needs to shrink at the end.
 */
 #define WRITE_MAYBE_FAIL(index, value)                              \
    do {                                                            \
        Py_ssize_t pos = index;                                     \
        if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
            unicode_resize(&unicode, pos + pos/8) < 0)              \
            goto onError;                                           \
        if (unicode_putchar(&unicode, &pos, value) < 0)             \
            goto onError;                                           \
    } while (0)
 static PyObject *
 decode_utf8_errors(const char *starts,
                   Py_ssize_t size,
                   const char *errors,
                   Py_ssize_t *consumed,
                   const char *s,
                   PyObject *unicode,
                   Py_ssize_t i)
 {
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e = starts + size;
    const char *aligned_end;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;
        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_ssize_t _i = i;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long value = *(unsigned long *) _s;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    WRITE_MAYBE_FAIL(_i+0, _s[0]);
                    WRITE_MAYBE_FAIL(_i+1, _s[1]);
                    WRITE_MAYBE_FAIL(_i+2, _s[2]);
                    WRITE_MAYBE_FAIL(_i+3, _s[3]);
 #if (SIZEOF_LONG == 8)
                    WRITE_MAYBE_FAIL(_i+4, _s[4]);
                    WRITE_MAYBE_FAIL(_i+5, _s[5]);
                    WRITE_MAYBE_FAIL(_i+6, _s[6]);
                    WRITE_MAYBE_FAIL(_i+7, _s[7]);
 #endif
                    _s += SIZEOF_LONG;
                    _i += SIZEOF_LONG;
                }
                s = _s;
                i = _i;
                if (s == e)
                    break;
                ch = (unsigned char)*s;
            }
        }
        if (ch < 0x80) {
            WRITE_MAYBE_FAIL(i++, ch);
            s++;
            continue;
        }
        n = utf8_code_length[ch];
        if (s + n > e) {
            if (consumed)
                break;
-            else {
+            *((unsigned long *)q) = value;
-                errmsg = "unexpected end of data";
+            _p += SIZEOF_LONG;
-                startinpos = s-starts;
+            q += SIZEOF_LONG;
                endinpos = startinpos+1;
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }
-
+        p = _p;
-        switch (n) {
+        while (p < end) {
-
+            if ((unsigned char)*p & 0x80)
-        case 0:
+                break;
-            errmsg = "invalid start byte";
+            *q++ = *p++;
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;
        case 1:
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;
        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            WRITE_MAYBE_FAIL(i++, ch);
            break;
        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            WRITE_MAYBE_FAIL(i++, ch);
            break;
        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
            WRITE_MAYBE_FAIL(i++, ch);
            break;
        }
-        s += n;
+        return p - start;
        continue;
      utf8Error:
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &i))
            goto onError;
        /* Update data because unicode_decode_call_errorhandler might have
           re-created or resized the unicode object. */
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
-    if (consumed)
+#endif
-        *consumed = s-starts;
+    while (p < end) {
-
+        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
-    /* Adjust length and ready string when it contained errors and
+           for an explanation. */
-       is of the old resizable kind. */
+        if (!((size_t) p & LONG_PTR_MASK)) {
-    if (unicode_resize(&unicode, i) < 0)
+            /* Help register allocation */
-        goto onError;
+            register const char *_p = p;
-    unicode_adjust_maxchar(&unicode);
+            while (_p < aligned_end) {
-    if (unicode == NULL)
+                unsigned long value = *(unsigned long *) _p;
-        goto onError;
+                if (value & ASCII_CHAR_MASK)
-
+                    break;
-    Py_XDECREF(errorHandler);
+                _p += SIZEOF_LONG;
-    Py_XDECREF(exc);
+            }
-    assert(_PyUnicode_CheckConsistency(unicode, 1));
+            p = _p;
-    return unicode;
+            if (_p == end)
-
+                break;
-  onError:
+        }
-    Py_XDECREF(errorHandler);
+        if ((unsigned char)*p & 0x80)
-    Py_XDECREF(exc);
+            break;
-    Py_XDECREF(unicode);
+        ++p;
-    return NULL;
+    }
    memcpy(dest, start, p - start);
    return p - start;
 }
 #undef WRITE_MAYBE_FAIL
 PyObject *
 PyUnicode_DecodeUTF8Stateful(const char *s,
@ -4981,15 +4713,16 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
                             const char *errors,
                             Py_ssize_t *consumed)
 {
    Py_UCS4 maxchar = 0;
    Py_ssize_t unicode_size;
    int has_errors = 0;
    PyObject *unicode;
    int kind;
    void *data;
    const char *starts = s;
-    const char *e;
+    const char *end = s + size;
-    Py_ssize_t i;
+    Py_ssize_t outpos;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    if (size == 0) {
        if (consumed)
@ -4998,49 +4731,91 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
        return unicode_empty;
    }
-    maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
+    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
-
+    if (size == 1 && (unsigned char)s[0] < 128) {
    /* When the string is ASCII only, just use memcpy and return.
       unicode_size may be != size if there is an incomplete UTF-8
       sequence at the end of the ASCII block.  */
    if (maxchar < 128 && size == unicode_size) {
        if (consumed)
-            *consumed = size;
+            *consumed = 1;
-        return unicode_fromascii((const unsigned char *)s, size);
+        return get_latin1_char((unsigned char)s[0]);
    }
-    unicode = PyUnicode_New(unicode_size, maxchar);
+    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
-    /* Unpack UTF-8 encoded data */
+    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
-    i = 0;
+    s += outpos;
-    e = starts + size;
+    while (s < end) {
-    switch (kind) {
+        Py_UCS4 ch;
-    case PyUnicode_1BYTE_KIND:
+        int kind = PyUnicode_KIND(unicode);
-        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
+        if (kind == PyUnicode_1BYTE_KIND) {
-        break;
+            if (PyUnicode_IS_ASCII(unicode))
-    case PyUnicode_2BYTE_KIND:
+                ch = asciilib_utf8_decode(&s, end,
-        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
+                        PyUnicode_1BYTE_DATA(unicode), &outpos);
-        break;
+            else
-    case PyUnicode_4BYTE_KIND:
+                ch = ucs1lib_utf8_decode(&s, end,
-        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
+                        PyUnicode_1BYTE_DATA(unicode), &outpos);
-        break;
+        } else if (kind == PyUnicode_2BYTE_KIND) {
-    }
+            ch = ucs2lib_utf8_decode(&s, end,
-    if (!has_errors) {
+                    PyUnicode_2BYTE_DATA(unicode), &outpos);
-        /* Ensure the unicode size calculation was correct */
+        } else {
-        assert(i == unicode_size);
+            assert(kind == PyUnicode_4BYTE_KIND);
-        assert(s == e);
+            ch = ucs4lib_utf8_decode(&s, end,
-        if (consumed)
+                    PyUnicode_4BYTE_DATA(unicode), &outpos);
-            *consumed = size;
+        }
-        return unicode;
+
        switch (ch) {
        case 0:
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
                endinpos++;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
                endinpos++;
            break;
        default:
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            continue;
        }
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }
-    /* In case of errors, maxchar and size computation might be incorrect;
+End:
-       code below refits and resizes as necessary. */
+    if (unicode_resize(&unicode, outpos) < 0)
-    return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
+        goto onError;
    if (consumed)
        *consumed = s - starts;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
 onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(unicode);
    return NULL;
 }
 #ifdef __APPLE__
@ -5051,9 +4826,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
 wchar_t*
 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
 {
    int n;
    const char *e;
-    wchar_t *unicode, *p;
+    wchar_t *unicode;
    Py_ssize_t outpos;
    /* Note: size will always be longer than the resulting Unicode
       character count */
@ -5066,86 +4841,33 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
        return NULL;
    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    outpos = 0;
    while (s < e) {
-        Py_UCS4 ch = (unsigned char)*s;
+        Py_UCS4 ch;
        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }
        n = utf8_code_length[ch];
        if (s + n > e) {
            goto surrogateescape;
        }
        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;
        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;
        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;
        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
 #if SIZEOF_WCHAR_T == 4
-            *p++ = (wchar_t)ch;
+        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
 #else
-            /*  compute and append the two surrogates: */
+        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
-            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
+#endif
-            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
+        if (ch > 0xFF) {
 #if SIZEOF_WCHAR_T == 4
            assert(0);
 #else
            assert(Py_UNICODE_IS_SURROGATE(ch));
            /*  compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
 #endif
            break;
        }
-        s += n;
+        else {
-        continue;
+            if (!ch && s == e)
-
+                break;
-      surrogateescape:
+            /* surrogateescape */
-        *p++ = 0xDC00 + ch;
+            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
-        s++;
+        }
    }
-    *p = L'\0';
+    unicode[outpos] = L'\0';
    return unicode;
 }
@ -6970,17 +6692,13 @@ PyUnicode_DecodeASCII(const char *s,
                      const char *errors)
 {
    const char *starts = s;
-    PyObject *v;
+    PyObject *unicode;
    int kind;
    void *data;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    int has_error;
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end = p + size;
    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
@ -6993,45 +6711,18 @@ PyUnicode_DecodeASCII(const char *s,
    if (size == 1 && (unsigned char)s[0] < 128)
        return get_latin1_char((unsigned char)s[0]);
-    has_error = 0;
+    unicode = PyUnicode_New(size, 127);
-    while (p < end && !has_error) {
+    if (unicode == NULL)
        /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
           an explanation. */
        if (!((size_t) p & LONG_PTR_MASK)) {
            /* Help register allocation */
            register const unsigned char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK) {
                    has_error = 1;
                    break;
                }
                _p += SIZEOF_LONG;
            }
            if (_p == end)
                break;
            if (has_error)
                break;
            p = _p;
        }
        if (*p & 0x80) {
            has_error = 1;
            break;
        }
        else {
            ++p;
        }
    }
    if (!has_error)
        return unicode_fromascii((const unsigned char *)s, size);
    v = PyUnicode_New(size, 127);
    if (v == NULL)
        goto onError;
-    kind = PyUnicode_KIND(v);
+
    data = PyUnicode_DATA(v);
    outpos = 0;
    e = s + size;
    data = PyUnicode_1BYTE_DATA(unicode);
    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
    if (outpos == size)
        return unicode;
    s += outpos;
    kind = PyUnicode_1BYTE_KIND;
    while (s < e) {
        register unsigned char c = (unsigned char)*s;
        if (c < 128) {
@ -7045,21 +6736,21 @@ PyUnicode_DecodeASCII(const char *s,
                    errors, &errorHandler,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
-                    &v, &outpos))
+                    &unicode, &outpos))
                goto onError;
-            kind = PyUnicode_KIND(v);
+            kind = PyUnicode_KIND(unicode);
-            data = PyUnicode_DATA(v);
+            data = PyUnicode_DATA(unicode);
        }
    }
-    if (unicode_resize(&v, outpos) < 0)
+    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
-    assert(_PyUnicode_CheckConsistency(v, 1));
+    assert(_PyUnicode_CheckConsistency(unicode, 1));
-    return v;
+    return unicode;
  onError:
-    Py_XDECREF(v);
+    Py_XDECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;